Diffstat (limited to 'vendor/regex-automata')
-rw-r--r--  vendor/regex-automata/.cargo-checksum.json | 2
-rw-r--r--  vendor/regex-automata/Cargo.toml | 82
-rw-r--r--  vendor/regex-automata/PLANS.md | 165
-rw-r--r--  vendor/regex-automata/README.md | 13
-rw-r--r--  vendor/regex-automata/TODO | 9
-rw-r--r--  vendor/regex-automata/data/fowler-tests/LICENSE | 19
-rw-r--r--  vendor/regex-automata/data/fowler-tests/README | 17
-rw-r--r--  vendor/regex-automata/data/fowler-tests/repetition.dat | 163
-rw-r--r--  vendor/regex-automata/data/tests/crazy.toml | 177
-rw-r--r--  vendor/regex-automata/data/tests/flags.toml | 59
-rw-r--r--  vendor/regex-automata/data/tests/fowler/LICENSE | 19
-rw-r--r--  vendor/regex-automata/data/tests/fowler/basic.dat | 221
-rw-r--r--  vendor/regex-automata/data/tests/fowler/basic.toml | 1428
-rwxr-xr-x  vendor/regex-automata/data/tests/fowler/fowler-to-toml | 76
-rw-r--r--  vendor/regex-automata/data/tests/fowler/nullsubexpr.dat | 79
-rw-r--r--  vendor/regex-automata/data/tests/fowler/nullsubexpr.toml | 350
-rw-r--r--  vendor/regex-automata/data/tests/fowler/repetition-long.toml | 294
-rw-r--r--  vendor/regex-automata/data/tests/fowler/repetition.toml | 343
-rw-r--r--  vendor/regex-automata/data/tests/iter.toml | 92
-rw-r--r--  vendor/regex-automata/data/tests/no-unicode.toml | 138
-rw-r--r--  vendor/regex-automata/data/tests/unicode.toml | 489
-rw-r--r--  vendor/regex-automata/src/byteorder.rs | 76
-rw-r--r--  vendor/regex-automata/src/classes.rs | 271
-rw-r--r--  vendor/regex-automata/src/codegen.rs | 104
-rw-r--r--  vendor/regex-automata/src/dense.rs | 2332
-rw-r--r--  vendor/regex-automata/src/determinize.rs | 286
-rw-r--r--  vendor/regex-automata/src/dfa.rs | 363
-rw-r--r--  vendor/regex-automata/src/dfa/accel.rs | 507
-rw-r--r--  vendor/regex-automata/src/dfa/automaton.rs | 1903
-rw-r--r--  vendor/regex-automata/src/dfa/dense.rs | 4470
-rw-r--r--  vendor/regex-automata/src/dfa/determinize.rs | 547
-rw-r--r--  vendor/regex-automata/src/dfa/error.rs | 162
-rw-r--r--  vendor/regex-automata/src/dfa/minimize.rs (renamed from vendor/regex-automata/src/minimize.rs) | 260
-rw-r--r--  vendor/regex-automata/src/dfa/mod.rs | 363
-rw-r--r--  vendor/regex-automata/src/dfa/regex.rs | 2146
-rw-r--r--  vendor/regex-automata/src/dfa/search.rs | 493
-rw-r--r--  vendor/regex-automata/src/dfa/search_unsafe.rs | 321
-rw-r--r--  vendor/regex-automata/src/dfa/sparse.rs | 2283
-rw-r--r--  vendor/regex-automata/src/dfa/special.rs | 477
-rw-r--r--  vendor/regex-automata/src/dfa/transducer.rs | 207
-rw-r--r--  vendor/regex-automata/src/error.rs | 150
-rw-r--r--  vendor/regex-automata/src/hybrid/dfa.rs | 3817
-rw-r--r--  vendor/regex-automata/src/hybrid/error.rs | 130
-rw-r--r--  vendor/regex-automata/src/hybrid/id.rs | 415
-rw-r--r--  vendor/regex-automata/src/hybrid/mod.rs | 179
-rw-r--r--  vendor/regex-automata/src/hybrid/regex.rs | 2124
-rw-r--r--  vendor/regex-automata/src/hybrid/search.rs | 663
-rw-r--r--  vendor/regex-automata/src/lib.rs | 393
-rw-r--r--  vendor/regex-automata/src/macros.rs | 30
-rw-r--r--  vendor/regex-automata/src/nfa/compiler.rs | 1193
-rw-r--r--  vendor/regex-automata/src/nfa/mod.rs | 253
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/compiler.rs | 1713
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/error.rs | 145
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/map.rs (renamed from vendor/regex-automata/src/nfa/map.rs) | 24
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/mod.rs | 1555
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/pikevm.rs | 554
-rw-r--r--  vendor/regex-automata/src/nfa/thompson/range_trie.rs (renamed from vendor/regex-automata/src/nfa/range_trie.rs) | 49
-rw-r--r--  vendor/regex-automata/src/regex.rs | 771
-rw-r--r--  vendor/regex-automata/src/sparse.rs | 1256
-rw-r--r--  vendor/regex-automata/src/sparse_set.rs | 60
-rw-r--r--  vendor/regex-automata/src/state_id.rs | 291
-rw-r--r--  vendor/regex-automata/src/transducer.rs | 107
-rw-r--r--  vendor/regex-automata/src/util/alphabet.rs | 790
-rw-r--r--  vendor/regex-automata/src/util/bytes.rs | 950
-rw-r--r--  vendor/regex-automata/src/util/determinize/mod.rs | 493
-rw-r--r--  vendor/regex-automata/src/util/determinize/state.rs | 873
-rw-r--r--  vendor/regex-automata/src/util/id.rs | 608
-rw-r--r--  vendor/regex-automata/src/util/lazy.rs | 31
-rw-r--r--  vendor/regex-automata/src/util/matchtypes.rs | 356
-rw-r--r--  vendor/regex-automata/src/util/mod.rs | 275
-rw-r--r--  vendor/regex-automata/src/util/prefilter.rs | 281
-rw-r--r--  vendor/regex-automata/src/util/sparse_set.rs | 229
-rw-r--r--  vendor/regex-automata/src/util/start.rs | 109
-rw-r--r--  vendor/regex-automata/src/util/syntax.rs | 272
-rw-r--r--  vendor/regex-automata/tests/collection.rs | 461
-rw-r--r--  vendor/regex-automata/tests/data/bytes.toml | 235
-rw-r--r--  vendor/regex-automata/tests/data/crazy.toml | 302
-rw-r--r--  vendor/regex-automata/tests/data/earliest.toml | 48
-rw-r--r--  vendor/regex-automata/tests/data/empty.toml | 113
-rw-r--r--  vendor/regex-automata/tests/data/expensive.toml | 12
-rw-r--r--  vendor/regex-automata/tests/data/flags.toml | 67
-rw-r--r--  vendor/regex-automata/tests/data/fowler/basic.toml | 1638
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/README (renamed from vendor/regex-automata/data/tests/fowler/README) | 7
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/basic.dat (renamed from vendor/regex-automata/data/fowler-tests/basic.dat) | 0
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/nullsubexpr.dat (renamed from vendor/regex-automata/data/fowler-tests/nullsubexpr.dat) | 0
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/repetition-expensive.dat (renamed from vendor/regex-automata/data/tests/fowler/repetition-long.dat) | 0
-rw-r--r--  vendor/regex-automata/tests/data/fowler/dat/repetition.dat (renamed from vendor/regex-automata/data/tests/fowler/repetition.dat) | 0
-rw-r--r--  vendor/regex-automata/tests/data/fowler/nullsubexpr.toml | 405
-rw-r--r--  vendor/regex-automata/tests/data/fowler/repetition-expensive.toml | 341
-rw-r--r--  vendor/regex-automata/tests/data/fowler/repetition-long.toml | 341
-rw-r--r--  vendor/regex-automata/tests/data/fowler/repetition.toml | 397
-rw-r--r--  vendor/regex-automata/tests/data/iter.toml | 119
-rw-r--r--  vendor/regex-automata/tests/data/misc.toml | 99
-rw-r--r--  vendor/regex-automata/tests/data/multiline.toml | 275
-rw-r--r--  vendor/regex-automata/tests/data/no-unicode.toml | 158
-rw-r--r--  vendor/regex-automata/tests/data/overlapping.toml | 126
-rw-r--r--  vendor/regex-automata/tests/data/regression.toml | 423
-rw-r--r--  vendor/regex-automata/tests/data/set.toml | 523
-rw-r--r--  vendor/regex-automata/tests/data/unicode.toml | 514
-rw-r--r--  vendor/regex-automata/tests/data/word-boundary.toml | 771
-rw-r--r--  vendor/regex-automata/tests/dfa/api.rs | 133
-rw-r--r--  vendor/regex-automata/tests/dfa/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/dfa/suite.rs | 280
-rw-r--r--  vendor/regex-automata/tests/hybrid/api.rs | 195
-rw-r--r--  vendor/regex-automata/tests/hybrid/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/hybrid/suite.rs | 212
-rw-r--r--  vendor/regex-automata/tests/nfa/mod.rs | 1
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/mod.rs | 1
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/pikevm/api.rs | 191
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs | 2
-rw-r--r--  vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs | 109
-rw-r--r--  vendor/regex-automata/tests/regression.rs | 10
-rw-r--r--  vendor/regex-automata/tests/suite.rs | 250
-rw-r--r--  vendor/regex-automata/tests/tests.rs | 65
-rw-r--r--  vendor/regex-automata/tests/unescape.rs | 84
-rw-r--r--  vendor/regex-automata/tests/util.rs | 57
116 files changed, 39092 insertions, 12822 deletions
diff --git a/vendor/regex-automata/.cargo-checksum.json b/vendor/regex-automata/.cargo-checksum.json
index a8c689c8b..63e5b1a67 100644
--- a/vendor/regex-automata/.cargo-checksum.json
+++ b/vendor/regex-automata/.cargo-checksum.json
@@ -1 +1 @@
-{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"674fda607d585e7a9d1d07e6fee2807e6a1a3709ca8d5a507dac051cac84dcf1","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"34ebd8d165fbd934198653a6d619d62788ff72f0e058139459d4369683423551","TODO":"daea9f7378f543311d657e6ef3d2a09d51e82b9e70d0026140130862c32b3c08","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","data/fowler-tests/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","data/fowler-tests/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","data/fowler-tests/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","data/fowler-tests/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","data/fowler-tests/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","data/tests/crazy.toml":"b6e644a74b990a4344b15e7366da36e5b3f73a183944e249082f74c23ff01e5f","data/tests/flags.toml":"aefd9483c1c9c52c3669a9f2e88cd494c293f2e14c59aecb1d94dbb82546a705","data/tests/fowler/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","data/tests/fowler/README":"e9f049297023d5a81c5c600280016fe0271e7d0eda898c41399eb61431820404","data/tests/fowler/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","data/tests/fowler/basic.toml":"7b043231ca8c89dbd10cef0de3b0be18c9ae442be1e99a657cd412b8b7edec21","data/tests/fowler/fowler-to-toml":"5bb78b924f3b6b1c27278b37baae556115fe03c864c1d33a7c53718b99885515","data/tests/fowler/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","data/tests/fowler/nullsubexpr.toml":"7e4bf9fec1c4a8aca04cc96e74b3f51ed6b8c3f85e4bfc7acc9c74ab95166976","data/tests/fowler/repetition-long.dat":"040b869c8c3a02ed0497943af1860094fd8437892811180fb9624a41eb491f23","data/tests/fowler/repetition-long.toml":"3eb7199d936b3f7eb9863ebc3b0c94648cfc32192f626dcfa33ddf352918c1c0","data/tests/fowler/repetition.dat":"d8fa959c2be524c90f2092f7c41ff3a7b48232b4378cb3b090625e74e76fc625","data/tests/fowler/repetition.toml":"ccf21430a325c4e1dae4eb6c52e3cea5d3c1847559ba6e75466bdb6bbd98204d","data/tests/iter.toml":"99adc397fe0a00c759eb659531d3e69445b43f5ecd5771c549117933b73bd43e","data/tests/no-unicode.toml":"f329ee939c2d07a17e51f0090d9f2431395e47dac8e0b982fb5e16e0555b75e3","data/tests/unicode.toml":"0ff418de5bc238e4595956b66981fe02018938d57d76d11cab840606b9da60ba","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/byteorder.rs":"0827852aa563e3c5b3ffaf484ce8a34537e82719a3606d4b948bc8a1e21d8b18","src/classes.rs":"706c8a8a9bf70260b9c92ff865891fc26de0453495afca7b325afdf5e6a3e242","src/codegen.rs":"5686b97fec69158c7264183a71ad9a1ff8e74db02fa0fcfccaa0a516cbfc7d1d","src/dense.rs":"7561f35019b20642f2ee75fd20365e21a4c8260deb7cee84fa3f8264b9fd9a4b","src/determinize.rs":"876c844d0470854dbbe3eb4386611fd57d95a5a4ae38ee937fbb14676f0a383a","src/dfa.rs":"032f09d187ec8dd06ef09940515690af045ca9f7ef7f819c31a97607df1432e5","src/error.rs":"d07ecdc617e243a43a99e911398b9c37721afd2b9548153c5f359b8c4605c749","src/lib.rs":"520781bdd60d425b16ef72f03330362e7c2aec274338e73f309d730bea4d7ab0","src/minimize.rs":"dfa7b6a6f36bb2dedaee8bfc5c4bb943f59e0cf98cde5358822e70cbdb284a7e","src/nfa/compiler.rs":"f43901929f44efa420e441cbff8687e05059ceae88492a2ed6c49fdd5a6a6b04","src/nfa/map.rs":"b7e2e561d6fe5775716e27eded1ae3e2277a50073a2e182f3dabedcda5c30d27","sr
c/nfa/mod.rs":"93e7dee804751fcf66d48ca48b3467a4ab5155063461e69c428e46bcf977711d","src/nfa/range_trie.rs":"3a3d2853987619688ab5b61acef575f216d5bdd7b9e15fa508e0ba6f29c641a9","src/regex.rs":"2f3868a3fa52b2a040fd0fb9f12386b1af1f0f650d948e821c7ba83f087826f0","src/sparse.rs":"976540bcd134a225e5d39e1aef688f63b02b3d745249a3a95fec387a7ffb88cc","src/sparse_set.rs":"81bef5057781e26da39855b0f38b02ddfd09183bc62d30cf454ec706885e3a70","src/state_id.rs":"44c4bf1a5d091b97e8c1ce872bafe45d806905b07a73a6f82b1655b7897e7b5f","src/transducer.rs":"28c728ef45a3f6177d5a3ac589f166764c11d6c66bd5d916bcf30ad2be187a0c","tests/collection.rs":"2907cc0a32e5e59ceca4b34fe582f9275c12ee1a8d6e73d689056bdfd5357b9a","tests/regression.rs":"5a9b2654f88b1b07401c5b1fe925f62421bff67be7d80cae7a985eb66ed9886b","tests/suite.rs":"8148247667b34b370855c247ffcc9c6339f8f72d6fe481b79936afbb165dd6bd","tests/tests.rs":"f1b407d3d288a9c2b1500151205f9d0bcc0668b2ab38c5094ee459d6d4893e18","tests/unescape.rs":"67a7c466ba5c873a3c29f7e00649535ddc2921fcc14ac92cb207f43b4b6e461d"},"package":"6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"} \ No newline at end of file
+{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"0122506f03800693bb58426493e7faa1ec90c002e542fcbfaf5dbd086e56f2be","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","PLANS.md":"405c695de578604ab6a425709709ba8cb69db0b7fed103f44aad2e2069bef7ac","README.md":"de887d97b46825f6fde7c9b1066619eb9a729178b93492d900bc7c183337dd81","TODO":"296f208a1c13fa55c449452e5e0df7aeee7431c0bc81497a3f0c7d2b01483ddb","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/dfa/accel.rs":"cac45cfa62a3521684aee5583aa519425cc0de57d01a23f511433ad46ca426dc","src/dfa/automaton.rs":"9bd295a5a6e7ee99046703e1f8dc02c621e6ddac85344f7f37bb020b71383118","src/dfa/dense.rs":"4739d4959e415a9191d0c0dd0a07d2cc95ce6243831a806f7103bcfd509e9e2c","src/dfa/determinize.rs":"14666440637e91bf2a16a074e666b92cbdbd0b39b4ce21180be9235af47f541e","src/dfa/error.rs":"1f057e5a2f2ae87031676e5cce70c8226514de181dbcce2e491a930d28751b9e","src/dfa/minimize.rs":"a5e85fe9570307a053592eefb3bf6a72d9fdbcfb8311f5a0dd22e39085e87811","src/dfa/mod.rs":"bb02f594cae82e28f2bddea1453c35d8f38ea7f97fb5ee8cc588f628b1fcc667","src/dfa/regex.rs":"18eded661f818be36ef175acd49198140317ecb36d9171c3ebbabdf17c8fcf86","src/dfa/search.rs":"b3526fc40846c71cf687bf4a2f1f0e682b2615d7e3f62707e9e68bc79f2fe9a3","src/dfa/search_unsafe.rs":"047cd4fcdc4129c704e7269c0af2f71d6b8a64b0de01ad7174843c3fb9fbe013","src/dfa/sparse.rs":"c6c7540347e04c2be9b4e0b1b4eed9dc661707bba7386382805e492b704d113d","src/dfa/special.rs":"51d4254e3fcaa711e0739fecd8ee64392426e3bd4a6a74b37096157dc4dbf107","src/dfa/transducer.rs":"ad5766b1e781a8ec1f6113c4eaa53373c9d29260d357de0f71f7cc19a76f1f21","src/hybrid/dfa.rs":"2f6176a317c64716af2ff210c404e712e5a2eac64ad30617c5fda728e1342be9","src/hybrid/error.rs":"99c1e1a7a8d8e88724faaeee5e56383a05b284b74f33d864124d411c52c77361","src/hybrid/id.rs":"051ef2cfeb613fd20a19b42515ce5de8e812313007db6030fd1aaec13cafbabf","src/hybrid/mod.rs":"4f145341030bc6fd1392fbaf916dc9ba5cd1912562e94b758a6458da17abeef8","src/hybrid/regex.rs":"7c0ca05c9801e91af718b50a2f685d0e61fdaad0e88d8c3c23afe71c0a13bb14","src/hybrid/search.rs":"0eb9f26400c9cd949038c8a4c96b96a7879dac994a86a4cf9ed8837f3328e4d5","src/lib.rs":"06641dff57899f19ab7929404c92e21bc48835a65e3e08f366821c7b9ccfe08f","src/macros.rs":"a73da3a0725a7c0afbaf36cd64a185af12db5707fd7740bf08b188c2ae9872db","src/nfa/mod.rs":"3ec8d362fd16e3cb1742930bae77ba128e592c7f574cd186d342b98f39abd06f","src/nfa/thompson/compiler.rs":"9548c025a9fb9d551af9829cf68251084c3b24e1c5db3b365d6463b07ca02164","src/nfa/thompson/error.rs":"7c0c556cdc52635345a0efcfecce3add05cd91770dd8b9353c854d41a9f4b862","src/nfa/thompson/map.rs":"03f88cd3ee01cb46b542918d8eba7fd927a4409c0cf7080f57a19bbc9733020b","src/nfa/thompson/mod.rs":"0b5b274b29ede0a552728579396d74114bfc049c34576fb3bd9358c306ac9dd3","src/nfa/thompson/pikevm.rs":"cf97a464e3c307ffed65530ebf4d17b1d3a9961525e14a49542373b818f47ad1","src/nfa/thompson/range_trie.rs":"8576bc8a4d9fa3f66c88f15b22b3dbbf26534c17c5e621cbbec40801c8141628","src/util/alphabet.rs":"350829d2abf132486086d1f331826177748106c4d8a1c7cff839a82e04f323df","src/util/bytes.rs":"273dbd419f4d561fa1204990abb0c25fa58045b1d9dfeaa8ea40a747e08bfa59","src/util/determinize/mod.rs":"8539e34529589cc56e53dac6f0d29e150da9291e9b72f28f7821c12153dff1e9","src/util/determinize/state.rs":"ccff32679266cd8f4b18b4bf0beba3563167df53ca4f5dc46061fbc1222ca420","src/util/id.rs":"b6b3efabcdfdc0e56a277b903e4
0c684ba1182547b7e33cc4fbc1ad6ea348664","src/util/lazy.rs":"7ead513dd094d6c30c7196271afbb346b2b3601bbe33488fcd5284d9e8426027","src/util/matchtypes.rs":"24b05d62a95c271029170e73f9ff2bd16f264b6298abf01bcd4660ae2a86a6cd","src/util/mod.rs":"0e054937cc1a84f70dffa4ace1f0111d0b9a177154b423915b411185194a3c8f","src/util/prefilter.rs":"3dcc4f4a75c38fc00435b7ea88cfa9bb3481c8e5655e8325b0f0e1f2b8d1c65f","src/util/sparse_set.rs":"04aac2d8ae2299b85494df85ebafaef2891d36d3b856155cffa3b59fcc8993b4","src/util/start.rs":"2f8c28712bb97265139aefa961cef1b40bb0cbaa73cbbd1e6115ba4cc2bfe196","src/util/syntax.rs":"09f93982215c9bea3200ec2efd21b3d7ec53d5200546eb48a56040eda026db9a","tests/data/bytes.toml":"aee9df19c5cdd52ddac44490c6df6226cef33077a979d6b964ffe73aaf724bbf","tests/data/crazy.toml":"759293076a76d7efe8eb87b3207a0587c7e969637cd985ca985aa15f71dc0c57","tests/data/earliest.toml":"6ba10ea322fc8939ca0b849812b364a0d0b7594a3df1efee62fd03b7d048c399","tests/data/empty.toml":"45f314d2f9c624056665ba80ebcb4626b551a0bc4780d9c7ca160dd5caa6abaf","tests/data/expensive.toml":"d046774120b99f9516fa7893a3e51fa76182133c16e20d4582956125623775fb","tests/data/flags.toml":"b415e2c48a2520bb182a2f795e11229e56b6e2bf93f7177d64e30118b858cef8","tests/data/fowler/basic.toml":"226ea90327f02c62ed673fc747493bc2bb0f647629f08f92ce26a130f653a4fd","tests/data/fowler/dat/README":"441bb1ed49be2b02d99d3f65974313d7d274b154e53bfa3da2a3df0538b24f04","tests/data/fowler/dat/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","tests/data/fowler/dat/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","tests/data/fowler/dat/repetition-expensive.dat":"040b869c8c3a02ed0497943af1860094fd8437892811180fb9624a41eb491f23","tests/data/fowler/dat/repetition.dat":"d8fa959c2be524c90f2092f7c41ff3a7b48232b4378cb3b090625e74e76fc625","tests/data/fowler/nullsubexpr.toml":"3e975fc6ca8e14d615ed8483c7a35158d70a8cfcc7676ef15ed68ec5eef63164","tests/data/fowler/repetition-expensive.toml":"9d9203b6c3c380947afb41c717b834eb35746de4f21e124f6f15119a6460b150","tests/data/fowler/repetition-long.toml":"a598f6682e71a8689630edb35d69f43a1305090f77cfc39dff3f60e7284618e2","tests/data/fowler/repetition.toml":"ce1888a6550fce8a7986686684ef3eff762430459d50726bc4918d0e211c3847","tests/data/iter.toml":"d1995a7b65b12aa34b4226c3ca039fcf52dcaf96a6f061064da84e981e6796e0","tests/data/misc.toml":"a32697c95595b0ad28f3c12422caddf79eaba35047f32583f2df1c6b17bc0eaf","tests/data/multiline.toml":"70dabae358d0701132d55b4378d7aa78ae5aa3fabad38ff2a6a91e83b44b78bf","tests/data/no-unicode.toml":"11be343498e0e834b422ead1168204dbaac1fb32a5384e66f0b98cdb63b39057","tests/data/overlapping.toml":"8394b104f24abd62ebed5c4b8b4708db8dba7f973a6fd10f1711d340bf0e5b5c","tests/data/regression.toml":"718d151906584f521b5bb65bae8f03a516da6a0e87312b652b96d63a9a4be64c","tests/data/set.toml":"c2412cf09030ff7ef034e44c2b051e91841f0e2cd990576bb636bd1d1da18827","tests/data/unicode.toml":"af0ee5ba8ec93fbafe4647bbac97287003743db8b7eac3e2d4dfd17f02912328","tests/data/word-boundary.toml":"20cdd14cd0cab146e0fc541dfdf913e764340997db8ab4e2d80f94dd2f9b309d","tests/dfa/api.rs":"9de253770e6bc9b2ca32f1533655740677f245fd61e9188358acb51c6655f98e","tests/dfa/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","tests/dfa/suite.rs":"2d3007c970a05e2ed7babd120d9a5a4e01b034780fc05b9d905e857a8255ab08","tests/hybrid/api.rs":"c954cdcbbc04ef939ae38d32aae3dee1847c6ea2a36ec6e2a4bedb19aaa861e4","tests/hybrid/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","test
s/hybrid/suite.rs":"1fd79a8699eb418a28897269daa3e86f7fc792ffa4fe9318c57aabfd10176f38","tests/nfa/mod.rs":"49055c358e38d97e42acb1602c671f97dddf24cafe089490f0e79ed208d74d9b","tests/nfa/thompson/mod.rs":"ab5f818ad62de599a2ddcedfd1774bf51e3245060ab8e3864bb07f146fe81a5a","tests/nfa/thompson/pikevm/api.rs":"af39a4787bb089060ee6b87e5ab1979c1863731ebbd9d1b0ba1ac6e93f6c0633","tests/nfa/thompson/pikevm/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","tests/nfa/thompson/pikevm/suite.rs":"9d56601bb80a67c935f1f9aa4c4d130e1766e827bc34a62a48fb20297d8af2db","tests/regression.rs":"2d72466e872be88941a59582216823eb95bda461a5b2237b438a1fbfdcf813ac","tests/tests.rs":"7cf459df359f75fad2a44f7929521bcbc6fc78da6576af4306aec5386d35ffe3","tests/util.rs":"97573ea40567a62b54babe14a91b689f1d8ff663e2cb5e77103c7dede443e977"},"package":"e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782"} \ No newline at end of file
diff --git a/vendor/regex-automata/Cargo.toml b/vendor/regex-automata/Cargo.toml
index b4fcd7a0d..153f11fb3 100644
--- a/vendor/regex-automata/Cargo.toml
+++ b/vendor/regex-automata/Cargo.toml
@@ -3,28 +3,41 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
#
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
[package]
+edition = "2018"
name = "regex-automata"
-version = "0.1.10"
+version = "0.2.0"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
-exclude = ["/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*", "/regex-automata-debug"]
+exclude = [
+ "/.github",
+ "/scripts/*",
+ "/regex-cli",
+ "/regex-test",
+]
autoexamples = false
autotests = false
description = "Automata construction and matching using regular expressions."
homepage = "https://github.com/BurntSushi/regex-automata"
documentation = "https://docs.rs/regex-automata"
readme = "README.md"
-keywords = ["regex", "dfa", "automata", "automaton", "nfa"]
+keywords = [
+ "regex",
+ "dfa",
+ "automata",
+ "automaton",
+ "nfa",
+]
categories = ["text-processing"]
license = "Unlicense/MIT"
repository = "https://github.com/BurntSushi/regex-automata"
+resolver = "2"
+
[profile.bench]
debug = true
@@ -43,44 +56,33 @@ debug = true
bench = false
[[test]]
-name = "default"
+name = "integration"
path = "tests/tests.rs"
+
[dependencies.fst]
-version = "0.4.0"
+version = "0.4.5"
optional = true
-[dependencies.regex-syntax]
-version = "0.6.16"
+[dependencies.log]
+version = "0.4.14"
optional = true
-[dev-dependencies.bstr]
-version = "0.2"
-features = ["std"]
-default-features = false
-
-[dev-dependencies.lazy_static]
-version = "1.2.0"
-
-[dev-dependencies.regex]
-version = "1.1"
-[dev-dependencies.serde]
-version = "1.0.82"
-
-[dev-dependencies.serde_bytes]
-version = "0.11"
-
-[dev-dependencies.serde_derive]
-version = "1.0.82"
+[dependencies.memchr]
+version = "2.4.0"
+default-features = false
-[dev-dependencies.toml]
-version = "0.4.10"
+[dependencies.regex-syntax]
+version = "0.6.24"
+optional = true
[features]
-default = ["std"]
-std = ["regex-syntax"]
-transducer = ["std", "fst"]
-[badges.appveyor]
-repository = "BurntSushi/regex-automata"
-
-[badges.travis-ci]
-repository = "BurntSushi/regex-automata"
+alloc = ["syntax"]
+default = [
+ "std",
+ "alloc",
+ "syntax",
+]
+logging = ["log"]
+std = []
+syntax = ["regex-syntax"]
+transducer = ["fst"]
diff --git a/vendor/regex-automata/PLANS.md b/vendor/regex-automata/PLANS.md
new file mode 100644
index 000000000..2fa9392ef
--- /dev/null
+++ b/vendor/regex-automata/PLANS.md
@@ -0,0 +1,165 @@
+pattern_limit should not be defined inside nfa::thompson, but rather at the
+top-level.
+
+-----
+
+Main problem right now is exemplified by the set60 and set70 failing tests. In
+particular, when finding the starting position while matching multiple regexes
+simultaneously, the reverse search is messed up. The reverse search doesn't
+depend on which regex matched in the forward direction, which means it won't
+always find the correct starting location. Unfortunately, the only way to
+fix this, as far as I can tell, is to add a group of start states for every
+regex in the DFA. Then once we do the reverse search, we need to choose the
+correct start state based on which regex matched in the forward direction.
+
+This is a nasty change.
+
+So it looks like this only applies when doing an overlapping search in reverse
+to find the start of a match. That means we should make this configurable
+but enable it by default for the reverse automata. It should be configurable
+so that folks can construct a regex that doesn't have the ability to do
+overlapping searches correctly. If an overlapping search is attempted with
+a reverse automaton that lacks starting states for each pattern, then the
+implementation should panic.
+
+BUT! It is also convenient to provide this option in general for folks that
+want a DFA that can match any pattern while also being able to match a specific
+pattern.
+
+Straw man:
+
+* Update dense::Config to have a `starts_for_each_pattern` option. It should
+ be disabled by default.
+* In `RegexBuilder::build_many_with_size` tweak the reverse DFA configuration
+ to have the aforementioned option enabled.
+* It would be interesting to add new APIs to `Regex` that support matching
+ specific patterns, but I think this is a complication. If we did want to do
+ this, then we should just add it to the `_at` variants and leave the rest of
+ the API untouched.
+* Add a `pattern_id: Option<PatternID>` parameter to each of the five
+ `*_at` methods on the `dfa::Automaton` trait. A value of `None` retains the
+ existing behavior. A `Some` value means that the starting state for that
+ specific pattern must be chosen, which in turn implies an anchored search.
+ (This means `starts_for_each_pattern` has utility for single-pattern DFAs
+ since it makes it possible to build a DFA that can do both unanchored and
+ anchored searches.)
+* Thread this new parameter down into the various functions in `dfa::search`
+ all the way down into `init_fwd` and `init_rev`. These functions will then
+ pass it to `dfa.start_state_{forward,reverse}`.
+* This is where things get gruesome since we now need to completely re-work how
+ start states are represented in dense and sparse DFAs _and_ it needs to be
+ configurable. It looks like the `Start` type from `dfa::automaton` can
+ basically remain unchanged, since it still represents one of the four
+ possible starting states that will need to be applied for every pattern.
+* For `dfa::dense`, change `StartList` to `StartTable`. Currently, its only
+ header is the state ID count, which is always 4. We'll want to change this
+ to the stride and add a new header value that encodes the number of patterns.
+ When the number of patterns is zero, then existing behavior is preserved and
+ represents the case where `starts_for_each_pattern` is disabled (or in the
+ case of an empty DFA). When non-zero, a table of starting state IDs is
+ encoded with each row corresponding to the 4 starting states for each
+ pattern. Before this table (even if it's empty), the 4 starting states for
+ the entire DFA are encoded.
+* For `dfa::sparse`, do the same as above. They are essentially the same right
+ now anyway, with the only difference being that sparse DFAs use `&[u8]`
+ instead of `&[S]` (because sparse DFAs don't have any alignment
+ requirements).
+* Modify `DFA::empty` to accept a `starts_for_each_pattern` bool that, when
+ true, creates a start table with the header, the start states for the entire
+ DFA and a row of start states for each pattern. When false, no rows are
+ added.
+* Expose whether there are starting states for each pattern via a predicate
+ on the DFA.
+* Modify the determinizer's `add_starts` method to basically do what it does,
+ but also do it for each pattern when the DFA is configured for it. It should
+ continue to reuse states as appropriate or not generate new states if they
+ aren't needed. This will want to use the `NFA::start_pattern` method, which
+ provides the starting NFA state ID for the given pattern.
+* Fix the dense->sparse conversion. At this point, this piece should be fairly
+ straight-forward since the sparse representation of starting states is
+ basically identical to the dense representation.
+
+At this point, I think the bug should resolve itself.
+
+^^^^ DONE! IT WORKS!
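
[Editor's note: a minimal sketch of what the finished plan above enables, assuming the 0.2 `dfa::Automaton` API as I understand it (`find_leftmost_fwd_at` taking an optional prefilter scanner, an optional `PatternID`, and a `start`/`end` range). Passing `Some(pid)` selects that pattern's start states, which implies an anchored search for that one pattern.]

```rust
use regex_automata::{
    dfa::{dense, Automaton},
    PatternID,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A start-state row per pattern makes anchored per-pattern searches
    // possible (and lets one DFA serve both unanchored and anchored use).
    let dfa = dense::Builder::new()
        .configure(dense::Config::new().starts_for_each_pattern(true))
        .build_many(&[r"[a-z]+", r"[0-9]+"])?;

    let haystack = b"abc123";
    // `Some(pid)` starts from pattern 1's start state, so the search is
    // anchored at offset 3 and must match [0-9]+.
    let pid = PatternID::new(1)?;
    let m = dfa
        .find_leftmost_fwd_at(None, Some(pid), haystack, 3, haystack.len())?
        .expect("pattern 1 should match at offset 3");
    assert_eq!(1, m.pattern().as_usize());
    assert_eq!(6, m.offset());
    Ok(())
}
```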
+
+-----
+
+
+Add top-level SyntaxConfig (or some such) that has all of the regex-syntax
+options forwarded, but with automata oriented docs. Then use this for all of
+the engines instead of having to repeat every option for every builder.
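
[Editor's note: a sketch of the intended ergonomics, assuming a top-level `SyntaxConfig` accepted by each builder through a `syntax` method, which is roughly the shape this plan describes; the exact method names are an assumption about the 0.2 API.]

```rust
use regex_automata::{
    dfa::{dense, Automaton},
    SyntaxConfig,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One syntax configuration, shared by any engine builder, instead of
    // re-declaring every regex-syntax knob on each builder type.
    let syntax = SyntaxConfig::new().case_insensitive(true).unicode(false);
    let dfa = dense::Builder::new().syntax(syntax).build("abc")?;
    assert!(dfa.find_leftmost_fwd(b"xyzABC")?.is_some());
    Ok(())
}
```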
+
+-----
+
+These produce different results. PCRE2 looks correct. Basically, we should be
+using the context around the `at` position correctly, which we aren't doing
+right now. Seems tricky to get right, particularly when confirming the match
+with a reverse DFA.
+
+Maybe our 'at' functions need to take a full range... Sigh. This is indeed what
+RE2 does. GAH.
+
+fn main() {
+ let re = regex::Regex::new(r"(?-u)\b\sbar").unwrap();
+ let s = "foo bar baz";
+ println!("{:?}", re.find_at(s, 3).map(|m| m.as_str()));
+
+ let re = pcre2::bytes::Regex::new(r"\b\sbar").unwrap();
+ let s = "foo bar baz";
+ println!("{:?}", re.find_at(s.as_bytes(), 3).unwrap());
+}
+
+^^^^ This is fixed now, but we still need to find a way to add test coverage
+for "context" searches. It'd be nice to do this automatically, but we'll
+probably just add a new 'context = [start, end]' option.
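
[Editor's note: a sketch of the fixed, range-taking behavior, assuming the 0.2 signature `find_leftmost_fwd_at(prefilter, pattern_id, haystack, start, end)`. Because the search receives the full haystack plus a range, the byte just before `start` stays visible to the look-behind, matching the PCRE2 output above.]

```rust
use regex_automata::dfa::{dense, Automaton};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dfa = dense::Builder::new().build(r"(?-u)\b\sbar")?;
    let haystack = b"foo bar baz";
    // Searching the range 3..11 of the *full* haystack lets \b look behind
    // at the 'o' just before offset 3, so ' bar' matches (ending at 7).
    let m = dfa
        .find_leftmost_fwd_at(None, None, haystack, 3, haystack.len())?
        .expect("should match ' bar' given the surrounding context");
    assert_eq!(7, m.offset());
    Ok(())
}
```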
+
+-----
+
+
+* Create regex-test crate, based on glob-test. Try to anticipate the needs for
+ the full regex test suite.
+ * See if we can clean up tests.
+ * Provide a way to mark a test as expensive.
+ * Provide a way to test is_match_at and find_at.
+ * Test shortest_match_at too? Huge pain. Add tests for it.
+ * Port ALL tests from the regex crate. Will probably need a way to mark a
+ test as skipped.
+ * Document tests better.
+* Find a way to remove byteorder dependency.
+* Reorganize crate API:
+ * Have errors contain `Box<Error+Send+Sync>` instead of `String`.
+ * Make errors non-exhaustive.
+ * Audit `StateID` trait for safety.
+ * Brainstorm hard about `DFA` trait and the fact that DenseDFA and SparseDFA
+ have inefficient implementations of some methods. Maybe use multiple
+ traits? Answer: get rid of premultiply/classes knobs and just enable
+ them by default. Should remove a huge amount of code.
+ * Check whether `unsafe` is really needed to eliminate bounds checks. Use
+ micro-benchmarks and bigger CLI workloads using `regex-automata-debug`.
+ * Re-write module docs for `dfa` as they are no longer top-level. Keep most.
+ * Retain any pertinent top-level crate docs, but don't rewrite yet.
+ * Clean up builders if we can. e.g., Determinizer, minimizer, it's all a mess
+ right now.
+ * Clean up and add 'always_match' and 'never_match' constructors for every
+ regex engine.
+ * See about supporting ^, $, \A, \z, \b and \B in DFAs. Do the non-Unicode
+ version of \b unfortunately. Carefully scrutinize how the regex crate's
+ lazy DFA does it and try to make it comprehensible. Done! Except for the
+ part about making it comprehensible.
+* Rethink prefilters?
+* Add `regex-automata-generate` CLI tool. This should just be a copy of
+ the `ucd-generate dfa` and `ucd-generate regex` commands.
+
+Then build new public `nfa` sub-module.
+ * For Unicode \b, generate \w DFA (forwards and reverse) and embed it into
+ source for fast checking. That way, we don't need to ever do explicit UTF-8
+ decoding anywhere. Yay.
+
+Then `lazy` sub-module.
+
+Then `onepass`.
+
+Then `jit`.
+
+... and beyond? CRAZY. But it can be done! Build STRONG base layers.
diff --git a/vendor/regex-automata/README.md b/vendor/regex-automata/README.md
index 8eaf03f04..23e0bffe0 100644
--- a/vendor/regex-automata/README.md
+++ b/vendor/regex-automata/README.md
@@ -6,7 +6,7 @@ configuring the best space vs time trade off for your use case and provides
support for cheap deserialization of automata for use in `no_std` environments.
[![Build status](https://github.com/BurntSushi/regex-automata/workflows/ci/badge.svg)](https://github.com/BurntSushi/regex-automata/actions)
-[![on crates.io](https://meritbadge.herokuapp.com/regex-automata)](https://crates.io/crates/regex-automata)
+[![Crates.io](https://img.shields.io/crates/v/regex-automata.svg)](https://crates.io/crates/regex-automata)
![Minimum Supported Rust Version 1.41](https://img.shields.io/badge/rustc-1.41-green)
Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
@@ -26,11 +26,10 @@ Add this to your `Cargo.toml`:
regex-automata = "0.1"
```
-and this to your crate root (if you're using Rust 2015):
-
-```rust
-extern crate regex_automata;
-```
+**WARNING**: The `master` branch currently contains code for the `0.2` release,
+but this README still targets the `0.1` release. Namely, it is recommended to
+stick with the `0.1` release. The `0.2` release was made prematurely in order
+to unblock some folks.
### Example: basic regex searching
@@ -48,7 +47,7 @@ assert_eq!(matches, vec![(0, 10), (11, 21)]);
```
For more examples and information about the various knobs that can be turned,
-please see the [docs](https://docs.rs/regex-automata).
+please see the [docs](https://docs.rs/regex-automata/0.1).
### Support for `no_std`
diff --git a/vendor/regex-automata/TODO b/vendor/regex-automata/TODO
index bc3b7aab9..68f018799 100644
--- a/vendor/regex-automata/TODO
+++ b/vendor/regex-automata/TODO
@@ -1,10 +1,13 @@
-* Remove the `empty` constructors for DFAs and replace them with
- `never_match` and `always_match` constructors.
* Consider refactoring the NFA representation such that it can be instantly
loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this
could negatively impact using the NFA with deserialization costs. Before
doing this, we should write PikeVM and backtracking implementations so that
they can be benchmarked.
-* Add captures and anchors to NFA.
+* Add captures to NFA.
* Once we're happy, re-organize the public API such that NFAs are exported
and usable on their own.
+
+* Investigate why NFA shrinking seems to produce bigger DFAs after
+ determinization, even though it makes determinization substantially
+ faster. This might be because of its use of sparse NFA states, which have
+ a lower constant overhead associated with them.
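
[Editor's note: for the first TODO item above, the target is the cheap-load story that sparse DFAs already have; a hedged sketch of that round trip, assuming the 0.2 sparse-DFA serialization API (`to_bytes_native_endian`, `from_bytes`); the NFA has no such API yet.]

```rust
use regex_automata::dfa::{sparse, Automaton};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Round-trip a sparse DFA through raw bytes: deserialization is cheap
    // because `from_bytes` only validates, it doesn't rebuild anything.
    let dfa = sparse::DFA::new(r"[a-z]+")?;
    let bytes = dfa.to_bytes_native_endian();
    let (loaded, _read) = sparse::DFA::from_bytes(&bytes)?;
    assert_eq!(
        dfa.find_leftmost_fwd(b"123abc")?,
        loaded.find_leftmost_fwd(b"123abc")?,
    );
    Ok(())
}
```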
diff --git a/vendor/regex-automata/data/fowler-tests/LICENSE b/vendor/regex-automata/data/fowler-tests/LICENSE
deleted file mode 100644
index f47dbf4c4..000000000
--- a/vendor/regex-automata/data/fowler-tests/LICENSE
+++ /dev/null
@@ -1,19 +0,0 @@
-The following license covers testregex.c and all associated test data.
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, and/or sell copies of the
-Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following disclaimer:
-
-THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/regex-automata/data/fowler-tests/README b/vendor/regex-automata/data/fowler-tests/README
deleted file mode 100644
index 6efc2dad3..000000000
--- a/vendor/regex-automata/data/fowler-tests/README
+++ /dev/null
@@ -1,17 +0,0 @@
-Test data was taken from the Go distribution, which was in turn taken from the
-testregex test suite:
-
- http://www2.research.att.com/~astopen/testregex/testregex.html
-
-The LICENSE in this directory corresponds to the LICENSE that the data was
-released under.
-
-The tests themselves were modified for RE2/Go. A couple were modified further
-by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
-(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
-have been a bad idea, but I think being consistent with an established Regex
-library is worth something.
-
-Note that these files are read by 'scripts/regex-match-tests.py' and turned
-into Rust tests found in 'regex_macros/tests/matches.rs'.
-
diff --git a/vendor/regex-automata/data/fowler-tests/repetition.dat b/vendor/regex-automata/data/fowler-tests/repetition.dat
deleted file mode 100644
index 3bb212118..000000000
--- a/vendor/regex-automata/data/fowler-tests/repetition.dat
+++ /dev/null
@@ -1,163 +0,0 @@
-NOTE implicit vs. explicit repetitions : 2009-02-02
-
-# Glenn Fowler <gsf@research.att.com>
-# conforming matches (column 4) must match one of the following BREs
-# NOMATCH
-# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
-# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
-# i.e., each 3-tuple has two identical elements and one (?,?)
-
-E ((..)|(.)) NULL NOMATCH
-E ((..)|(.))((..)|(.)) NULL NOMATCH
-E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
-
-E ((..)|(.)){1} NULL NOMATCH
-E ((..)|(.)){2} NULL NOMATCH
-E ((..)|(.)){3} NULL NOMATCH
-
-E ((..)|(.))* NULL (0,0)
-
-E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
-E ((..)|(.))((..)|(.)) a NOMATCH
-E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
-
-E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
-E ((..)|(.)){2} a NOMATCH
-E ((..)|(.)){3} a NOMATCH
-
-E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
-
-E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
-E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
-
-E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
-E ((..)|(.)){3} aa NOMATCH
-
-E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
-
-E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
-E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
-
-E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
-#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
-E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
-E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
-
-#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
-E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
-
-E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
-
-E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
-#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
-E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
-
-E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
-
-E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
-
-E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
-#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
-E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
-
-#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
-E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
-
-E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
-E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
-
-E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
-E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
-E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
-
-E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
-
-NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
-
-# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
-# Linux/GLIBC gets the {8,} and {8,8} wrong.
-
-:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
-:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
-:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
-:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
-:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
-:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
-:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
-:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
-:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
-#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
-:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
-:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
-:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
-:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
-:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
-:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
-:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
-#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
-:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
-:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
-
-# These test a fixed bug in my regex-tdfa that did not keep the expanded
-# form properly grouped, so right association did the wrong thing with
-# these ambiguous patterns (crafted just to test my code when I became
-# suspicious of my implementation). The first subexpression should use
-# "ab" then "a" then "bcd".
-
-# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
-# results like (0,6)(4,5)(6,6).
-
-:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
-:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
-:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
-:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
-:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
-
-# The above worked on Linux/GLIBC but the following often fail.
-# They also trip up OS X / FreeBSD / NetBSD:
-
-#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
-#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
-:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
-#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
-:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
-#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
-:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
diff --git a/vendor/regex-automata/data/tests/crazy.toml b/vendor/regex-automata/data/tests/crazy.toml
deleted file mode 100644
index 30c4b314d..000000000
--- a/vendor/regex-automata/data/tests/crazy.toml
+++ /dev/null
@@ -1,177 +0,0 @@
-[[tests]]
-name = "crazy-misc1"
-pattern = '[-+]?[0-9]*\.?[0-9]+'
-input = "0.1"
-matches = [[0, 3]]
-
-[[tests]]
-name = "crazy-misc2"
-pattern = '[-+]?[0-9]*\.?[0-9]+'
-input = "0.1.2"
-matches = [[0, 3]]
-
-[[tests]]
-name = "crazy-misc3"
-pattern = '[-+]?[0-9]*\.?[0-9]+'
-input = "a1.2"
-matches = [[1, 4]]
-
-[[tests]]
-options = ["case-insensitive"]
-name = "crazy-misc4"
-pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
-input = "mine is jam.slam@gmail.com "
-matches = [[8, 26]]
-
-[[tests]]
-options = ["case-insensitive"]
-name = "crazy-misc5"
-pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
-input = "mine is jam.slam@gmail "
-matches = []
-
-[[tests]]
-name = "crazy-misc6"
-pattern = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'''
-input = "mine is jam.slam@gmail.com "
-matches = [[8, 26]]
-
-[[tests]]
-name = "crazy-misc7"
-pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
-input = "1900-01-01"
-matches = [[0, 10]]
-
-[[tests]]
-name = "crazy-misc8"
-pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
-input = "1900-00-01"
-matches = []
-
-[[tests]]
-name = "crazy-misc9"
-pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
-input = "1900-13-01"
-matches = []
-
-
-[[tests]]
-name = "crazy-negclass1"
-pattern = "[^ac]"
-input = "acx"
-matches = [[2, 3]]
-
-[[tests]]
-name = "crazy-negclass2"
-pattern = "[^a,]"
-input = "a,x"
-matches = [[2, 3]]
-
-[[tests]]
-name = "crazy-negclass3"
-pattern = '[^a\s]'
-input = "a x"
-matches = [[2, 3]]
-
-[[tests]]
-name = "crazy-negclass4"
-pattern = "[^,]"
-input = ",,x"
-matches = [[2, 3]]
-
-[[tests]]
-name = "crazy-negclass5"
-pattern = '[^\s]'
-input = " a"
-matches = [[1, 2]]
-
-[[tests]]
-name = "crazy-negclass6"
-pattern = '[^,\s]'
-input = ", a"
-matches = [[2, 3]]
-
-[[tests]]
-name = "crazy-negclass7"
-pattern = '[^\s,]'
-input = " ,a"
-matches = [[2, 3]]
-
-[[tests]]
-name = "crazy-negclass8"
-pattern = "[^[:alpha:]Z]"
-input = "A1"
-matches = [[1, 2]]
-
-
-[[tests]]
-name = "crazy-empty-repeat1"
-pattern = "((.*)*?)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat2"
-pattern = "((.?)*?)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat3"
-pattern = "((.*)+?)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat4"
-pattern = "((.?)+?)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat5"
-pattern = "((.*){1,}?)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat6"
-pattern = "((.*){1,2}?)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat7"
-pattern = "((.*)*)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat8"
-pattern = "((.?)*)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat9"
-pattern = "((.*)+)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat10"
-pattern = "((.?)+)="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat11"
-pattern = "((.*){1,})="
-input = "a=b"
-matches = [[0, 2]]
-
-[[tests]]
-name = "crazy-empty-repeat12"
-pattern = "((.*){1,2})="
-input = "a=b"
-matches = [[0, 2]]
diff --git a/vendor/regex-automata/data/tests/flags.toml b/vendor/regex-automata/data/tests/flags.toml
deleted file mode 100644
index 98024d9f7..000000000
--- a/vendor/regex-automata/data/tests/flags.toml
+++ /dev/null
@@ -1,59 +0,0 @@
-[[tests]]
-name = "flags1"
-pattern = "(?i)abc"
-input = "ABC"
-matches = [[0, 3]]
-
-[[tests]]
-name = "flags2"
-pattern = "(?i)a(?-i)bc"
-input = "Abc"
-matches = [[0, 3]]
-
-[[tests]]
-name = "flags3"
-pattern = "(?i)a(?-i)bc"
-input = "ABC"
-matches = []
-
-[[tests]]
-name = "flags4"
-pattern = "(?is)a."
-input = "A\n"
-matches = [[0, 2]]
-
-[[tests]]
-name = "flags5"
-pattern = "(?is)a.(?-is)a."
-input = "A\nab"
-matches = [[0, 4]]
-
-[[tests]]
-name = "flags6"
-pattern = "(?is)a.(?-is)a."
-input = "A\na\n"
-matches = []
-
-[[tests]]
-name = "flags7"
-pattern = "(?is)a.(?-is:a.)?"
-input = "A\na\n"
-matches = [[0, 2]]
-
-[[tests]]
-name = "flags8"
-pattern = "(?U)a+"
-input = "aa"
-matches = [[0, 1]]
-
-[[tests]]
-name = "flags9"
-pattern = "(?U)a+?"
-input = "aa"
-matches = [[0, 2]]
-
-[[tests]]
-name = "flags10"
-pattern = "(?U)(?-U)a+"
-input = "aa"
-matches = [[0, 2]]
diff --git a/vendor/regex-automata/data/tests/fowler/LICENSE b/vendor/regex-automata/data/tests/fowler/LICENSE
deleted file mode 100644
index f47dbf4c4..000000000
--- a/vendor/regex-automata/data/tests/fowler/LICENSE
+++ /dev/null
@@ -1,19 +0,0 @@
-The following license covers testregex.c and all associated test data.
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
-without restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, and/or sell copies of the
-Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following disclaimer:
-
-THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/regex-automata/data/tests/fowler/basic.dat b/vendor/regex-automata/data/tests/fowler/basic.dat
deleted file mode 100644
index e55efaeec..000000000
--- a/vendor/regex-automata/data/tests/fowler/basic.dat
+++ /dev/null
@@ -1,221 +0,0 @@
-NOTE all standard compliant implementations should pass these : 2002-05-31
-
-BE abracadabra$ abracadabracadabra (7,18)
-BE a...b abababbb (2,7)
-BE XXXXXX ..XXXXXX (2,8)
-E \) () (1,2)
-BE a] a]a (0,2)
-B } } (0,1)
-E \} } (0,1)
-BE \] ] (0,1)
-B ] ] (0,1)
-E ] ] (0,1)
-B { { (0,1)
-B } } (0,1)
-BE ^a ax (0,1)
-BE \^a a^a (1,3)
-BE a\^ a^ (0,2)
-BE a$ aa (1,2)
-BE a\$ a$ (0,2)
-BE ^$ NULL (0,0)
-E $^ NULL (0,0)
-E a($) aa (1,2)(2,2)
-E a*(^a) aa (0,1)(0,1)
-E (..)*(...)* a (0,0)
-E (..)*(...)* abcd (0,4)(2,4)
-E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
-E (ab)c|abc abc (0,3)(0,2)
-E a{0}b ab (1,2)
-E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
-E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
-E a{9876543210} NULL BADBR
-E ((a|a)|a) a (0,1)(0,1)(0,1)
-E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
-E a*(a.|aa) aaaa (0,4)(2,4)
-E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
-E (a|b)?.* b (0,1)(0,1)
-E (a|b)c|a(b|c) ac (0,2)(0,1)
-E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
-E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
-E (a|b)*c|(a|ab)*c xc (1,2)
-E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
-E a?(ab|ba)ab abab (0,4)(0,2)
-E a?(ac{0}b|ba)ab abab (0,4)(0,2)
-E ab|abab abbabab (0,2)
-E aba|bab|bba baaabbbaba (5,8)
-E aba|bab baaabbbaba (6,9)
-E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
-E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
-E ab|a xabc (1,3)
-E ab|a xxabc (2,4)
-Ei (Ab|cD)* aBcD (0,4)(2,4)
-BE [^-] --a (2,3)
-BE [a-]* --a (0,3)
-BE [a-m-]* --amoma-- (0,4)
-E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
-E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
-{E [[:upper:]] A (0,1) [[<element>]] not supported
-E [[:lower:]]+ `az{ (1,3)
-E [[:upper:]]+ @AZ[ (1,3)
-# No collation in Go
-#BE [[-]] [[-]] (2,4)
-#BE [[.NIL.]] NULL ECOLLATE
-#BE [[=aleph=]] NULL ECOLLATE
-}
-BE$ \n \n (0,1)
-BEn$ \n \n (0,1)
-BE$ [^a] \n (0,1)
-BE$ \na \na (0,2)
-E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
-BE xxx xxx (0,3)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
-E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
-E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
-E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
-E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
-E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
-E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
-E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
-BE$ .* \x01\x7f (0,2)
-E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
-L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
-E a*a*a*a*a*b aaaaaaaaab (0,10)
-BE ^ NULL (0,0)
-BE $ NULL (0,0)
-BE ^$ NULL (0,0)
-BE ^a$ a (0,1)
-BE abc abc (0,3)
-BE abc xabcy (1,4)
-BE abc ababc (2,5)
-BE ab*c abc (0,3)
-BE ab*bc abc (0,3)
-BE ab*bc abbc (0,4)
-BE ab*bc abbbbc (0,6)
-E ab+bc abbc (0,4)
-E ab+bc abbbbc (0,6)
-E ab?bc abbc (0,4)
-E ab?bc abc (0,3)
-E ab?c abc (0,3)
-BE ^abc$ abc (0,3)
-BE ^abc abcc (0,3)
-BE abc$ aabc (1,4)
-BE ^ abc (0,0)
-BE $ abc (3,3)
-BE a.c abc (0,3)
-BE a.c axc (0,3)
-BE a.*c axyzc (0,5)
-BE a[bc]d abd (0,3)
-BE a[b-d]e ace (0,3)
-BE a[b-d] aac (1,3)
-BE a[-b] a- (0,2)
-BE a[b-] a- (0,2)
-BE a] a] (0,2)
-BE a[]]b a]b (0,3)
-BE a[^bc]d aed (0,3)
-BE a[^-b]c adc (0,3)
-BE a[^]b]c adc (0,3)
-E ab|cd abc (0,2)
-E ab|cd abcd (0,2)
-E a\(b a(b (0,3)
-E a\(*b ab (0,2)
-E a\(*b a((b (0,4)
-E ((a)) abc (0,1)(0,1)(0,1)
-E (a)b(c) abc (0,3)(0,1)(2,3)
-E a+b+c aabbabc (4,7)
-E a* aaa (0,3)
-#E (a*)* - (0,0)(0,0)
-E (a*)* - (0,0)(?,?) RE2/Go
-E (a*)+ - (0,0)(0,0)
-#E (a*|b)* - (0,0)(0,0)
-E (a*|b)* - (0,0)(?,?) RE2/Go
-E (a+|b)* ab (0,2)(1,2)
-E (a+|b)+ ab (0,2)(1,2)
-E (a+|b)? ab (0,1)(0,1)
-BE [^ab]* cde (0,3)
-#E (^)* - (0,0)(0,0)
-E (^)* - (0,0)(?,?) RE2/Go
-BE a* NULL (0,0)
-E ([abc])*d abbbcd (0,6)(4,5)
-E ([abc])*bcd abcd (0,4)(0,1)
-E a|b|c|d|e e (0,1)
-E (a|b|c|d|e)f ef (0,2)(0,1)
-#E ((a*|b))* - (0,0)(0,0)(0,0)
-E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
-BE abcd*efg abcdefg (0,7)
-BE ab* xabyabbbz (1,3)
-BE ab* xayabbbz (1,2)
-E (ab|cd)e abcde (2,5)(2,4)
-BE [abhgefdc]ij hij (0,3)
-E (a|b)c*d abcd (1,4)(1,2)
-E (ab|ab*)bc abc (0,3)(0,1)
-E a([bc]*)c* abc (0,3)(1,3)
-E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
-E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
-E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
-E a[bcd]*dcdcde adcdcde (0,7)
-E (ab|a)b*c abc (0,3)(0,2)
-E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
-BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
-E ^a(bc+|b[eh])g|.h$ abh (1,3)
-E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
-E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
-E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
-E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
-BE multiple words multiple words yeah (0,14)
-E (.*)c(.*) abcde (0,5)(0,2)(3,5)
-BE abcd abcd (0,4)
-E a(bc)d abcd (0,4)(1,3)
-E a[-]?c ac (0,3)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
-E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
-E a+(b|c)*d+ aabcdd (0,6)(3,4)
-E ^.+$ vivi (0,4)
-E ^(.+)$ vivi (0,4)(0,4)
-E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
-E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
-E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
-E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
-E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
-E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
-E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
-E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
-E ((foo)|bar)!bas bar!bas (0,7)(0,3)
-E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
-E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
-E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
-E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
-E (foo|(bar))!bas foo!bas (0,7)(0,3)
-E (foo|bar)!bas bar!bas (0,7)(0,3)
-E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
-E (foo|bar)!bas foo!bas (0,7)(0,3)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
-E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
-E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
-E .*(/XXX).* /XXX (0,4)(0,4)
-E .*(\\XXX).* \XXX (0,4)(0,4)
-E \\XXX \XXX (0,4)
-E .*(/000).* /000 (0,4)(0,4)
-E .*(\\000).* \000 (0,4)(0,4)
-E \\000 \000 (0,4)
diff --git a/vendor/regex-automata/data/tests/fowler/basic.toml b/vendor/regex-automata/data/tests/fowler/basic.toml
deleted file mode 100644
index 3eeebd799..000000000
--- a/vendor/regex-automata/data/tests/fowler/basic.toml
+++ /dev/null
@@ -1,1428 +0,0 @@
-[[tests]]
-name = "basic3"
-options = ['escaped']
-pattern = '''abracadabra$'''
-input = '''abracadabracadabra'''
-matches = [[7, 18]]
-
-[[tests]]
-name = "basic4"
-options = ['escaped']
-pattern = '''a...b'''
-input = '''abababbb'''
-matches = [[2, 7]]
-
-[[tests]]
-name = "basic5"
-options = ['escaped']
-pattern = '''XXXXXX'''
-input = '''..XXXXXX'''
-matches = [[2, 8]]
-
-[[tests]]
-name = "basic6"
-options = ['escaped']
-pattern = '''\)'''
-input = '''()'''
-matches = [[1, 2]]
-
-[[tests]]
-name = "basic7"
-options = ['escaped']
-pattern = '''a]'''
-input = '''a]a'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic9"
-options = ['escaped']
-pattern = '''\}'''
-input = '''}'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic10"
-options = ['escaped']
-pattern = '''\]'''
-input = ''']'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic12"
-options = ['escaped']
-pattern = ''']'''
-input = ''']'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic15"
-options = ['escaped']
-pattern = '''^a'''
-input = '''ax'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic16"
-options = ['escaped']
-pattern = '''\^a'''
-input = '''a^a'''
-matches = [[1, 3]]
-
-[[tests]]
-name = "basic17"
-options = ['escaped']
-pattern = '''a\^'''
-input = '''a^'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic18"
-options = ['escaped']
-pattern = '''a$'''
-input = '''aa'''
-matches = [[1, 2]]
-
-[[tests]]
-name = "basic19"
-options = ['escaped']
-pattern = '''a\$'''
-input = '''a$'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic20"
-options = ['escaped']
-pattern = '''^$'''
-input = ''''''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic21"
-options = ['escaped']
-pattern = '''$^'''
-input = ''''''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic22"
-options = ['escaped']
-pattern = '''a($)'''
-input = '''aa'''
-matches = [[1, 2]]
-
-[[tests]]
-name = "basic23"
-options = ['escaped']
-pattern = '''a*(^a)'''
-input = '''aa'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic24"
-options = ['escaped']
-pattern = '''(..)*(...)*'''
-input = '''a'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic25"
-options = ['escaped']
-pattern = '''(..)*(...)*'''
-input = '''abcd'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic26"
-options = ['escaped']
-pattern = '''(ab|a)(bc|c)'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic27"
-options = ['escaped']
-pattern = '''(ab)c|abc'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic28"
-options = ['escaped']
-pattern = '''a{0}b'''
-input = '''ab'''
-matches = [[1, 2]]
-
-[[tests]]
-name = "basic29"
-options = ['escaped']
-pattern = '''(a*)(b?)(b+)b{3}'''
-input = '''aaabbbbbbb'''
-matches = [[0, 10]]
-
-[[tests]]
-name = "basic30"
-options = ['escaped']
-pattern = '''(a*)(b{0,1})(b{1,})b{3}'''
-input = '''aaabbbbbbb'''
-matches = [[0, 10]]
-
-[[tests]]
-name = "basic32"
-options = ['escaped']
-pattern = '''((a|a)|a)'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic33"
-options = ['escaped']
-pattern = '''(a*)(a|aa)'''
-input = '''aaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic34"
-options = ['escaped']
-pattern = '''a*(a.|aa)'''
-input = '''aaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic35"
-options = ['escaped']
-pattern = '''a(b)|c(d)|a(e)f'''
-input = '''aef'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic36"
-options = ['escaped']
-pattern = '''(a|b)?.*'''
-input = '''b'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic37"
-options = ['escaped']
-pattern = '''(a|b)c|a(b|c)'''
-input = '''ac'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic38"
-options = ['escaped']
-pattern = '''(a|b)c|a(b|c)'''
-input = '''ab'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic39"
-options = ['escaped']
-pattern = '''(a|b)*c|(a|ab)*c'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic40"
-options = ['escaped']
-pattern = '''(a|b)*c|(a|ab)*c'''
-input = '''xc'''
-matches = [[1, 2]]
-
-[[tests]]
-name = "basic41"
-options = ['escaped']
-pattern = '''(.a|.b).*|.*(.a|.b)'''
-input = '''xa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic42"
-options = ['escaped']
-pattern = '''a?(ab|ba)ab'''
-input = '''abab'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic43"
-options = ['escaped']
-pattern = '''a?(ac{0}b|ba)ab'''
-input = '''abab'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic44"
-options = ['escaped']
-pattern = '''ab|abab'''
-input = '''abbabab'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic45"
-options = ['escaped']
-pattern = '''aba|bab|bba'''
-input = '''baaabbbaba'''
-matches = [[5, 8]]
-
-[[tests]]
-name = "basic46"
-options = ['escaped']
-pattern = '''aba|bab'''
-input = '''baaabbbaba'''
-matches = [[6, 9]]
-
-[[tests]]
-name = "basic47"
-options = ['escaped']
-pattern = '''(aa|aaa)*|(a|aaaaa)'''
-input = '''aa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic48"
-options = ['escaped']
-pattern = '''(a.|.a.)*|(a|.a...)'''
-input = '''aa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic49"
-options = ['escaped']
-pattern = '''ab|a'''
-input = '''xabc'''
-matches = [[1, 3]]
-
-[[tests]]
-name = "basic50"
-options = ['escaped']
-pattern = '''ab|a'''
-input = '''xxabc'''
-matches = [[2, 4]]
-
-[[tests]]
-name = "basic51"
-options = ['escaped', 'case-insensitive']
-pattern = '''(Ab|cD)*'''
-input = '''aBcD'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic52"
-options = ['escaped']
-pattern = '''[^-]'''
-input = '''--a'''
-matches = [[2, 3]]
-
-[[tests]]
-name = "basic53"
-options = ['escaped']
-pattern = '''[a-]*'''
-input = '''--a'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic54"
-options = ['escaped']
-pattern = '''[a-m-]*'''
-input = '''--amoma--'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic55"
-options = ['escaped']
-pattern = ''':::1:::0:|:::1:1:0:'''
-input = ''':::0:::1:::1:::0:'''
-matches = [[8, 17]]
-
-[[tests]]
-name = "basic56"
-options = ['escaped']
-pattern = ''':::1:::0:|:::1:1:1:'''
-input = ''':::0:::1:::1:::0:'''
-matches = [[8, 17]]
-
-[[tests]]
-name = "basic57"
-options = ['escaped']
-pattern = '''[[:upper:]]'''
-input = '''A'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic58"
-options = ['escaped']
-pattern = '''[[:lower:]]+'''
-input = '''`az{'''
-matches = [[1, 3]]
-
-[[tests]]
-name = "basic59"
-options = ['escaped']
-pattern = '''[[:upper:]]+'''
-input = '''@AZ['''
-matches = [[1, 3]]
-
-[[tests]]
-name = "basic65"
-options = ['escaped']
-pattern = '''\n'''
-input = '''\n'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic66"
-options = ['escaped']
-pattern = '''\n'''
-input = '''\n'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic67"
-options = ['escaped']
-pattern = '''[^a]'''
-input = '''\n'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic68"
-options = ['escaped']
-pattern = '''\na'''
-input = '''\na'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic69"
-options = ['escaped']
-pattern = '''(a)(b)(c)'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic70"
-options = ['escaped']
-pattern = '''xxx'''
-input = '''xxx'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic71"
-options = ['escaped']
-pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
-input = '''feb 6,'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "basic72"
-options = ['escaped']
-pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
-input = '''2/7'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic73"
-options = ['escaped']
-pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
-input = '''feb 1,Feb 6'''
-matches = [[5, 11]]
-
-[[tests]]
-name = "basic74"
-options = ['escaped']
-pattern = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))'''
-input = '''x'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic75"
-options = ['escaped']
-pattern = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*'''
-input = '''xx'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic76"
-options = ['escaped']
-pattern = '''a?(ab|ba)*'''
-input = '''ababababababababababababababababababababababababababababababababababababababababa'''
-matches = [[0, 81]]
-
-[[tests]]
-name = "basic77"
-options = ['escaped']
-pattern = '''abaa|abbaa|abbbaa|abbbbaa'''
-input = '''ababbabbbabbbabbbbabbbbaa'''
-matches = [[18, 25]]
-
-[[tests]]
-name = "basic78"
-options = ['escaped']
-pattern = '''abaa|abbaa|abbbaa|abbbbaa'''
-input = '''ababbabbbabbbabbbbabaa'''
-matches = [[18, 22]]
-
-[[tests]]
-name = "basic79"
-options = ['escaped']
-pattern = '''aaac|aabc|abac|abbc|baac|babc|bbac|bbbc'''
-input = '''baaabbbabac'''
-matches = [[7, 11]]
-
-[[tests]]
-name = "basic80"
-options = ['escaped']
-pattern = '''.*'''
-input = '''\x01\x7f'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic81"
-options = ['escaped']
-pattern = '''aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll'''
-input = '''XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa'''
-matches = [[53, 57]]
-
-[[tests]]
-name = "basic83"
-options = ['escaped']
-pattern = '''a*a*a*a*a*b'''
-input = '''aaaaaaaaab'''
-matches = [[0, 10]]
-
-[[tests]]
-name = "basic84"
-options = ['escaped']
-pattern = '''^'''
-input = ''''''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic85"
-options = ['escaped']
-pattern = '''$'''
-input = ''''''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic86"
-options = ['escaped']
-pattern = '''^$'''
-input = ''''''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic87"
-options = ['escaped']
-pattern = '''^a$'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic88"
-options = ['escaped']
-pattern = '''abc'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic89"
-options = ['escaped']
-pattern = '''abc'''
-input = '''xabcy'''
-matches = [[1, 4]]
-
-[[tests]]
-name = "basic90"
-options = ['escaped']
-pattern = '''abc'''
-input = '''ababc'''
-matches = [[2, 5]]
-
-[[tests]]
-name = "basic91"
-options = ['escaped']
-pattern = '''ab*c'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic92"
-options = ['escaped']
-pattern = '''ab*bc'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic93"
-options = ['escaped']
-pattern = '''ab*bc'''
-input = '''abbc'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic94"
-options = ['escaped']
-pattern = '''ab*bc'''
-input = '''abbbbc'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "basic95"
-options = ['escaped']
-pattern = '''ab+bc'''
-input = '''abbc'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic96"
-options = ['escaped']
-pattern = '''ab+bc'''
-input = '''abbbbc'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "basic97"
-options = ['escaped']
-pattern = '''ab?bc'''
-input = '''abbc'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic98"
-options = ['escaped']
-pattern = '''ab?bc'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic99"
-options = ['escaped']
-pattern = '''ab?c'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic100"
-options = ['escaped']
-pattern = '''^abc$'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic101"
-options = ['escaped']
-pattern = '''^abc'''
-input = '''abcc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic102"
-options = ['escaped']
-pattern = '''abc$'''
-input = '''aabc'''
-matches = [[1, 4]]
-
-[[tests]]
-name = "basic103"
-options = ['escaped']
-pattern = '''^'''
-input = '''abc'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic104"
-options = ['escaped']
-pattern = '''$'''
-input = '''abc'''
-matches = [[3, 3]]
-
-[[tests]]
-name = "basic105"
-options = ['escaped']
-pattern = '''a.c'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic106"
-options = ['escaped']
-pattern = '''a.c'''
-input = '''axc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic107"
-options = ['escaped']
-pattern = '''a.*c'''
-input = '''axyzc'''
-matches = [[0, 5]]
-
-[[tests]]
-name = "basic108"
-options = ['escaped']
-pattern = '''a[bc]d'''
-input = '''abd'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic109"
-options = ['escaped']
-pattern = '''a[b-d]e'''
-input = '''ace'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic110"
-options = ['escaped']
-pattern = '''a[b-d]'''
-input = '''aac'''
-matches = [[1, 3]]
-
-[[tests]]
-name = "basic111"
-options = ['escaped']
-pattern = '''a[-b]'''
-input = '''a-'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic112"
-options = ['escaped']
-pattern = '''a[b-]'''
-input = '''a-'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic113"
-options = ['escaped']
-pattern = '''a]'''
-input = '''a]'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic114"
-options = ['escaped']
-pattern = '''a[]]b'''
-input = '''a]b'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic115"
-options = ['escaped']
-pattern = '''a[^bc]d'''
-input = '''aed'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic116"
-options = ['escaped']
-pattern = '''a[^-b]c'''
-input = '''adc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic117"
-options = ['escaped']
-pattern = '''a[^]b]c'''
-input = '''adc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic118"
-options = ['escaped']
-pattern = '''ab|cd'''
-input = '''abc'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic119"
-options = ['escaped']
-pattern = '''ab|cd'''
-input = '''abcd'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic120"
-options = ['escaped']
-pattern = '''a\(b'''
-input = '''a(b'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic121"
-options = ['escaped']
-pattern = '''a\(*b'''
-input = '''ab'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic122"
-options = ['escaped']
-pattern = '''a\(*b'''
-input = '''a((b'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic123"
-options = ['escaped']
-pattern = '''((a))'''
-input = '''abc'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic124"
-options = ['escaped']
-pattern = '''(a)b(c)'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic125"
-options = ['escaped']
-pattern = '''a+b+c'''
-input = '''aabbabc'''
-matches = [[4, 7]]
-
-[[tests]]
-name = "basic126"
-options = ['escaped']
-pattern = '''a*'''
-input = '''aaa'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic128"
-options = ['escaped']
-pattern = '''(a*)*'''
-input = '''-'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic129"
-options = ['escaped']
-pattern = '''(a*)+'''
-input = '''-'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic131"
-options = ['escaped']
-pattern = '''(a*|b)*'''
-input = '''-'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic132"
-options = ['escaped']
-pattern = '''(a+|b)*'''
-input = '''ab'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic133"
-options = ['escaped']
-pattern = '''(a+|b)+'''
-input = '''ab'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic134"
-options = ['escaped']
-pattern = '''(a+|b)?'''
-input = '''ab'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic135"
-options = ['escaped']
-pattern = '''[^ab]*'''
-input = '''cde'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic137"
-options = ['escaped']
-pattern = '''(^)*'''
-input = '''-'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic138"
-options = ['escaped']
-pattern = '''a*'''
-input = ''''''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic139"
-options = ['escaped']
-pattern = '''([abc])*d'''
-input = '''abbbcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "basic140"
-options = ['escaped']
-pattern = '''([abc])*bcd'''
-input = '''abcd'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic141"
-options = ['escaped']
-pattern = '''a|b|c|d|e'''
-input = '''e'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic142"
-options = ['escaped']
-pattern = '''(a|b|c|d|e)f'''
-input = '''ef'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic144"
-options = ['escaped']
-pattern = '''((a*|b))*'''
-input = '''-'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "basic145"
-options = ['escaped']
-pattern = '''abcd*efg'''
-input = '''abcdefg'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic146"
-options = ['escaped']
-pattern = '''ab*'''
-input = '''xabyabbbz'''
-matches = [[1, 3]]
-
-[[tests]]
-name = "basic147"
-options = ['escaped']
-pattern = '''ab*'''
-input = '''xayabbbz'''
-matches = [[1, 2]]
-
-[[tests]]
-name = "basic148"
-options = ['escaped']
-pattern = '''(ab|cd)e'''
-input = '''abcde'''
-matches = [[2, 5]]
-
-[[tests]]
-name = "basic149"
-options = ['escaped']
-pattern = '''[abhgefdc]ij'''
-input = '''hij'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic150"
-options = ['escaped']
-pattern = '''(a|b)c*d'''
-input = '''abcd'''
-matches = [[1, 4]]
-
-[[tests]]
-name = "basic151"
-options = ['escaped']
-pattern = '''(ab|ab*)bc'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic152"
-options = ['escaped']
-pattern = '''a([bc]*)c*'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic153"
-options = ['escaped']
-pattern = '''a([bc]*)(c*d)'''
-input = '''abcd'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic154"
-options = ['escaped']
-pattern = '''a([bc]+)(c*d)'''
-input = '''abcd'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic155"
-options = ['escaped']
-pattern = '''a([bc]*)(c+d)'''
-input = '''abcd'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic156"
-options = ['escaped']
-pattern = '''a[bcd]*dcdcde'''
-input = '''adcdcde'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic157"
-options = ['escaped']
-pattern = '''(ab|a)b*c'''
-input = '''abc'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic158"
-options = ['escaped']
-pattern = '''((a)(b)c)(d)'''
-input = '''abcd'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic159"
-options = ['escaped']
-pattern = '''[A-Za-z_][A-Za-z0-9_]*'''
-input = '''alpha'''
-matches = [[0, 5]]
-
-[[tests]]
-name = "basic160"
-options = ['escaped']
-pattern = '''^a(bc+|b[eh])g|.h$'''
-input = '''abh'''
-matches = [[1, 3]]
-
-[[tests]]
-name = "basic161"
-options = ['escaped']
-pattern = '''(bc+d$|ef*g.|h?i(j|k))'''
-input = '''effgz'''
-matches = [[0, 5]]
-
-[[tests]]
-name = "basic162"
-options = ['escaped']
-pattern = '''(bc+d$|ef*g.|h?i(j|k))'''
-input = '''ij'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "basic163"
-options = ['escaped']
-pattern = '''(bc+d$|ef*g.|h?i(j|k))'''
-input = '''reffgz'''
-matches = [[1, 6]]
-
-[[tests]]
-name = "basic164"
-options = ['escaped']
-pattern = '''(((((((((a)))))))))'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "basic165"
-options = ['escaped']
-pattern = '''multiple words'''
-input = '''multiple words yeah'''
-matches = [[0, 14]]
-
-[[tests]]
-name = "basic166"
-options = ['escaped']
-pattern = '''(.*)c(.*)'''
-input = '''abcde'''
-matches = [[0, 5]]
-
-[[tests]]
-name = "basic167"
-options = ['escaped']
-pattern = '''abcd'''
-input = '''abcd'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic168"
-options = ['escaped']
-pattern = '''a(bc)d'''
-input = '''abcd'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic169"
-options = ['escaped']
-pattern = '''a[\x01-\x03]?c'''
-input = '''a\x02c'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic170"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Qaddafi'''
-matches = [[0, 15]]
-
-[[tests]]
-name = "basic171"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Mo'ammar Gadhafi'''
-matches = [[0, 16]]
-
-[[tests]]
-name = "basic172"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Kaddafi'''
-matches = [[0, 15]]
-
-[[tests]]
-name = "basic173"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Qadhafi'''
-matches = [[0, 15]]
-
-[[tests]]
-name = "basic174"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Gadafi'''
-matches = [[0, 14]]
-
-[[tests]]
-name = "basic175"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Mu'ammar Qadafi'''
-matches = [[0, 15]]
-
-[[tests]]
-name = "basic176"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Moamar Gaddafi'''
-matches = [[0, 14]]
-
-[[tests]]
-name = "basic177"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Mu'ammar Qadhdhafi'''
-matches = [[0, 18]]
-
-[[tests]]
-name = "basic178"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Khaddafi'''
-matches = [[0, 16]]
-
-[[tests]]
-name = "basic179"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Ghaddafy'''
-matches = [[0, 16]]
-
-[[tests]]
-name = "basic180"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Ghadafi'''
-matches = [[0, 15]]
-
-[[tests]]
-name = "basic181"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Ghaddafi'''
-matches = [[0, 16]]
-
-[[tests]]
-name = "basic182"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muamar Kaddafi'''
-matches = [[0, 14]]
-
-[[tests]]
-name = "basic183"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Quathafi'''
-matches = [[0, 16]]
-
-[[tests]]
-name = "basic184"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Muammar Gheddafi'''
-matches = [[0, 16]]
-
-[[tests]]
-name = "basic185"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Moammar Khadafy'''
-matches = [[0, 15]]
-
-[[tests]]
-name = "basic186"
-options = ['escaped']
-pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
-input = '''Moammar Qudhafi'''
-matches = [[0, 15]]
-
-[[tests]]
-name = "basic187"
-options = ['escaped']
-pattern = '''a+(b|c)*d+'''
-input = '''aabcdd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "basic188"
-options = ['escaped']
-pattern = '''^.+$'''
-input = '''vivi'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic189"
-options = ['escaped']
-pattern = '''^(.+)$'''
-input = '''vivi'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic190"
-options = ['escaped']
-pattern = '''^([^!.]+).att.com!(.+)$'''
-input = '''gryphon.att.com!eby'''
-matches = [[0, 19]]
-
-[[tests]]
-name = "basic191"
-options = ['escaped']
-pattern = '''^([^!]+!)?([^!]+)$'''
-input = '''bas'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic192"
-options = ['escaped']
-pattern = '''^([^!]+!)?([^!]+)$'''
-input = '''bar!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic193"
-options = ['escaped']
-pattern = '''^([^!]+!)?([^!]+)$'''
-input = '''foo!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic194"
-options = ['escaped']
-pattern = '''^.+!([^!]+!)([^!]+)$'''
-input = '''foo!bar!bas'''
-matches = [[0, 11]]
-
-[[tests]]
-name = "basic195"
-options = ['escaped']
-pattern = '''((foo)|(bar))!bas'''
-input = '''bar!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic196"
-options = ['escaped']
-pattern = '''((foo)|(bar))!bas'''
-input = '''foo!bar!bas'''
-matches = [[4, 11]]
-
-[[tests]]
-name = "basic197"
-options = ['escaped']
-pattern = '''((foo)|(bar))!bas'''
-input = '''foo!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic198"
-options = ['escaped']
-pattern = '''((foo)|bar)!bas'''
-input = '''bar!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic199"
-options = ['escaped']
-pattern = '''((foo)|bar)!bas'''
-input = '''foo!bar!bas'''
-matches = [[4, 11]]
-
-[[tests]]
-name = "basic200"
-options = ['escaped']
-pattern = '''((foo)|bar)!bas'''
-input = '''foo!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic201"
-options = ['escaped']
-pattern = '''(foo|(bar))!bas'''
-input = '''bar!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic202"
-options = ['escaped']
-pattern = '''(foo|(bar))!bas'''
-input = '''foo!bar!bas'''
-matches = [[4, 11]]
-
-[[tests]]
-name = "basic203"
-options = ['escaped']
-pattern = '''(foo|(bar))!bas'''
-input = '''foo!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic204"
-options = ['escaped']
-pattern = '''(foo|bar)!bas'''
-input = '''bar!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic205"
-options = ['escaped']
-pattern = '''(foo|bar)!bas'''
-input = '''foo!bar!bas'''
-matches = [[4, 11]]
-
-[[tests]]
-name = "basic206"
-options = ['escaped']
-pattern = '''(foo|bar)!bas'''
-input = '''foo!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic207"
-options = ['escaped']
-pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''foo!bar!bas'''
-matches = [[0, 11]]
-
-[[tests]]
-name = "basic208"
-options = ['escaped']
-pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
-input = '''bas'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic209"
-options = ['escaped']
-pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
-input = '''bar!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic210"
-options = ['escaped']
-pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
-input = '''foo!bar!bas'''
-matches = [[0, 11]]
-
-[[tests]]
-name = "basic211"
-options = ['escaped']
-pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
-input = '''foo!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic212"
-options = ['escaped']
-pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''bas'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "basic213"
-options = ['escaped']
-pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''bar!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic214"
-options = ['escaped']
-pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''foo!bar!bas'''
-matches = [[0, 11]]
-
-[[tests]]
-name = "basic215"
-options = ['escaped']
-pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
-input = '''foo!bas'''
-matches = [[0, 7]]
-
-[[tests]]
-name = "basic216"
-options = ['escaped']
-pattern = '''.*(/XXX).*'''
-input = '''/XXX'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic217"
-options = ['escaped']
-pattern = '''.*(\\XXX).*'''
-input = '''\\XXX'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic218"
-options = ['escaped']
-pattern = '''\\XXX'''
-input = '''\\XXX'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic219"
-options = ['escaped']
-pattern = '''.*(/000).*'''
-input = '''/000'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic220"
-options = ['escaped']
-pattern = '''.*(\\000).*'''
-input = '''\\000'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "basic221"
-options = ['escaped']
-pattern = '''\\000'''
-input = '''\\000'''
-matches = [[0, 4]]
-
diff --git a/vendor/regex-automata/data/tests/fowler/fowler-to-toml b/vendor/regex-automata/data/tests/fowler/fowler-to-toml
deleted file mode 100755
index 5f1d91fcb..000000000
--- a/vendor/regex-automata/data/tests/fowler/fowler-to-toml
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import absolute_import, division, print_function
-import argparse
-import os.path as path
-
-
-def read_tests(f):
- basename, _ = path.splitext(path.basename(f))
- tests = []
- prev_pattern = None
-
- for lineno, line in enumerate(open(f), 1):
- fields = list(filter(None, map(str.strip, line.split('\t'))))
- if not (4 <= len(fields) <= 5) \
- or 'E' not in fields[0] or fields[0][0] == '#':
- continue
-
- terse_opts, pat, text, sgroups = fields[0:4]
- groups = [] # groups as integer ranges
- if sgroups == 'NOMATCH':
- groups = []
- elif ',' in sgroups:
- noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
- for g in noparen:
- s, e = map(str.strip, g.split(','))
- groups.append([int(s), int(e)])
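-                # Stop after the first span: the generated TOML tests record
-                # only the overall match, not the subgroup spans.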
- break
- else:
- # This skips tests that should result in an error.
- # There aren't many, so I think we can just capture those
- # manually. Possibly fix this in future.
- continue
-
- opts = []
- if text == "NULL":
- text = ""
- if pat == 'SAME':
- pat = prev_pattern
- if '$' in terse_opts:
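-            # A '$' in the terse opts means the pattern and input contain
-            # C-style escapes: interpret them, then re-escape the input so
-            # the emitted TOML stays printable; the 'escaped' test option
-            # presumably reverses this when the tests are loaded.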
- pat = pat.encode('utf-8').decode('unicode_escape')
- text = text.encode('utf-8').decode('unicode_escape')
- text = text.encode('unicode_escape').decode('utf-8')
- opts.append('escaped')
- else:
- opts.append('escaped')
- text = text.encode('unicode_escape').decode('utf-8')
- if 'i' in terse_opts:
- opts.append('case-insensitive')
-
- pat = pat.encode('unicode_escape').decode('utf-8')
- pat = pat.replace('\\\\', '\\')
- tests.append({
- 'name': '"%s%d"' % (basename, lineno),
- 'options': repr(opts),
- 'pattern': "'''%s'''" % pat,
- 'input': "'''%s'''" % text,
- 'matches': str(groups),
- })
- prev_pattern = pat
- return tests
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
- description='Generate match tests from an AT&T POSIX test file.')
- aa = parser.add_argument
- aa('datfile', help='A dat AT&T POSIX test file.')
- args = parser.parse_args()
-
- tests = read_tests(args.datfile)
- for t in tests:
- print('[[tests]]')
- for k, v in t.items():
- print('%s = %s' % (k, v))
- print('')
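
The deleted script above reads a single AT&T POSIX .dat file and prints the
corresponding TOML test suite to stdout, which is evidently how the
basic.toml and nullsubexpr.toml files in this diff were generated. A minimal
regeneration sketch, assuming Python 3 and that the script keeps the
executable bit recorded in its file mode above:

    import subprocess

    # Hypothetical one-off regeneration: fowler-to-toml writes TOML to
    # stdout, so redirect it into the matching .toml file.
    with open("nullsubexpr.toml", "w") as out:
        subprocess.run(
            ["./fowler-to-toml", "nullsubexpr.dat"],
            stdout=out,
            check=True,
        )
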
diff --git a/vendor/regex-automata/data/tests/fowler/nullsubexpr.dat b/vendor/regex-automata/data/tests/fowler/nullsubexpr.dat
deleted file mode 100644
index 2e18fbb91..000000000
--- a/vendor/regex-automata/data/tests/fowler/nullsubexpr.dat
+++ /dev/null
@@ -1,79 +0,0 @@
-NOTE null subexpression matches : 2002-06-06
-
-E (a*)* a (0,1)(0,1)
-#E SAME x (0,0)(0,0)
-E SAME x (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a*)+ a (0,1)(0,1)
-E SAME x (0,0)(0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a+)* a (0,1)(0,1)
-E SAME x (0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E (a+)+ a (0,1)(0,1)
-E SAME x NOMATCH
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-
-E ([a]*)* a (0,1)(0,1)
-#E SAME x (0,0)(0,0)
-E SAME x (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E ([a]*)+ a (0,1)(0,1)
-E SAME x (0,0)(0,0)
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaax (0,6)(0,6)
-E ([^b]*)* a (0,1)(0,1)
-#E SAME b (0,0)(0,0)
-E SAME b (0,0)(?,?) RE2/Go
-E SAME aaaaaa (0,6)(0,6)
-E SAME aaaaaab (0,6)(0,6)
-E ([ab]*)* a (0,1)(0,1)
-E SAME aaaaaa (0,6)(0,6)
-E SAME ababab (0,6)(0,6)
-E SAME bababa (0,6)(0,6)
-E SAME b (0,1)(0,1)
-E SAME bbbbbb (0,6)(0,6)
-E SAME aaaabcde (0,5)(0,5)
-E ([^a]*)* b (0,1)(0,1)
-E SAME bbbbbb (0,6)(0,6)
-#E SAME aaaaaa (0,0)(0,0)
-E SAME aaaaaa (0,0)(?,?) RE2/Go
-E ([^ab]*)* ccccxx (0,6)(0,6)
-#E SAME ababab (0,0)(0,0)
-E SAME ababab (0,0)(?,?) RE2/Go
-
-E ((z)+|a)* zabcde (0,2)(1,2)
-
-#{E a+? aaaaaa (0,1) no *? +? minimal match ops
-#E (a) aaa (0,1)(0,1)
-#E (a*?) aaa (0,0)(0,0)
-#E (a)*? aaa (0,0)
-#E (a*?)*? aaa (0,0)
-#}
-
-B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
-B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
-B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
-B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
-B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
-B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
-B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
-B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
-
-#E (a*)*(x) x (0,1)(0,0)(0,1)
-E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
-E (a*)*(x) ax (0,2)(0,1)(1,2)
-E (a*)*(x) axa (0,2)(0,1)(1,2)
-
-E (a*)+(x) x (0,1)(0,0)(0,1)
-E (a*)+(x) ax (0,2)(0,1)(1,2)
-E (a*)+(x) axa (0,2)(0,1)(1,2)
-
-E (a*){2}(x) x (0,1)(0,0)(0,1)
-E (a*){2}(x) ax (0,2)(1,1)(1,2)
-E (a*){2}(x) axa (0,2)(1,1)(1,2)
diff --git a/vendor/regex-automata/data/tests/fowler/nullsubexpr.toml b/vendor/regex-automata/data/tests/fowler/nullsubexpr.toml
deleted file mode 100644
index 331067c60..000000000
--- a/vendor/regex-automata/data/tests/fowler/nullsubexpr.toml
+++ /dev/null
@@ -1,350 +0,0 @@
-[[tests]]
-name = "nullsubexpr3"
-options = ['escaped']
-pattern = '''(a*)*'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr5"
-options = ['escaped']
-pattern = '''(a*)*'''
-input = '''x'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "nullsubexpr6"
-options = ['escaped']
-pattern = '''(a*)*'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr7"
-options = ['escaped']
-pattern = '''(a*)*'''
-input = '''aaaaaax'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr8"
-options = ['escaped']
-pattern = '''(a*)+'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr9"
-options = ['escaped']
-pattern = '''(a*)+'''
-input = '''x'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "nullsubexpr10"
-options = ['escaped']
-pattern = '''(a*)+'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr11"
-options = ['escaped']
-pattern = '''(a*)+'''
-input = '''aaaaaax'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr12"
-options = ['escaped']
-pattern = '''(a+)*'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr13"
-options = ['escaped']
-pattern = '''(a+)*'''
-input = '''x'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "nullsubexpr14"
-options = ['escaped']
-pattern = '''(a+)*'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr15"
-options = ['escaped']
-pattern = '''(a+)*'''
-input = '''aaaaaax'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr16"
-options = ['escaped']
-pattern = '''(a+)+'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr17"
-options = ['escaped']
-pattern = '''(a+)+'''
-input = '''x'''
-matches = []
-
-[[tests]]
-name = "nullsubexpr18"
-options = ['escaped']
-pattern = '''(a+)+'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr19"
-options = ['escaped']
-pattern = '''(a+)+'''
-input = '''aaaaaax'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr21"
-options = ['escaped']
-pattern = '''([a]*)*'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr23"
-options = ['escaped']
-pattern = '''([a]*)*'''
-input = '''x'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "nullsubexpr24"
-options = ['escaped']
-pattern = '''([a]*)*'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr25"
-options = ['escaped']
-pattern = '''([a]*)*'''
-input = '''aaaaaax'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr26"
-options = ['escaped']
-pattern = '''([a]*)+'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr27"
-options = ['escaped']
-pattern = '''([a]*)+'''
-input = '''x'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "nullsubexpr28"
-options = ['escaped']
-pattern = '''([a]*)+'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr29"
-options = ['escaped']
-pattern = '''([a]*)+'''
-input = '''aaaaaax'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr30"
-options = ['escaped']
-pattern = '''([^b]*)*'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr32"
-options = ['escaped']
-pattern = '''([^b]*)*'''
-input = '''b'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "nullsubexpr33"
-options = ['escaped']
-pattern = '''([^b]*)*'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr34"
-options = ['escaped']
-pattern = '''([^b]*)*'''
-input = '''aaaaaab'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr35"
-options = ['escaped']
-pattern = '''([ab]*)*'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr36"
-options = ['escaped']
-pattern = '''([ab]*)*'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr37"
-options = ['escaped']
-pattern = '''([ab]*)*'''
-input = '''ababab'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr38"
-options = ['escaped']
-pattern = '''([ab]*)*'''
-input = '''bababa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr39"
-options = ['escaped']
-pattern = '''([ab]*)*'''
-input = '''b'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr40"
-options = ['escaped']
-pattern = '''([ab]*)*'''
-input = '''bbbbbb'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr41"
-options = ['escaped']
-pattern = '''([ab]*)*'''
-input = '''aaaabcde'''
-matches = [[0, 5]]
-
-[[tests]]
-name = "nullsubexpr42"
-options = ['escaped']
-pattern = '''([^a]*)*'''
-input = '''b'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr43"
-options = ['escaped']
-pattern = '''([^a]*)*'''
-input = '''bbbbbb'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr45"
-options = ['escaped']
-pattern = '''([^a]*)*'''
-input = '''aaaaaa'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "nullsubexpr46"
-options = ['escaped']
-pattern = '''([^ab]*)*'''
-input = '''ccccxx'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "nullsubexpr48"
-options = ['escaped']
-pattern = '''([^ab]*)*'''
-input = '''ababab'''
-matches = [[0, 0]]
-
-[[tests]]
-name = "nullsubexpr50"
-options = ['escaped']
-pattern = '''((z)+|a)*'''
-input = '''zabcde'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "nullsubexpr69"
-options = ['escaped']
-pattern = '''(a*)*(x)'''
-input = '''x'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr70"
-options = ['escaped']
-pattern = '''(a*)*(x)'''
-input = '''ax'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "nullsubexpr71"
-options = ['escaped']
-pattern = '''(a*)*(x)'''
-input = '''axa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "nullsubexpr73"
-options = ['escaped']
-pattern = '''(a*)+(x)'''
-input = '''x'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr74"
-options = ['escaped']
-pattern = '''(a*)+(x)'''
-input = '''ax'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "nullsubexpr75"
-options = ['escaped']
-pattern = '''(a*)+(x)'''
-input = '''axa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "nullsubexpr77"
-options = ['escaped']
-pattern = '''(a*){2}(x)'''
-input = '''x'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "nullsubexpr78"
-options = ['escaped']
-pattern = '''(a*){2}(x)'''
-input = '''ax'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "nullsubexpr79"
-options = ['escaped']
-pattern = '''(a*){2}(x)'''
-input = '''axa'''
-matches = [[0, 2]]
-
diff --git a/vendor/regex-automata/data/tests/fowler/repetition-long.toml b/vendor/regex-automata/data/tests/fowler/repetition-long.toml
deleted file mode 100644
index e0b2ea76b..000000000
--- a/vendor/regex-automata/data/tests/fowler/repetition-long.toml
+++ /dev/null
@@ -1,294 +0,0 @@
-[[tests]]
-name = "repetition-long12"
-options = ['escaped']
-pattern = '''X(.?){0,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long13"
-options = ['escaped']
-pattern = '''X(.?){1,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long14"
-options = ['escaped']
-pattern = '''X(.?){2,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long15"
-options = ['escaped']
-pattern = '''X(.?){3,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long16"
-options = ['escaped']
-pattern = '''X(.?){4,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long17"
-options = ['escaped']
-pattern = '''X(.?){5,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long18"
-options = ['escaped']
-pattern = '''X(.?){6,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long19"
-options = ['escaped']
-pattern = '''X(.?){7,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long20"
-options = ['escaped']
-pattern = '''X(.?){8,}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long22"
-options = ['escaped']
-pattern = '''X(.?){0,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long24"
-options = ['escaped']
-pattern = '''X(.?){1,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long26"
-options = ['escaped']
-pattern = '''X(.?){2,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long28"
-options = ['escaped']
-pattern = '''X(.?){3,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long30"
-options = ['escaped']
-pattern = '''X(.?){4,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long32"
-options = ['escaped']
-pattern = '''X(.?){5,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long34"
-options = ['escaped']
-pattern = '''X(.?){6,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long36"
-options = ['escaped']
-pattern = '''X(.?){7,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long37"
-options = ['escaped']
-pattern = '''X(.?){8,8}Y'''
-input = '''X1234567Y'''
-matches = [[0, 9]]
-
-[[tests]]
-name = "repetition-long48"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){0,}(d*)'''
-input = '''ababcd'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition-long49"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){1,}(d*)'''
-input = '''ababcd'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition-long50"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){2,}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long51"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){3,}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long52"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){4,}(d*)'''
-input = '''ababcd'''
-matches = []
-
-[[tests]]
-name = "repetition-long53"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){0,10}(d*)'''
-input = '''ababcd'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition-long54"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){1,10}(d*)'''
-input = '''ababcd'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition-long55"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){2,10}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long56"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){3,10}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long57"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd){4,10}(d*)'''
-input = '''ababcd'''
-matches = []
-
-[[tests]]
-name = "repetition-long58"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd)*(d*)'''
-input = '''ababcd'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition-long59"
-options = ['escaped']
-pattern = '''(a|ab|c|bcd)+(d*)'''
-input = '''ababcd'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition-long65"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){0,}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long67"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){1,}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long69"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){2,}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long71"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){3,}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long72"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){4,}(d*)'''
-input = '''ababcd'''
-matches = []
-
-[[tests]]
-name = "repetition-long74"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){0,10}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long76"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){1,10}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long78"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){2,10}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long80"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){3,10}(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long81"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd){4,10}(d*)'''
-input = '''ababcd'''
-matches = []
-
-[[tests]]
-name = "repetition-long83"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd)*(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition-long85"
-options = ['escaped']
-pattern = '''(ab|a|c|bcd)+(d*)'''
-input = '''ababcd'''
-matches = [[0, 6]]
-
diff --git a/vendor/regex-automata/data/tests/fowler/repetition.toml b/vendor/regex-automata/data/tests/fowler/repetition.toml
deleted file mode 100644
index 43280a409..000000000
--- a/vendor/regex-automata/data/tests/fowler/repetition.toml
+++ /dev/null
@@ -1,343 +0,0 @@
-[[tests]]
-name = "repetition10"
-options = ['escaped']
-pattern = '''((..)|(.))'''
-input = ''''''
-matches = []
-
-[[tests]]
-name = "repetition11"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))'''
-input = ''''''
-matches = []
-
-[[tests]]
-name = "repetition12"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = ''''''
-matches = []
-
-[[tests]]
-name = "repetition14"
-options = ['escaped']
-pattern = '''((..)|(.)){1}'''
-input = ''''''
-matches = []
-
-[[tests]]
-name = "repetition15"
-options = ['escaped']
-pattern = '''((..)|(.)){2}'''
-input = ''''''
-matches = []
-
-[[tests]]
-name = "repetition16"
-options = ['escaped']
-pattern = '''((..)|(.)){3}'''
-input = ''''''
-matches = []
-
-[[tests]]
-name = "repetition18"
-options = ['escaped']
-pattern = '''((..)|(.))*'''
-input = ''''''
-matches = [[0, 0]]
-
-[[tests]]
-name = "repetition20"
-options = ['escaped']
-pattern = '''((..)|(.))'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition21"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))'''
-input = '''a'''
-matches = []
-
-[[tests]]
-name = "repetition22"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''a'''
-matches = []
-
-[[tests]]
-name = "repetition24"
-options = ['escaped']
-pattern = '''((..)|(.)){1}'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition25"
-options = ['escaped']
-pattern = '''((..)|(.)){2}'''
-input = '''a'''
-matches = []
-
-[[tests]]
-name = "repetition26"
-options = ['escaped']
-pattern = '''((..)|(.)){3}'''
-input = '''a'''
-matches = []
-
-[[tests]]
-name = "repetition28"
-options = ['escaped']
-pattern = '''((..)|(.))*'''
-input = '''a'''
-matches = [[0, 1]]
-
-[[tests]]
-name = "repetition30"
-options = ['escaped']
-pattern = '''((..)|(.))'''
-input = '''aa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition31"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))'''
-input = '''aa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition32"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aa'''
-matches = []
-
-[[tests]]
-name = "repetition34"
-options = ['escaped']
-pattern = '''((..)|(.)){1}'''
-input = '''aa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition35"
-options = ['escaped']
-pattern = '''((..)|(.)){2}'''
-input = '''aa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition36"
-options = ['escaped']
-pattern = '''((..)|(.)){3}'''
-input = '''aa'''
-matches = []
-
-[[tests]]
-name = "repetition38"
-options = ['escaped']
-pattern = '''((..)|(.))*'''
-input = '''aa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition40"
-options = ['escaped']
-pattern = '''((..)|(.))'''
-input = '''aaa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition41"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))'''
-input = '''aaa'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "repetition42"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aaa'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "repetition44"
-options = ['escaped']
-pattern = '''((..)|(.)){1}'''
-input = '''aaa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition46"
-options = ['escaped']
-pattern = '''((..)|(.)){2}'''
-input = '''aaa'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "repetition47"
-options = ['escaped']
-pattern = '''((..)|(.)){3}'''
-input = '''aaa'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "repetition50"
-options = ['escaped']
-pattern = '''((..)|(.))*'''
-input = '''aaa'''
-matches = [[0, 3]]
-
-[[tests]]
-name = "repetition52"
-options = ['escaped']
-pattern = '''((..)|(.))'''
-input = '''aaaa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition53"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))'''
-input = '''aaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition54"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition56"
-options = ['escaped']
-pattern = '''((..)|(.)){1}'''
-input = '''aaaa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition57"
-options = ['escaped']
-pattern = '''((..)|(.)){2}'''
-input = '''aaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition59"
-options = ['escaped']
-pattern = '''((..)|(.)){3}'''
-input = '''aaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition61"
-options = ['escaped']
-pattern = '''((..)|(.))*'''
-input = '''aaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition63"
-options = ['escaped']
-pattern = '''((..)|(.))'''
-input = '''aaaaa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition64"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))'''
-input = '''aaaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition65"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aaaaa'''
-matches = [[0, 5]]
-
-[[tests]]
-name = "repetition67"
-options = ['escaped']
-pattern = '''((..)|(.)){1}'''
-input = '''aaaaa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition68"
-options = ['escaped']
-pattern = '''((..)|(.)){2}'''
-input = '''aaaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition70"
-options = ['escaped']
-pattern = '''((..)|(.)){3}'''
-input = '''aaaaa'''
-matches = [[0, 5]]
-
-[[tests]]
-name = "repetition73"
-options = ['escaped']
-pattern = '''((..)|(.))*'''
-input = '''aaaaa'''
-matches = [[0, 5]]
-
-[[tests]]
-name = "repetition75"
-options = ['escaped']
-pattern = '''((..)|(.))'''
-input = '''aaaaaa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition76"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))'''
-input = '''aaaaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition77"
-options = ['escaped']
-pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition79"
-options = ['escaped']
-pattern = '''((..)|(.)){1}'''
-input = '''aaaaaa'''
-matches = [[0, 2]]
-
-[[tests]]
-name = "repetition80"
-options = ['escaped']
-pattern = '''((..)|(.)){2}'''
-input = '''aaaaaa'''
-matches = [[0, 4]]
-
-[[tests]]
-name = "repetition81"
-options = ['escaped']
-pattern = '''((..)|(.)){3}'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
-[[tests]]
-name = "repetition83"
-options = ['escaped']
-pattern = '''((..)|(.))*'''
-input = '''aaaaaa'''
-matches = [[0, 6]]
-
diff --git a/vendor/regex-automata/data/tests/iter.toml b/vendor/regex-automata/data/tests/iter.toml
deleted file mode 100644
index 30abae86e..000000000
--- a/vendor/regex-automata/data/tests/iter.toml
+++ /dev/null
@@ -1,92 +0,0 @@
-[[tests]]
-name = "iter1"
-pattern = "a"
-input = "aaa"
-matches = [[0, 1], [1, 2], [2, 3]]
-
-[[tests]]
-name = "iter2"
-pattern = "a"
-input = "aba"
-matches = [[0, 1], [2, 3]]
-
-[[tests]]
-name = "iter-empty1"
-pattern = ''
-input = ''
-matches = [[0, 0]]
-
-[[tests]]
-name = "iter-empty2"
-pattern = ''
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty3"
-pattern = '()'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty4"
-pattern = '()*'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty5"
-pattern = '()+'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty6"
-pattern = '()?'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty7"
-pattern = '()()'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty8"
-pattern = '()+|z'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty9"
-pattern = 'z|()+'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty10"
-pattern = '()+|b'
-input = 'abc'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-name = "iter-empty11"
-pattern = 'b|()+'
-input = 'abc'
-matches = [[0, 0], [1, 2], [3, 3]]
-
-
-[[tests]]
-options = ["anchored"]
-name = "iter-anchored1"
-pattern = "a"
-input = "a"
-matches = [[0, 1]]
-
-[[tests]]
-options = ["anchored"]
-name = "iter-anchored2"
-pattern = "a"
-input = "aa"
-matches = [[0, 1]]
diff --git a/vendor/regex-automata/data/tests/no-unicode.toml b/vendor/regex-automata/data/tests/no-unicode.toml
deleted file mode 100644
index 16e02b426..000000000
--- a/vendor/regex-automata/data/tests/no-unicode.toml
+++ /dev/null
@@ -1,138 +0,0 @@
-[[tests]]
-name = "invalid-utf8-literal1"
-options = ["escaped", "invalid-utf8", "no-unicode"]
-pattern = '\xFF'
-input = '\xFF'
-matches = [[0, 1]]
-
-
-[[tests]]
-name = "no-unicode-mixed"
-options = ["escaped", "invalid-utf8"]
-pattern = '(.+)(?-u)(.+)'
-input = '\xCE\x93\xCE\x94\xFF'
-matches = [[0, 5]]
-
-
-[[tests]]
-name = "no-unicode-case1"
-options = ["case-insensitive", "no-unicode"]
-pattern = "a"
-input = "A"
-matches = [[0, 1]]
-
-[[tests]]
-name = "no-unicode-case2"
-options = ["case-insensitive", "no-unicode"]
-pattern = "[a-z]+"
-input = "AaAaA"
-matches = [[0, 5]]
-
-[[tests]]
-name = "no-unicode-case3"
-options = ["case-insensitive"]
-pattern = "[a-z]+"
-input = "aA\u212AaA"
-matches = [[0, 7]]
-
-[[tests]]
-name = "no-unicode-case4"
-options = ["case-insensitive", "no-unicode"]
-pattern = "[a-z]+"
-input = "aA\u212AaA"
-matches = [[0, 2]]
-
-
-[[tests]]
-name = "no-unicode-negate1"
-options = []
-pattern = "[^a]"
-input = "δ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "no-unicode-negate2"
-options = ["no-unicode", "invalid-utf8"]
-pattern = "[^a]"
-input = "δ"
-matches = [[0, 1]]
-
-
-[[tests]]
-name = "no-unicode-dotstar-prefix1"
-options = ["escaped", "no-unicode", "invalid-utf8"]
-pattern = "a"
-input = '\xFFa'
-matches = [[1, 2]]
-
-[[tests]]
-name = "no-unicode-dotstar-prefix2"
-options = ["escaped", "invalid-utf8"]
-pattern = "a"
-input = '\xFFa'
-matches = [[1, 2]]
-
-
-[[tests]]
-name = "no-unicode-null-bytes1"
-options = ["escaped", "no-unicode", "invalid-utf8"]
-pattern = '[^\x00]+\x00'
-input = 'foo\x00'
-matches = [[0, 4]]
-
-
-[[tests]]
-name = "no-unicode1"
-options = ["no-unicode"]
-pattern = '\w+'
-input = "aδ"
-matches = [[0, 1]]
-
-[[tests]]
-name = "no-unicode2"
-options = []
-pattern = '\w+'
-input = "aδ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "no-unicode3"
-options = ["no-unicode"]
-pattern = '\d+'
-input = "1२३9"
-matches = [[0, 1]]
-
-[[tests]]
-name = "no-unicode4"
-pattern = '\d+'
-input = "1२३9"
-matches = [[0, 8]]
-
-[[tests]]
-name = "no-unicode5"
-options = ["no-unicode"]
-pattern = '\s+'
-input = " \u1680"
-matches = [[0, 1]]
-
-[[tests]]
-name = "no-unicode6"
-pattern = '\s+'
-input = " \u1680"
-matches = [[0, 4]]
-
-
-[[tests]]
-# See: https://github.com/rust-lang/regex/issues/484
-name = "no-unicode-iter1"
-pattern = ''
-input = "☃"
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
-
-[[tests]]
-# See: https://github.com/rust-lang/regex/issues/484
-options = ['escaped']
-name = "no-unicode-iter2"
-pattern = ''
-input = 'b\xFFr'
-matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
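
The no-unicode-case tests hinge on Unicode simple case folding: U+212A KELVIN
SIGN folds to 'k', so with Unicode enabled (?i)[a-z]+ also consumes its three
UTF-8 bytes, while with "no-unicode" the match stops after the two ASCII
bytes. A sketch mirroring no-unicode-case3 with the top-level regex crate
(illustrative; not part of this diff):

    use regex::Regex;

    fn main() {
        // Unicode case folding is on by default, so U+212A (KELVIN SIGN,
        // three bytes in UTF-8) is swallowed by [a-z] under (?i).
        let re = Regex::new(r"(?i)[a-z]+").unwrap();
        let m = re.find("aA\u{212A}aA").unwrap();
        assert_eq!((m.start(), m.end()), (0, 7));
    }
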
diff --git a/vendor/regex-automata/data/tests/unicode.toml b/vendor/regex-automata/data/tests/unicode.toml
deleted file mode 100644
index 845393f28..000000000
--- a/vendor/regex-automata/data/tests/unicode.toml
+++ /dev/null
@@ -1,489 +0,0 @@
-[[tests]]
-name = "unicode-literal1"
-pattern = '☃'
-input = "☃"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-literal2"
-pattern = '☃+'
-input = "☃"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-literal3"
-options = ["case-insensitive"]
-pattern = '☃+'
-input = "☃"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-literal4"
-options = ["case-insensitive"]
-pattern = 'Δ'
-input = "δ"
-matches = [[0, 2]]
-
-
-[[tests]]
-name = "unicode-class1"
-pattern = '[☃Ⅰ]+'
-input = "☃"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class2"
-pattern = '\pN'
-input = "Ⅰ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class3"
-pattern = '\pN+'
-input = "Ⅰ1Ⅱ2"
-matches = [[0, 8]]
-
-[[tests]]
-name = "unicode-class4"
-pattern = '\PN+'
-input = "abⅠ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "unicode-class5"
-pattern = '[\PN]+'
-input = "abⅠ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "unicode-class6"
-pattern = '[^\PN]+'
-input = "abⅠ"
-matches = [[2, 5]]
-
-[[tests]]
-name = "unicode-class7"
-pattern = '\p{Lu}+'
-input = "ΛΘΓΔα"
-matches = [[0, 8]]
-
-[[tests]]
-name = "unicode-class8"
-options = ["case-insensitive"]
-pattern = '\p{Lu}+'
-input = "ΛΘΓΔα"
-matches = [[0, 10]]
-
-[[tests]]
-name = "unicode-class9"
-pattern = '\pL+'
-input = "ΛΘΓΔα"
-matches = [[0, 10]]
-
-[[tests]]
-name = "unicode-class10"
-pattern = '\p{Ll}+'
-input = "ΛΘΓΔα"
-matches = [[8, 10]]
-
-
-[[tests]]
-name = "unicode-perl1"
-pattern = '\w+'
-input = "dδd"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-perl2"
-pattern = '\w+'
-input = "⥡"
-matches = []
-
-[[tests]]
-name = "unicode-perl3"
-pattern = '\W+'
-input = "⥡"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-perl4"
-pattern = '\d+'
-input = "1२३9"
-matches = [[0, 8]]
-
-[[tests]]
-name = "unicode-perl5"
-pattern = '\d+'
-input = "Ⅱ"
-matches = []
-
-[[tests]]
-name = "unicode-perl6"
-pattern = '\D+'
-input = "Ⅱ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-perl7"
-pattern = '\s+'
-input = " "
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-perl8"
-pattern = '\s+'
-input = "☃"
-matches = []
-
-[[tests]]
-name = "unicode-perl9"
-pattern = '\S+'
-input = "☃"
-matches = [[0, 3]]
-
-
-[[tests]]
-name = "unicode-class-gencat1"
-pattern = '\p{Cased_Letter}'
-input = "A"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat2"
-pattern = '\p{Close_Punctuation}'
-input = "❯"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat3"
-pattern = '\p{Connector_Punctuation}'
-input = "⁀"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat4"
-pattern = '\p{Control}'
-input = "\u009F"
-matches = [[0, 2]]
-
-[[tests]]
-name = "unicode-class-gencat5"
-pattern = '\p{Currency_Symbol}'
-input = "£"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat6"
-pattern = '\p{Dash_Punctuation}'
-input = "〰"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat7"
-pattern = '\p{Decimal_Number}'
-input = "𑓙"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat8"
-pattern = '\p{Enclosing_Mark}'
-input = "\uA672"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat9"
-pattern = '\p{Final_Punctuation}'
-input = "⸡"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat10"
-pattern = '\p{Format}'
-input = "\U000E007F"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat11"
-pattern = '\p{Initial_Punctuation}'
-input = "⸜"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat12"
-pattern = '\p{Letter}'
-input = "Έ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "unicode-class-gencat13"
-pattern = '\p{Letter_Number}'
-input = "ↂ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat14"
-pattern = '\p{Line_Separator}'
-input = "\u2028"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat15"
-pattern = '\p{Lowercase_Letter}'
-input = "ϛ"
-matches = [[0, 2]]
-
-[[tests]]
-name = "unicode-class-gencat16"
-pattern = '\p{Mark}'
-input = "\U000E01EF"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat17"
-pattern = '\p{Math}'
-input = "⋿"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat18"
-pattern = '\p{Modifier_Letter}'
-input = "𖭃"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat19"
-pattern = '\p{Modifier_Symbol}'
-input = "🏿"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat20"
-pattern = '\p{Nonspacing_Mark}'
-input = "\U0001E94A"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat21"
-pattern = '\p{Number}'
-input = "⓿"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat22"
-pattern = '\p{Open_Punctuation}'
-input = "⦅"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat23"
-pattern = '\p{Other}'
-input = "\u0BC9"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat24"
-pattern = '\p{Other_Letter}'
-input = "ꓷ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat25"
-pattern = '\p{Other_Number}'
-input = "㉏"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat26"
-pattern = '\p{Other_Punctuation}'
-input = "𞥞"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat27"
-pattern = '\p{Other_Symbol}'
-input = "⅌"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat28"
-pattern = '\p{Paragraph_Separator}'
-input = "\u2029"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat29"
-pattern = '\p{Private_Use}'
-input = "\U0010FFFD"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat30"
-pattern = '\p{Punctuation}'
-input = "𑁍"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat31"
-pattern = '\p{Separator}'
-input = "\u3000"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat32"
-pattern = '\p{Space_Separator}'
-input = "\u205F"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat33"
-pattern = '\p{Spacing_Mark}'
-input = "\U00016F7E"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat34"
-pattern = '\p{Symbol}'
-input = "⯈"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat35"
-pattern = '\p{Titlecase_Letter}'
-input = "ῼ"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gencat36"
-pattern = '\p{Unassigned}'
-input = "\U0010FFFF"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gencat37"
-pattern = '\p{Uppercase_Letter}'
-input = "Ꝋ"
-matches = [[0, 3]]
-
-
-[[tests]]
-name = "unicode-class-emoji1"
-pattern = '\p{Emoji}'
-input = "\u23E9"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-emoji2"
-pattern = '\p{emoji}'
-input = "\U0001F21A"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-emoji3"
-pattern = '\p{extendedpictographic}'
-input = "\U0001FA6E"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-emoji4"
-pattern = '\p{extendedpictographic}'
-input = "\U0001FFFD"
-matches = [[0, 4]]
-
-
-[[tests]]
-name = "unicode-class-gcb1"
-pattern = '\p{grapheme_cluster_break=prepend}'
-input = "\U00011D46"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gcb2"
-pattern = '\p{gcb=regional_indicator}'
-input = "\U0001F1E6"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gcb3"
-pattern = '\p{gcb=ri}'
-input = "\U0001F1E7"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gcb4"
-pattern = '\p{regionalindicator}'
-input = "\U0001F1FF"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-gcb5"
-pattern = '\p{gcb=lvt}'
-input = "\uC989"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-gcb6"
-pattern = '\p{gcb=zwj}'
-input = "\u200D"
-matches = [[0, 3]]
-
-
-[[tests]]
-name = "unicode-class-word-break1"
-pattern = '\p{word_break=Hebrew_Letter}'
-input = "\uFB46"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-word-break2"
-pattern = '\p{wb=hebrewletter}'
-input = "\uFB46"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-word-break3"
-pattern = '\p{wb=ExtendNumLet}'
-input = "\uFF3F"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-word-break4"
-pattern = '\p{wb=WSegSpace}'
-input = "\u3000"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-word-break5"
-pattern = '\p{wb=numeric}'
-input = "\U0001E950"
-matches = [[0, 4]]
-
-
-[[tests]]
-name = "unicode-class-sentence-break1"
-pattern = '\p{sentence_break=Lower}'
-input = "\u0469"
-matches = [[0, 2]]
-
-[[tests]]
-name = "unicode-class-sentence-break2"
-pattern = '\p{sb=lower}'
-input = "\u0469"
-matches = [[0, 2]]
-
-[[tests]]
-name = "unicode-class-sentence-break3"
-pattern = '\p{sb=Close}'
-input = "\uFF60"
-matches = [[0, 3]]
-
-[[tests]]
-name = "unicode-class-sentence-break4"
-pattern = '\p{sb=Close}'
-input = "\U0001F677"
-matches = [[0, 4]]
-
-[[tests]]
-name = "unicode-class-sentence-break5"
-pattern = '\p{sb=SContinue}'
-input = "\uFF64"
-matches = [[0, 3]]
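
Throughout these suites, matches spans are byte offsets into the UTF-8
encoded input, not character indices; that is why a BMP scalar such as ☃
reports [[0, 3]] while an astral scalar reports [[0, 4]]. A quick check of
the encoded lengths involved:

    fn main() {
        // `matches` spans count UTF-8 bytes, not chars.
        assert_eq!("☃".len(), 3); // U+2603 encodes to 3 bytes
        assert_eq!("δ".len(), 2); // U+03B4 encodes to 2 bytes
        assert_eq!("\u{1F1E6}".len(), 4); // astral scalars encode to 4 bytes
    }
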
diff --git a/vendor/regex-automata/src/byteorder.rs b/vendor/regex-automata/src/byteorder.rs
deleted file mode 100644
index e909f93a2..000000000
--- a/vendor/regex-automata/src/byteorder.rs
+++ /dev/null
@@ -1,76 +0,0 @@
-use core::convert::TryInto;
-
-pub trait ByteOrder {
- fn read_u16(buf: &[u8]) -> u16;
- fn read_u32(buf: &[u8]) -> u32;
- fn read_u64(buf: &[u8]) -> u64;
- fn read_uint(buf: &[u8], nbytes: usize) -> u64;
- fn write_u16(buf: &mut [u8], n: u16);
- fn write_u32(buf: &mut [u8], n: u32);
- fn write_u64(buf: &mut [u8], n: u64);
- fn write_uint(buf: &mut [u8], n: u64, nbytes: usize);
-}
-
-pub enum BigEndian {}
-pub enum LittleEndian {}
-pub enum NativeEndian {}
-
-macro_rules! impl_endian {
- ($t:ty, $from_endian:ident, $to_endian:ident) => {
- impl ByteOrder for $t {
- #[inline]
- fn read_u16(buf: &[u8]) -> u16 {
- u16::$from_endian(buf[0..2].try_into().unwrap())
- }
-
- #[inline]
- fn read_u32(buf: &[u8]) -> u32 {
- u32::$from_endian(buf[0..4].try_into().unwrap())
- }
-
- #[inline]
- fn read_u64(buf: &[u8]) -> u64 {
- u64::$from_endian(buf[0..8].try_into().unwrap())
- }
-
- #[inline]
- fn read_uint(buf: &[u8], nbytes: usize) -> u64 {
- let mut dst = [0u8; 8];
- dst[..nbytes].copy_from_slice(&buf[..nbytes]);
- u64::$from_endian(dst)
- }
-
- #[inline]
- fn write_u16(buf: &mut [u8], n: u16) {
- buf[0..2].copy_from_slice(&n.$to_endian()[..]);
- }
-
- #[inline]
- fn write_u32(buf: &mut [u8], n: u32) {
- buf[0..4].copy_from_slice(&n.$to_endian()[..]);
- }
-
- #[inline]
- fn write_u64(buf: &mut [u8], n: u64) {
- buf[0..8].copy_from_slice(&n.$to_endian()[..]);
- }
-
- #[inline]
- fn write_uint(buf: &mut [u8], n: u64, nbytes: usize) {
- buf[..nbytes].copy_from_slice(&n.$to_endian()[..nbytes]);
- }
- }
- };
-}
-
-impl_endian! {
- BigEndian, from_be_bytes, to_be_bytes
-}
-
-impl_endian! {
- LittleEndian, from_le_bytes, to_le_bytes
-}
-
-impl_endian! {
- NativeEndian, from_ne_bytes, to_ne_bytes
-}
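
The module above is a small self-contained stand-in for the byteorder crate,
built on the standard from/to_{be,le,ne}_bytes conversions. A usage sketch of
the trait exactly as defined above (an internal API, shown for illustration):

    fn demo() {
        let mut buf = [0u8; 8];
        BigEndian::write_u32(&mut buf, 0xDEAD_BEEF);
        assert_eq!(&buf[..4], &[0xDE, 0xAD, 0xBE, 0xEF]);
        assert_eq!(BigEndian::read_u32(&buf), 0xDEAD_BEEF);

        // The uint methods handle widths that are not a power of two.
        LittleEndian::write_uint(&mut buf, 0x01_02_03, 3);
        assert_eq!(LittleEndian::read_uint(&buf, 3), 0x01_02_03);
    }
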
diff --git a/vendor/regex-automata/src/classes.rs b/vendor/regex-automata/src/classes.rs
deleted file mode 100644
index 143908b3a..000000000
--- a/vendor/regex-automata/src/classes.rs
+++ /dev/null
@@ -1,271 +0,0 @@
-use core::fmt;
-
-/// A representation of byte-oriented equivalence classes.
-///
-/// This is used in a DFA to reduce the size of the transition table. This can
-/// have a particularly large impact not only on the total size of a dense DFA,
-/// but also on compile times.
-#[derive(Clone, Copy)]
-pub struct ByteClasses([u8; 256]);
-
-impl ByteClasses {
- /// Creates a new set of equivalence classes where all bytes are mapped to
- /// the same class.
- pub fn empty() -> ByteClasses {
- ByteClasses([0; 256])
- }
-
- /// Creates a new set of equivalence classes where each byte belongs to
- /// its own equivalence class.
- pub fn singletons() -> ByteClasses {
- let mut classes = ByteClasses::empty();
- for i in 0..256 {
- classes.set(i as u8, i as u8);
- }
- classes
- }
-
- /// Copies the byte classes given. The given slice must have length 0 or
- /// length 256. Slices of length 0 are treated as singletons (every byte
- /// is its own class).
- pub fn from_slice(slice: &[u8]) -> ByteClasses {
- assert!(slice.is_empty() || slice.len() == 256);
-
- if slice.is_empty() {
- ByteClasses::singletons()
- } else {
- let mut classes = ByteClasses::empty();
- for (b, &class) in slice.iter().enumerate() {
- classes.set(b as u8, class);
- }
- classes
- }
- }
-
- /// Set the equivalence class for the given byte.
- #[inline]
- pub fn set(&mut self, byte: u8, class: u8) {
- self.0[byte as usize] = class;
- }
-
- /// Get the equivalence class for the given byte.
- #[inline]
- pub fn get(&self, byte: u8) -> u8 {
- self.0[byte as usize]
- }
-
- /// Get the equivalence class for the given byte while forcefully
- /// eliding bounds checks.
- #[inline]
- pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
- *self.0.get_unchecked(byte as usize)
- }
-
- /// Return the total number of elements in the alphabet represented by
- /// these equivalence classes. Equivalently, this returns the total number
- /// of equivalence classes.
- #[inline]
- pub fn alphabet_len(&self) -> usize {
- self.0[255] as usize + 1
- }
-
-    /// Returns true if and only if every byte in this set maps to its own
-    /// equivalence class. Equivalently, there are 256 equivalence classes
-    /// and each class contains exactly one byte.
- #[inline]
- pub fn is_singleton(&self) -> bool {
- self.alphabet_len() == 256
- }
-
- /// Returns an iterator over a sequence of representative bytes from each
- /// equivalence class. Namely, this yields exactly N items, where N is
- /// equivalent to the number of equivalence classes. Each item is an
- /// arbitrary byte drawn from each equivalence class.
- ///
- /// This is useful when one is determinizing an NFA and the NFA's alphabet
- /// hasn't been converted to equivalence classes yet. Picking an arbitrary
- /// byte from each equivalence class then permits a full exploration of
- /// the NFA instead of using every possible byte value.
- #[cfg(feature = "std")]
- pub fn representatives(&self) -> ByteClassRepresentatives {
- ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
- }
-
- /// Returns all of the bytes in the given equivalence class.
- ///
- /// The second element in the tuple indicates the number of elements in
- /// the array.
- fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
- let (mut array, mut len) = ([0; 256], 0);
- for b in 0..256 {
- if self.get(b as u8) == equiv {
- array[len] = b as u8;
- len += 1;
- }
- }
- (array, len)
- }
-}
-
-impl fmt::Debug for ByteClasses {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- if self.is_singleton() {
- write!(f, "ByteClasses({{singletons}})")
- } else {
- write!(f, "ByteClasses(")?;
- for equiv in 0..self.alphabet_len() {
- let (members, len) = self.elements(equiv as u8);
- write!(f, "{} => {:?}", equiv, &members[..len])?;
- }
- write!(f, ")")
- }
- }
-}
-
-/// An iterator over representative bytes from each equivalence class.
-#[cfg(feature = "std")]
-#[derive(Debug)]
-pub struct ByteClassRepresentatives<'a> {
- classes: &'a ByteClasses,
- byte: usize,
- last_class: Option<u8>,
-}
-
-#[cfg(feature = "std")]
-impl<'a> Iterator for ByteClassRepresentatives<'a> {
- type Item = u8;
-
- fn next(&mut self) -> Option<u8> {
- while self.byte < 256 {
- let byte = self.byte as u8;
- let class = self.classes.get(byte);
- self.byte += 1;
-
- if self.last_class != Some(class) {
- self.last_class = Some(class);
- return Some(byte);
- }
- }
- None
- }
-}
-
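
Combined with the ByteClassSet defined just below, the representatives
iterator yields one arbitrary byte per equivalence class. A sketch matching
the byte_classes test at the bottom of this file:

    fn demo() {
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        // Three classes: bytes below 'a', the range a-z, bytes above 'z'.
        let reps: Vec<u8> = classes.representatives().collect();
        assert_eq!(reps, vec![0x00, b'a', b'{']);
    }
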
-/// A byte class set keeps track of an *approximation* of equivalence classes
-/// of bytes during NFA construction. That is, no byte in an equivalence
-/// class can discriminate between a match and a non-match.
-///
-/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
-/// same equivalence class because it never matters whether an `a` or a `b` is
-/// seen, and no combination of `a`s and `b`s in the text can discriminate
-/// a match.
-///
-/// Note though that this does not compute the minimal set of equivalence
-/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
-/// same equivalence class for the same reason that `a` and `b` are in the
-/// same equivalence class in the aforementioned regex. However, in this
-/// implementation, `a` and `c` are put into distinct equivalence classes.
-/// The reason for this is implementation complexity. In the future, we should
-/// endeavor to compute the minimal equivalence classes since they can have a
-/// rather large impact on the size of the DFA.
-///
-/// The representation here is 256 booleans, all initially set to false. Each
-/// boolean maps to its corresponding byte based on position. A `true` value
-/// indicates the end of an equivalence class, where its corresponding byte
-/// and all of the bytes corresponding to all previous contiguous `false`
-/// values are in the same equivalence class.
-///
-/// This particular representation only permits contiguous ranges of bytes to
-/// be in the same equivalence class, which means that we can never discover
-/// the true minimal set of equivalence classes.
-#[cfg(feature = "std")]
-#[derive(Debug)]
-pub struct ByteClassSet(Vec<bool>);
-
-#[cfg(feature = "std")]
-impl ByteClassSet {
- /// Create a new set of byte classes where all bytes are part of the same
- /// equivalence class.
- pub fn new() -> Self {
- ByteClassSet(vec![false; 256])
- }
-
-    /// Indicate that the given range of bytes (inclusive) can discriminate a
-    /// match between it and all other bytes outside of the range.
- pub fn set_range(&mut self, start: u8, end: u8) {
- debug_assert!(start <= end);
- if start > 0 {
- self.0[start as usize - 1] = true;
- }
- self.0[end as usize] = true;
- }
-
- /// Convert this boolean set to a map that maps all byte values to their
- /// corresponding equivalence class. The last mapping indicates the largest
- /// equivalence class identifier (which is never bigger than 255).
- pub fn byte_classes(&self) -> ByteClasses {
- let mut classes = ByteClasses::empty();
- let mut class = 0u8;
- let mut i = 0;
- loop {
- classes.set(i as u8, class as u8);
- if i >= 255 {
- break;
- }
- if self.0[i] {
- class = class.checked_add(1).unwrap();
- }
- i += 1;
- }
- classes
- }
-}
-
-#[cfg(test)]
-mod tests {
- #[cfg(feature = "std")]
- #[test]
- fn byte_classes() {
- use super::ByteClassSet;
-
- let mut set = ByteClassSet::new();
- set.set_range(b'a', b'z');
-
- let classes = set.byte_classes();
- assert_eq!(classes.get(0), 0);
- assert_eq!(classes.get(1), 0);
- assert_eq!(classes.get(2), 0);
- assert_eq!(classes.get(b'a' - 1), 0);
- assert_eq!(classes.get(b'a'), 1);
- assert_eq!(classes.get(b'm'), 1);
- assert_eq!(classes.get(b'z'), 1);
- assert_eq!(classes.get(b'z' + 1), 2);
- assert_eq!(classes.get(254), 2);
- assert_eq!(classes.get(255), 2);
-
- let mut set = ByteClassSet::new();
- set.set_range(0, 2);
- set.set_range(4, 6);
- let classes = set.byte_classes();
- assert_eq!(classes.get(0), 0);
- assert_eq!(classes.get(1), 0);
- assert_eq!(classes.get(2), 0);
- assert_eq!(classes.get(3), 1);
- assert_eq!(classes.get(4), 2);
- assert_eq!(classes.get(5), 2);
- assert_eq!(classes.get(6), 2);
- assert_eq!(classes.get(7), 3);
- assert_eq!(classes.get(255), 3);
- }
-
- #[cfg(feature = "std")]
- #[test]
- fn full_byte_classes() {
- use super::ByteClassSet;
-
- let mut set = ByteClassSet::new();
- for i in 0..256u16 {
- set.set_range(i as u8, i as u8);
- }
- assert_eq!(set.byte_classes().alphabet_len(), 256);
- }
-}
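
The payoff of collapsing bytes into equivalence classes is a proportionally
smaller dense transition table: each state stores alphabet_len() transitions
instead of 256. A back-of-the-envelope sketch, with a hypothetical state
count and an assumed 8-byte usize:

    fn table_bytes(states: usize, alphabet_len: usize) -> usize {
        states * alphabet_len * core::mem::size_of::<usize>()
    }

    fn main() {
        // With the three classes derived for [a-z]+ above, a 1,000-state
        // table shrinks from 256 to 3 columns, roughly an 85x reduction.
        assert_eq!(table_bytes(1_000, 256), 2_048_000);
        assert_eq!(table_bytes(1_000, 3), 24_000);
    }
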
diff --git a/vendor/regex-automata/src/codegen.rs b/vendor/regex-automata/src/codegen.rs
deleted file mode 100644
index b2aacbb46..000000000
--- a/vendor/regex-automata/src/codegen.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-// This module is unused. It was written as an experiment to get a ballpark
-// idea of what state machines look like when translated to Rust code, and
-// in particular, an idea of how much code it generates. The implementation
-// below isn't optimal with respect to size, but the result wasn't exactly
-// small. At some point, we should pursue building this out beyond
-// experimentation, and in particular, probably provide a command line tool
-// and/or a macro. It's a fair bit of work, so I abandoned it for the initial
-// release. ---AG
-
-use std::collections::HashMap;
-use std::io::Write;
-
-use dense::DFA;
-use state_id::StateID;
-
-macro_rules! wstr {
- ($($tt:tt)*) => { write!($($tt)*).unwrap() }
-}
-
-macro_rules! wstrln {
- ($($tt:tt)*) => { writeln!($($tt)*).unwrap() }
-}
-
-pub fn is_match_forward<S: StateID>(dfa: &DFA<S>) -> String {
- let names = state_variant_names(dfa);
-
- let mut buf = vec![];
- wstrln!(buf, "pub fn is_match(input: &[u8]) -> bool {{");
- if dfa.is_match_state(dfa.start()) {
- wstrln!(buf, " return true;");
- wstrln!(buf, "}}");
- return String::from_utf8(buf).unwrap();
- }
-
- wstrln!(buf, "{}", state_enum_def(dfa, &names));
-
- wstrln!(buf, " let mut state = {};", names[&dfa.start()]);
- wstrln!(buf, " for &b in input.iter() {{");
- wstrln!(buf, " state = match state {{");
- for (id, s) in dfa.iter() {
- if dfa.is_match_state(id) {
- continue;
- }
-
- wstrln!(buf, " {} => {{", &names[&id]);
- wstrln!(buf, " match b {{");
- for (start, end, next_id) in s.sparse_transitions() {
- if dfa.is_match_state(next_id) {
- wstrln!(buf, " {:?}...{:?} => return true,", start, end);
- } else {
- if start == end {
- wstrln!(buf, " {:?} => {},", start, &names[&next_id]);
- } else {
- wstrln!(buf, " {:?}...{:?} => {},", start, end, &names[&next_id]);
- }
- }
- }
- wstrln!(buf, " _ => S::S0,");
- wstrln!(buf, " }}");
- wstrln!(buf, " }}");
- }
- wstrln!(buf, " }};");
- wstrln!(buf, " }}");
-
- wstrln!(buf, " false");
- wstrln!(buf, "}}");
- String::from_utf8(buf).unwrap()
-}
-
-fn state_enum_def<S: StateID>(
- dfa: &DFA<S>,
- variant_names: &HashMap<S, String>,
-) -> String {
- let mut buf = vec![];
- wstrln!(buf, " #[derive(Clone, Copy)]");
- wstr!(buf, " enum S {{");
-
- let mut i = 0;
- for (id, _) in dfa.iter() {
- if dfa.is_match_state(id) {
- continue;
- }
- if i % 10 == 0 {
- wstr!(buf, "\n ");
- }
- let name = format!("S{}", id.to_usize());
- wstr!(buf, " {},", name);
- i += 1;
- }
- wstr!(buf, "\n");
- wstrln!(buf, " }}");
- String::from_utf8(buf).unwrap()
-}
-
-fn state_variant_names<S: StateID>(dfa: &DFA<S>) -> HashMap<S, String> {
- let mut variants = HashMap::new();
- for (id, _) in dfa.iter() {
- if dfa.is_match_state(id) {
- continue;
- }
- variants.insert(id, format!("S::S{}", id.to_usize()));
- }
- variants
-}
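
The wstrln! calls above fix the shape of the emitted source: a copyable state
enum plus a byte-at-a-time loop in which any transition into a match state
returns early. Roughly, hand-reconstructed for a toy two-state DFA (not
literal output of this module):

    pub fn is_match(input: &[u8]) -> bool {
        #[derive(Clone, Copy)]
        enum S { S0, S1 }

        let mut state = S::S1; // start state
        for &b in input.iter() {
            state = match state {
                S::S0 => S::S0, // dead state loops on itself
                S::S1 => match b {
                    b'0'..=b'9' => return true, // edge into a match state
                    _ => S::S0,
                },
            };
        }
        false
    }
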
diff --git a/vendor/regex-automata/src/dense.rs b/vendor/regex-automata/src/dense.rs
deleted file mode 100644
index ed4d1b683..000000000
--- a/vendor/regex-automata/src/dense.rs
+++ /dev/null
@@ -1,2332 +0,0 @@
-#[cfg(feature = "std")]
-use core::fmt;
-#[cfg(feature = "std")]
-use core::iter;
-use core::mem;
-use core::slice;
-
-#[cfg(feature = "std")]
-use byteorder::{BigEndian, LittleEndian};
-use byteorder::{ByteOrder, NativeEndian};
-#[cfg(feature = "std")]
-use regex_syntax::ParserBuilder;
-
-use classes::ByteClasses;
-#[cfg(feature = "std")]
-use determinize::Determinizer;
-use dfa::DFA;
-#[cfg(feature = "std")]
-use error::{Error, Result};
-#[cfg(feature = "std")]
-use minimize::Minimizer;
-#[cfg(feature = "std")]
-use nfa::{self, NFA};
-#[cfg(feature = "std")]
-use sparse::SparseDFA;
-use state_id::{dead_id, StateID};
-#[cfg(feature = "std")]
-use state_id::{
- next_state_id, premultiply_overflow_error, write_state_id_bytes,
-};
-
-/// The size of the alphabet in a standard DFA.
-///
-/// Specifically, this length controls the number of transitions present in
-/// each DFA state. However, when the byte class optimization is enabled,
-/// then each DFA maps the space of all possible 256 byte values to at most
-/// 256 distinct equivalence classes. In this case, the number of distinct
-/// equivalence classes corresponds to the internal alphabet of the DFA, in the
-/// sense that each DFA state has a number of transitions equal to the number
-/// of equivalence classes despite supporting matching on all possible byte
-/// values.
-const ALPHABET_LEN: usize = 256;
-
-/// Masks used in serialization of DFAs.
-pub(crate) const MASK_PREMULTIPLIED: u16 = 0b0000_0000_0000_0001;
-pub(crate) const MASK_ANCHORED: u16 = 0b0000_0000_0000_0010;
-
-/// A dense table-based deterministic finite automaton (DFA).
-///
-/// A dense DFA represents the core matching primitive in this crate. That is,
-/// logically, all DFAs have a single start state, one or more match states
-/// and a transition table that maps the current state and the current byte of
-/// input to the next state. A DFA can use this information to implement fast
-/// searching. In particular, the use of a dense DFA generally makes the trade
-/// off that match speed is the most valuable characteristic, even if building
-/// the regex may take significant time *and* space. As such, the processing
-/// of every byte of input is done with a small constant number of operations
-/// that does not vary with the pattern, its size or the size of the alphabet.
-/// If your needs don't line up with this trade off, then a dense DFA may not
-/// be an adequate solution to your problem.
-///
-/// In contrast, a [sparse DFA](enum.SparseDFA.html) makes the opposite
-/// trade off: it uses less space but will execute a variable number of
-/// instructions per byte at match time, which makes it slower for matching.
-///
-/// A DFA can be built using the default configuration via the
-/// [`DenseDFA::new`](enum.DenseDFA.html#method.new) constructor. Otherwise,
-/// one can configure various aspects via the
-/// [`dense::Builder`](dense/struct.Builder.html).
-///
-/// A single DFA fundamentally supports the following operations:
-///
-/// 1. Detection of a match.
-/// 2. Location of the end of the first possible match.
-/// 3. Location of the end of the leftmost-first match.
-///
-/// A notable absence from the above list of capabilities is the location of
-/// the *start* of a match. In order to provide both the start and end of a
-/// match, *two* DFAs are required. This functionality is provided by a
-/// [`Regex`](struct.Regex.html), which can be built with its basic
-/// constructor, [`Regex::new`](struct.Regex.html#method.new), or with
-/// a [`RegexBuilder`](struct.RegexBuilder.html).
-///
-/// # State size
-///
-/// A `DenseDFA` has two type parameters, `T` and `S`. `T` corresponds to
-/// the type of the DFA's transition table while `S` corresponds to the
-/// representation used for the DFA's state identifiers as described by the
-/// [`StateID`](trait.StateID.html) trait. This type parameter is typically
-/// `usize`, but other valid choices provided by this crate include `u8`,
-/// `u16`, `u32` and `u64`. The primary reason for choosing a different state
-/// identifier representation than the default is to reduce the amount of
-/// memory used by a DFA. Note though, that if the chosen representation cannot
-/// accommodate the size of your DFA, then building the DFA will fail and
-/// return an error.
-///
-/// While the reduction in heap memory used by a DFA is one reason for choosing
-/// a smaller state identifier representation, another possible reason is to
-/// decrease the serialization size of a DFA, as returned by
-/// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian),
-/// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
-/// or
-/// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian).
-///
-/// The type of the transition table is typically either `Vec<S>` or `&[S]`,
-/// depending on where the transition table is stored.
-///
-/// # Variants
-///
-/// This DFA is defined as a non-exhaustive enumeration of different types of
-/// dense DFAs. All of these dense DFAs use the same internal representation
-/// for the transition table, but they vary in how the transition table is
-/// read. A DFA's specific variant depends on the configuration options set via
-/// [`dense::Builder`](dense/struct.Builder.html). The default variant is
-/// `PremultipliedByteClass`.
-///
-/// # The `DFA` trait
-///
-/// This type implements the [`DFA`](trait.DFA.html) trait, which means it
-/// can be used for searching. For example:
-///
-/// ```
-/// use regex_automata::{DFA, DenseDFA};
-///
-/// # fn example() -> Result<(), regex_automata::Error> {
-/// let dfa = DenseDFA::new("foo[0-9]+")?;
-/// assert_eq!(Some(8), dfa.find(b"foo12345"));
-/// # Ok(()) }; example().unwrap()
-/// ```
-///
-/// The `DFA` trait also provides an assortment of other lower level methods
-/// for DFAs, such as `start_state` and `next_state`. While these are correctly
-/// implemented, it is an anti-pattern to use them in performance sensitive
-/// code on the `DenseDFA` type directly. Namely, each implementation requires
-/// a branch to determine which type of dense DFA is being used. Instead,
-/// this branch should be pushed up a layer in the code since walking the
-/// transitions of a DFA is usually a hot path. If you do need to use these
-/// lower level methods in performance critical code, then you should match on
-/// the variants of this DFA and use each variant's implementation of the `DFA`
-/// trait directly.
-#[derive(Clone, Debug)]
-pub enum DenseDFA<T: AsRef<[S]>, S: StateID> {
- /// A standard DFA that does not use premultiplication or byte classes.
- Standard(Standard<T, S>),
- /// A DFA that shrinks its alphabet to a set of equivalence classes instead
- /// of using all possible byte values. Any two bytes belong to the same
- /// equivalence class if and only if they can be used interchangeably
- /// anywhere in the DFA while never discriminating between a match and a
- /// non-match.
- ///
- /// This type of DFA can result in significant space reduction with a very
- /// small match time performance penalty.
- ByteClass(ByteClass<T, S>),
- /// A DFA that premultiplies all of its state identifiers in its
- /// transition table. This saves an instruction per byte at match time
- /// which improves search performance.
- ///
- /// The only downside of premultiplication is that it may prevent one from
- /// using a smaller state identifier representation than you otherwise
- /// could.
- Premultiplied(Premultiplied<T, S>),
- /// The default configuration of a DFA, which uses byte classes and
- /// premultiplies its state identifiers.
- PremultipliedByteClass(PremultipliedByteClass<T, S>),
- /// Hints that destructuring should not be exhaustive.
- ///
- /// This enum may grow additional variants, so this makes sure clients
- /// don't count on exhaustive matching. (Otherwise, adding a new variant
- /// could break existing code.)
- #[doc(hidden)]
- __Nonexhaustive,
-}
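
The four concrete variants below differ only in how next_state indexes the
shared transition table. Distilled from the DFA impls later in this file,
with plain usize stand-ins for the S type parameter:

    // `alen` is the alphabet length; `classes` maps a byte to its class.
    fn standard(trans: &[usize], state: usize, b: u8) -> usize {
        trans[state * 256 + b as usize]
    }

    fn byte_class(
        trans: &[usize], alen: usize, classes: &[u8; 256], state: usize, b: u8,
    ) -> usize {
        trans[state * alen + classes[b as usize] as usize]
    }

    // Premultiplied ids are already scaled by the alphabet length, saving
    // the multiply on every byte of input.
    fn premultiplied(trans: &[usize], state: usize, b: u8) -> usize {
        trans[state + b as usize]
    }

    fn premultiplied_byte_class(
        trans: &[usize], classes: &[u8; 256], state: usize, b: u8,
    ) -> usize {
        trans[state + classes[b as usize] as usize]
    }
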
-
-impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
- /// Return the internal DFA representation.
- ///
- /// All variants share the same internal representation.
- fn repr(&self) -> &Repr<T, S> {
- match *self {
- DenseDFA::Standard(ref r) => &r.0,
- DenseDFA::ByteClass(ref r) => &r.0,
- DenseDFA::Premultiplied(ref r) => &r.0,
- DenseDFA::PremultipliedByteClass(ref r) => &r.0,
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-}
-
-#[cfg(feature = "std")]
-impl DenseDFA<Vec<usize>, usize> {
- /// Parse the given regular expression using a default configuration and
- /// return the corresponding DFA.
- ///
- /// The default configuration uses `usize` for state IDs, premultiplies
- /// them and reduces the alphabet size by splitting bytes into equivalence
- /// classes. The DFA is *not* minimized.
- ///
- /// If you want a non-default configuration, then use the
- /// [`dense::Builder`](dense/struct.Builder.html)
- /// to set your own configuration.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
- /// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn new(pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> {
- Builder::new().build(pattern)
- }
-}
-
-#[cfg(feature = "std")]
-impl<S: StateID> DenseDFA<Vec<S>, S> {
- /// Create a new empty DFA that never matches any input.
- ///
- /// # Example
- ///
- /// In order to build an empty DFA, callers must provide a type hint
- /// indicating their choice of state identifier representation.
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa: DenseDFA<Vec<usize>, usize> = DenseDFA::empty();
- /// assert_eq!(None, dfa.find(b""));
- /// assert_eq!(None, dfa.find(b"foo"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn empty() -> DenseDFA<Vec<S>, S> {
- Repr::empty().into_dense_dfa()
- }
-}
-
-impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
- /// Cheaply return a borrowed version of this dense DFA. Specifically, the
- /// DFA returned always uses `&[S]` for its transition table while keeping
- /// the same state identifier representation.
- pub fn as_ref<'a>(&'a self) -> DenseDFA<&'a [S], S> {
- match *self {
- DenseDFA::Standard(ref r) => {
- DenseDFA::Standard(Standard(r.0.as_ref()))
- }
- DenseDFA::ByteClass(ref r) => {
- DenseDFA::ByteClass(ByteClass(r.0.as_ref()))
- }
- DenseDFA::Premultiplied(ref r) => {
- DenseDFA::Premultiplied(Premultiplied(r.0.as_ref()))
- }
- DenseDFA::PremultipliedByteClass(ref r) => {
- let inner = PremultipliedByteClass(r.0.as_ref());
- DenseDFA::PremultipliedByteClass(inner)
- }
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
-    /// Return an owned version of this dense DFA. Specifically, the DFA
-    /// returned always uses `Vec<S>` for its transition table while keeping
-    /// the same state identifier representation.
-    ///
-    /// Effectively, this returns a dense DFA whose transition table lives
-    /// on the heap.
- #[cfg(feature = "std")]
- pub fn to_owned(&self) -> DenseDFA<Vec<S>, S> {
- match *self {
- DenseDFA::Standard(ref r) => {
- DenseDFA::Standard(Standard(r.0.to_owned()))
- }
- DenseDFA::ByteClass(ref r) => {
- DenseDFA::ByteClass(ByteClass(r.0.to_owned()))
- }
- DenseDFA::Premultiplied(ref r) => {
- DenseDFA::Premultiplied(Premultiplied(r.0.to_owned()))
- }
- DenseDFA::PremultipliedByteClass(ref r) => {
- let inner = PremultipliedByteClass(r.0.to_owned());
- DenseDFA::PremultipliedByteClass(inner)
- }
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- /// Returns the memory usage, in bytes, of this DFA.
- ///
- /// The memory usage is computed based on the number of bytes used to
- /// represent this DFA's transition table. This corresponds to heap memory
- /// usage.
- ///
- /// This does **not** include the stack size used up by this DFA. To
-    /// compute that, use `std::mem::size_of::<DenseDFA>()`.
- pub fn memory_usage(&self) -> usize {
- self.repr().memory_usage()
- }
-}
-
-/// Routines for converting a dense DFA to other representations, such as
-/// sparse DFAs, smaller state identifiers or raw bytes suitable for persistent
-/// storage.
-#[cfg(feature = "std")]
-impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> {
- /// Convert this dense DFA to a sparse DFA.
- ///
- /// This is a convenience routine for `to_sparse_sized` that fixes the
- /// state identifier representation of the sparse DFA to the same
- /// representation used for this dense DFA.
- ///
- /// If the chosen state identifier representation is too small to represent
- /// all states in the sparse DFA, then this returns an error. In most
-    /// cases, if a dense DFA is constructible with `S` then a sparse DFA will
- /// be as well. However, it is not guaranteed.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dense = DenseDFA::new("foo[0-9]+")?;
- /// let sparse = dense.to_sparse()?;
- /// assert_eq!(Some(8), sparse.find(b"foo12345"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn to_sparse(&self) -> Result<SparseDFA<Vec<u8>, S>> {
- self.to_sparse_sized()
- }
-
- /// Convert this dense DFA to a sparse DFA.
- ///
- /// Using this routine requires supplying a type hint to choose the state
- /// identifier representation for the resulting sparse DFA.
- ///
- /// If the chosen state identifier representation is too small to represent
- /// all states in the sparse DFA, then this returns an error.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dense = DenseDFA::new("foo[0-9]+")?;
- /// let sparse = dense.to_sparse_sized::<u8>()?;
- /// assert_eq!(Some(8), sparse.find(b"foo12345"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn to_sparse_sized<A: StateID>(
- &self,
- ) -> Result<SparseDFA<Vec<u8>, A>> {
- self.repr().to_sparse_sized()
- }
-
- /// Create a new DFA whose match semantics are equivalent to this DFA,
- /// but attempt to use `u8` for the representation of state identifiers.
- /// If `u8` is insufficient to represent all state identifiers in this
- /// DFA, then this returns an error.
- ///
- /// This is a convenience routine for `to_sized::<u8>()`.
- pub fn to_u8(&self) -> Result<DenseDFA<Vec<u8>, u8>> {
- self.to_sized()
- }
-
- /// Create a new DFA whose match semantics are equivalent to this DFA,
- /// but attempt to use `u16` for the representation of state identifiers.
- /// If `u16` is insufficient to represent all state identifiers in this
- /// DFA, then this returns an error.
- ///
- /// This is a convenience routine for `to_sized::<u16>()`.
- pub fn to_u16(&self) -> Result<DenseDFA<Vec<u16>, u16>> {
- self.to_sized()
- }
-
- /// Create a new DFA whose match semantics are equivalent to this DFA,
- /// but attempt to use `u32` for the representation of state identifiers.
- /// If `u32` is insufficient to represent all state identifiers in this
- /// DFA, then this returns an error.
- ///
- /// This is a convenience routine for `to_sized::<u32>()`.
- #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
- pub fn to_u32(&self) -> Result<DenseDFA<Vec<u32>, u32>> {
- self.to_sized()
- }
-
- /// Create a new DFA whose match semantics are equivalent to this DFA,
- /// but attempt to use `u64` for the representation of state identifiers.
- /// If `u64` is insufficient to represent all state identifiers in this
- /// DFA, then this returns an error.
- ///
- /// This is a convenience routine for `to_sized::<u64>()`.
- #[cfg(target_pointer_width = "64")]
- pub fn to_u64(&self) -> Result<DenseDFA<Vec<u64>, u64>> {
- self.to_sized()
- }
-
- /// Create a new DFA whose match semantics are equivalent to this DFA, but
- /// attempt to use `A` for the representation of state identifiers. If `A`
- /// is insufficient to represent all state identifiers in this DFA, then
- /// this returns an error.
- ///
- /// An alternative way to construct such a DFA is to use
- /// [`dense::Builder::build_with_size`](dense/struct.Builder.html#method.build_with_size).
- /// In general, using the builder is preferred since it will use the given
- /// state identifier representation throughout determinization (and
- /// minimization, if done), and thereby using less memory throughout the
- /// entire construction process. However, these routines are necessary
- /// in cases where, say, a minimized DFA could fit in a smaller state
- /// identifier representation, but the initial determinized DFA would not.
- pub fn to_sized<A: StateID>(&self) -> Result<DenseDFA<Vec<A>, A>> {
- self.repr().to_sized().map(|r| r.into_dense_dfa())
- }
-
- /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in little
- /// endian format.
- ///
- /// If the state identifier representation of this DFA has a size different
- /// than 1, 2, 4 or 8 bytes, then this returns an error. All
- /// implementations of `StateID` provided by this crate satisfy this
- /// requirement.
- pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
- self.repr().to_bytes::<LittleEndian>()
- }
-
- /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in big
- /// endian format.
- ///
- /// If the state identifier representation of this DFA has a size different
- /// than 1, 2, 4 or 8 bytes, then this returns an error. All
- /// implementations of `StateID` provided by this crate satisfy this
- /// requirement.
- pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
- self.repr().to_bytes::<BigEndian>()
- }
-
- /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in native
- /// endian format. Generally, it is better to pick an explicit endianness
- /// using either `to_bytes_little_endian` or `to_bytes_big_endian`. This
- /// routine is useful in tests where the DFA is serialized and deserialized
- /// on the same platform.
- ///
- /// If the state identifier representation of this DFA has a size different
- /// than 1, 2, 4 or 8 bytes, then this returns an error. All
- /// implementations of `StateID` provided by this crate satisfy this
- /// requirement.
- pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
- self.repr().to_bytes::<NativeEndian>()
- }
-}
-
-impl<'a, S: StateID> DenseDFA<&'a [S], S> {
- /// Deserialize a DFA with a specific state identifier representation.
- ///
- /// Deserializing a DFA using this routine will never allocate heap memory.
- /// This is also guaranteed to be a constant time operation that does not
- /// vary with the size of the DFA.
- ///
- /// The bytes given should be generated by the serialization of a DFA with
- /// either the
- /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
- /// method or the
- /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
-    /// method, depending on the endianness of the machine you are
- /// deserializing this DFA from.
- ///
- /// If the state identifier representation is `usize`, then deserialization
- /// is dependent on the pointer size. For this reason, it is best to
- /// serialize DFAs using a fixed size representation for your state
- /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
- ///
- /// # Panics
- ///
- /// The bytes given should be *trusted*. In particular, if the bytes
- /// are not a valid serialization of a DFA, or if the given bytes are
- /// not aligned to an 8 byte boundary, or if the endianness of the
- /// serialized bytes is different than the endianness of the machine that
- /// is deserializing the DFA, then this routine will panic. Moreover, it is
- /// possible for this deserialization routine to succeed even if the given
- /// bytes do not represent a valid serialized dense DFA.
- ///
- /// # Safety
- ///
- /// This routine is unsafe because it permits callers to provide an
- /// arbitrary transition table with possibly incorrect transitions. While
- /// the various serialization routines will never return an incorrect
- /// transition table, there is no guarantee that the bytes provided here
- /// are correct. While deserialization does many checks (as documented
- /// above in the panic conditions), this routine does not check that the
- /// transition table is correct. Given an incorrect transition table, it is
- /// possible for the search routines to access out-of-bounds memory because
- /// of explicit bounds check elision.
- ///
- /// # Example
- ///
- /// This example shows how to serialize a DFA to raw bytes, deserialize it
- /// and then use it for searching. Note that we first convert the DFA to
- /// using `u16` for its state identifier representation before serializing
- /// it. While this isn't strictly necessary, it's good practice in order to
- /// decrease the size of the DFA and to avoid platform specific pitfalls
- /// such as differing pointer sizes.
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let initial = DenseDFA::new("foo[0-9]+")?;
- /// let bytes = initial.to_u16()?.to_bytes_native_endian()?;
- /// let dfa: DenseDFA<&[u16], u16> = unsafe {
- /// DenseDFA::from_bytes(&bytes)
- /// };
- ///
- /// assert_eq!(Some(8), dfa.find(b"foo12345"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub unsafe fn from_bytes(buf: &'a [u8]) -> DenseDFA<&'a [S], S> {
- Repr::from_bytes(buf).into_dense_dfa()
- }
-}
-
-#[cfg(feature = "std")]
-impl<S: StateID> DenseDFA<Vec<S>, S> {
- /// Minimize this DFA in place.
- ///
- /// This is not part of the public API. It is only exposed to allow for
- /// more granular external benchmarking.
- #[doc(hidden)]
- pub fn minimize(&mut self) {
- self.repr_mut().minimize();
- }
-
- /// Return a mutable reference to the internal DFA representation.
- fn repr_mut(&mut self) -> &mut Repr<Vec<S>, S> {
- match *self {
- DenseDFA::Standard(ref mut r) => &mut r.0,
- DenseDFA::ByteClass(ref mut r) => &mut r.0,
- DenseDFA::Premultiplied(ref mut r) => &mut r.0,
- DenseDFA::PremultipliedByteClass(ref mut r) => &mut r.0,
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-}
-
-impl<T: AsRef<[S]>, S: StateID> DFA for DenseDFA<T, S> {
- type ID = S;
-
- #[inline]
- fn start_state(&self) -> S {
- self.repr().start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: S) -> bool {
- self.repr().is_match_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: S) -> bool {
- self.repr().is_dead_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: S) -> bool {
- self.repr().is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- self.repr().is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: S, input: u8) -> S {
- match *self {
- DenseDFA::Standard(ref r) => r.next_state(current, input),
- DenseDFA::ByteClass(ref r) => r.next_state(current, input),
- DenseDFA::Premultiplied(ref r) => r.next_state(current, input),
- DenseDFA::PremultipliedByteClass(ref r) => {
- r.next_state(current, input)
- }
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- #[inline]
- unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
- match *self {
- DenseDFA::Standard(ref r) => {
- r.next_state_unchecked(current, input)
- }
- DenseDFA::ByteClass(ref r) => {
- r.next_state_unchecked(current, input)
- }
- DenseDFA::Premultiplied(ref r) => {
- r.next_state_unchecked(current, input)
- }
- DenseDFA::PremultipliedByteClass(ref r) => {
- r.next_state_unchecked(current, input)
- }
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- // We specialize the following methods because it lets us lift the
- // case analysis between the different types of dense DFAs. Instead of
- // doing the case analysis for every transition, we do it once before
- // searching.
-
- #[inline]
- fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
- match *self {
- DenseDFA::Standard(ref r) => r.is_match_at(bytes, start),
- DenseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
- DenseDFA::Premultiplied(ref r) => r.is_match_at(bytes, start),
- DenseDFA::PremultipliedByteClass(ref r) => {
- r.is_match_at(bytes, start)
- }
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- #[inline]
- fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- match *self {
- DenseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
- DenseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
- DenseDFA::Premultiplied(ref r) => {
- r.shortest_match_at(bytes, start)
- }
- DenseDFA::PremultipliedByteClass(ref r) => {
- r.shortest_match_at(bytes, start)
- }
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- #[inline]
- fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- match *self {
- DenseDFA::Standard(ref r) => r.find_at(bytes, start),
- DenseDFA::ByteClass(ref r) => r.find_at(bytes, start),
- DenseDFA::Premultiplied(ref r) => r.find_at(bytes, start),
- DenseDFA::PremultipliedByteClass(ref r) => r.find_at(bytes, start),
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- #[inline]
- fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- match *self {
- DenseDFA::Standard(ref r) => r.rfind_at(bytes, start),
- DenseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
- DenseDFA::Premultiplied(ref r) => r.rfind_at(bytes, start),
- DenseDFA::PremultipliedByteClass(ref r) => {
- r.rfind_at(bytes, start)
- }
- DenseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-}
-
-/// A standard dense DFA that does not use premultiplication or byte classes.
-///
-/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
-/// can be used for searching directly. One possible reason why one might want
-/// to use this type directly is if you are implementing your own search
-/// routines by walking a DFA's transitions directly. In that case, you'll want
-/// to use this type (or any of the other DFA variant types) directly, since
-/// they implement `next_state` more efficiently.
-#[derive(Clone, Debug)]
-pub struct Standard<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
-
-impl<T: AsRef<[S]>, S: StateID> DFA for Standard<T, S> {
- type ID = S;
-
- #[inline]
- fn start_state(&self) -> S {
- self.0.start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: S) -> bool {
- self.0.is_match_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: S) -> bool {
- self.0.is_dead_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: S) -> bool {
- self.0.is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- self.0.is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: S, input: u8) -> S {
- let o = current.to_usize() * ALPHABET_LEN + input as usize;
- self.0.trans()[o]
- }
-
- #[inline]
- unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
- let o = current.to_usize() * ALPHABET_LEN + input as usize;
- *self.0.trans().get_unchecked(o)
- }
-}
-
-/// A dense DFA that shrinks its alphabet.
-///
-/// Alphabet shrinking is achieved by using a set of equivalence classes
-/// instead of using all possible byte values. Any two bytes belong to the same
-/// equivalence class if and only if they can be used interchangeably anywhere
-/// in the DFA while never discriminating between a match and a non-match.
-///
-/// This type of DFA can result in significant space reduction with a very
-/// small match time performance penalty.
-///
-/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
-/// can be used for searching directly. One possible reason why one might want
-/// to use this type directly is if you are implementing your own search
-/// routines by walking a DFA's transitions directly. In that case, you'll want
-/// to use this type (or any of the other DFA variant types) directly, since
-/// they implement `next_state` more efficiently.
-#[derive(Clone, Debug)]
-pub struct ByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
-
-impl<T: AsRef<[S]>, S: StateID> DFA for ByteClass<T, S> {
- type ID = S;
-
- #[inline]
- fn start_state(&self) -> S {
- self.0.start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: S) -> bool {
- self.0.is_match_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: S) -> bool {
- self.0.is_dead_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: S) -> bool {
- self.0.is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- self.0.is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: S, input: u8) -> S {
- let input = self.0.byte_classes().get(input);
- let o = current.to_usize() * self.0.alphabet_len() + input as usize;
- self.0.trans()[o]
- }
-
- #[inline]
- unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
- let input = self.0.byte_classes().get_unchecked(input);
- let o = current.to_usize() * self.0.alphabet_len() + input as usize;
- *self.0.trans().get_unchecked(o)
- }
-}
-
-/// A dense DFA that premultiplies all of its state identifiers in its
-/// transition table.
-///
-/// This saves an instruction per byte at match time which improves search
-/// performance.
-///
-/// The only downside of premultiplication is that it may prevent one from
-/// using a smaller state identifier representation than you otherwise could.
-///
-/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
-/// can be used for searching directly. One possible reason why one might want
-/// to use this type directly is if you are implementing your own search
-/// routines by walking a DFA's transitions directly. In that case, you'll want
-/// to use this type (or any of the other DFA variant types) directly, since
-/// they implement `next_state` more efficiently.
-#[derive(Clone, Debug)]
-pub struct Premultiplied<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
-
-impl<T: AsRef<[S]>, S: StateID> DFA for Premultiplied<T, S> {
- type ID = S;
-
- #[inline]
- fn start_state(&self) -> S {
- self.0.start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: S) -> bool {
- self.0.is_match_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: S) -> bool {
- self.0.is_dead_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: S) -> bool {
- self.0.is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- self.0.is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: S, input: u8) -> S {
- let o = current.to_usize() + input as usize;
- self.0.trans()[o]
- }
-
- #[inline]
- unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
- let o = current.to_usize() + input as usize;
- *self.0.trans().get_unchecked(o)
- }
-}
-
-/// The default configuration of a dense DFA, which uses byte classes and
-/// premultiplies its state identifiers.
-///
-/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
-/// can be used for searching directly. One possible reason why one might want
-/// to use this type directly is if you are implementing your own search
-/// routines by walking a DFA's transitions directly. In that case, you'll want
-/// to use this type (or any of the other DFA variant types) directly, since
-/// they implement `next_state` more efficiently.
-#[derive(Clone, Debug)]
-pub struct PremultipliedByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>);
-
-impl<T: AsRef<[S]>, S: StateID> DFA for PremultipliedByteClass<T, S> {
- type ID = S;
-
- #[inline]
- fn start_state(&self) -> S {
- self.0.start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: S) -> bool {
- self.0.is_match_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: S) -> bool {
- self.0.is_dead_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: S) -> bool {
- self.0.is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- self.0.is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: S, input: u8) -> S {
- let input = self.0.byte_classes().get(input);
- let o = current.to_usize() + input as usize;
- self.0.trans()[o]
- }
-
- #[inline]
- unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
- let input = self.0.byte_classes().get_unchecked(input);
- let o = current.to_usize() + input as usize;
- *self.0.trans().get_unchecked(o)
- }
-}
-
-/// The internal representation of a dense DFA.
-///
-/// This representation is shared by all DFA variants.
-#[derive(Clone)]
-#[cfg_attr(not(feature = "std"), derive(Debug))]
-pub(crate) struct Repr<T, S> {
- /// Whether the state identifiers in the transition table have been
- /// premultiplied or not.
- ///
-    /// Premultiplied identifiers mean that instead of your matching loop
- /// looking something like this:
- ///
- /// state = dfa.start
- /// for byte in haystack:
- /// next = dfa.transitions[state * len(alphabet) + byte]
- /// if dfa.is_match(next):
- /// return true
- /// return false
- ///
- /// it can instead look like this:
- ///
- /// state = dfa.start
- /// for byte in haystack:
- /// next = dfa.transitions[state + byte]
- /// if dfa.is_match(next):
- /// return true
- /// return false
- ///
- /// In other words, we save a multiplication instruction in the critical
- /// path. This turns out to be a decent performance win. The cost of using
- /// premultiplied state ids is that they can require a bigger state id
- /// representation.
- premultiplied: bool,
- /// Whether this DFA can only match at the beginning of input or not.
- ///
- /// When true, a match should only be reported if it begins at the 0th
- /// index of the haystack.
- anchored: bool,
- /// The initial start state ID.
- start: S,
- /// The total number of states in this DFA. Note that a DFA always has at
- /// least one state---the dead state---even for the empty DFA. In particular,
- /// the dead state always has ID 0 and is correspondingly always the first
- /// state. The dead state is never a match state.
- state_count: usize,
- /// States in a DFA have a *partial* ordering such that a match state
- /// always precedes any non-match state (except for the special dead
- /// state).
- ///
- /// `max_match` corresponds to the last state that is a match state. This
- /// encoding has two critical benefits. Firstly, we are not required to
- /// store any additional per-state information about whether it is a match
- /// state or not. Secondly, when searching with the DFA, we can do a single
- /// comparison with `max_match` for each byte instead of two comparisons
- /// for each byte (one testing whether it is a match and the other testing
- /// whether we've reached a dead state). Namely, to determine the status
- /// of the next state, we can do this:
- ///
- ///     next_state = transition[cur_state * alphabet_len + cur_byte]
- ///     if next_state <= max_match:
- ///         // next_state is either dead (no-match) or a match
- ///         return next_state != dead
- max_match: S,
- /// A set of equivalence classes, where a single equivalence class
- /// represents a set of bytes that never discriminate between a match
- /// and a non-match in the DFA. Each equivalence class corresponds to
- /// a single letter in this DFA's alphabet, where the maximum number of
- /// letters is 256 (each possible value of a byte). Consequently, the
- /// number of equivalence classes corresponds to the number of transitions
- /// for each DFA state.
- ///
- /// The only time the number of equivalence classes is fewer than 256 is
- /// if the DFA's kind uses byte classes. If the DFA doesn't use byte
- /// classes, then this vector is empty.
- byte_classes: ByteClasses,
- /// A contiguous region of memory representing the transition table in
- /// row-major order. The representation is dense. That is, every state has
- /// precisely the same number of transitions. The maximum number of
- /// transitions is 256. If a DFA has been instructed to use byte classes,
- /// then the number of transitions can be much less.
- ///
- /// In practice, T is either Vec<S> or &[S].
- trans: T,
-}
-
-#[cfg(feature = "std")]
-impl<S: StateID> Repr<Vec<S>, S> {
- /// Create a new empty DFA with singleton byte classes (every byte is its
- /// own equivalence class).
- pub fn empty() -> Repr<Vec<S>, S> {
- Repr::empty_with_byte_classes(ByteClasses::singletons())
- }
-
- /// Create a new empty DFA with the given set of byte equivalence classes.
- /// An empty DFA never matches any input.
- pub fn empty_with_byte_classes(
- byte_classes: ByteClasses,
- ) -> Repr<Vec<S>, S> {
- let mut dfa = Repr {
- premultiplied: false,
- anchored: true,
- start: dead_id(),
- state_count: 0,
- max_match: S::from_usize(0),
- byte_classes,
- trans: vec![],
- };
- // Every state ID repr must be able to fit at least one state.
- dfa.add_empty_state().unwrap();
- dfa
- }
-
- /// Sets whether this DFA is anchored or not.
- pub fn anchored(mut self, yes: bool) -> Repr<Vec<S>, S> {
- self.anchored = yes;
- self
- }
-}
-
-impl<T: AsRef<[S]>, S: StateID> Repr<T, S> {
- /// Convert this internal DFA representation to a DenseDFA based on its
- /// transition table access pattern.
- pub fn into_dense_dfa(self) -> DenseDFA<T, S> {
- match (self.premultiplied, self.byte_classes().is_singleton()) {
- // no premultiplication, no byte classes
- (false, true) => DenseDFA::Standard(Standard(self)),
- // no premultiplication, yes byte classes
- (false, false) => DenseDFA::ByteClass(ByteClass(self)),
- // yes premultiplication, no byte classes
- (true, true) => DenseDFA::Premultiplied(Premultiplied(self)),
- // yes premultiplication, yes byte classes
- (true, false) => {
- DenseDFA::PremultipliedByteClass(PremultipliedByteClass(self))
- }
- }
- }
-
- fn as_ref<'a>(&'a self) -> Repr<&'a [S], S> {
- Repr {
- premultiplied: self.premultiplied,
- anchored: self.anchored,
- start: self.start,
- state_count: self.state_count,
- max_match: self.max_match,
- byte_classes: self.byte_classes().clone(),
- trans: self.trans(),
- }
- }
-
- #[cfg(feature = "std")]
- fn to_owned(&self) -> Repr<Vec<S>, S> {
- Repr {
- premultiplied: self.premultiplied,
- anchored: self.anchored,
- start: self.start,
- state_count: self.state_count,
- max_match: self.max_match,
- byte_classes: self.byte_classes().clone(),
- trans: self.trans().to_vec(),
- }
- }
-
- /// Return the starting state of this DFA.
- ///
- /// All searches using this DFA must begin at this state. There is exactly
- /// one starting state for every DFA. A starting state may be a dead state
- /// or a matching state or neither.
- pub fn start_state(&self) -> S {
- self.start
- }
-
- /// Returns true if and only if the given identifier corresponds to a match
- /// state.
- pub fn is_match_state(&self, id: S) -> bool {
- id <= self.max_match && id != dead_id()
- }
-
- /// Returns true if and only if the given identifier corresponds to a dead
- /// state.
- pub fn is_dead_state(&self, id: S) -> bool {
- id == dead_id()
- }
-
- /// Returns true if and only if the given identifier could correspond to
- /// either a match state or a dead state. If this returns false, then the
- /// given identifier does not correspond to either a match state or a dead
- /// state.
- pub fn is_match_or_dead_state(&self, id: S) -> bool {
- id <= self.max_match_state()
- }
-
- /// Returns the maximum identifier for which a match state can exist.
- ///
- /// More specifically, the returned identifier always corresponds to either
- /// a match state or a dead state. Namely, either
- /// `is_match_state(returned)` or `is_dead_state(returned)` is guaranteed
- /// to be true.
- pub fn max_match_state(&self) -> S {
- self.max_match
- }
-
- /// Returns true if and only if this DFA is anchored.
- pub fn is_anchored(&self) -> bool {
- self.anchored
- }
-
- /// Return the byte classes used by this DFA.
- pub fn byte_classes(&self) -> &ByteClasses {
- &self.byte_classes
- }
-
- /// Returns an iterator over all states in this DFA.
- ///
- /// This iterator yields a tuple for each state. The first element of the
- /// tuple corresponds to a state's identifier, and the second element
- /// corresponds to the state itself (comprised of its transitions).
- ///
- /// If this DFA is premultiplied, then the state identifiers are in
- /// turn premultiplied as well, making them usable without additional
- /// modification.
- #[cfg(feature = "std")]
- pub fn states(&self) -> StateIter<T, S> {
- let it = self.trans().chunks(self.alphabet_len());
- StateIter { dfa: self, it: it.enumerate() }
- }
-
- /// Return the total number of states in this DFA. Every DFA has at least
- /// 1 state, even the empty DFA.
- #[cfg(feature = "std")]
- pub fn state_count(&self) -> usize {
- self.state_count
- }
-
- /// Return the number of elements in this DFA's alphabet.
- ///
- /// If this DFA doesn't use byte classes, then this is always equivalent
- /// to 256. Otherwise, it is guaranteed to be some value less than or equal
- /// to 256.
- pub fn alphabet_len(&self) -> usize {
- self.byte_classes().alphabet_len()
- }
-
- /// Returns the memory usage, in bytes, of this DFA.
- pub fn memory_usage(&self) -> usize {
- self.trans().len() * mem::size_of::<S>()
- }
-
- /// Convert the given state identifier to the state's index. The state's
- /// index corresponds to the position in which it appears in the transition
- /// table. When a DFA is NOT premultiplied, then a state's identifier is
- /// also its index. When a DFA is premultiplied, then a state's identifier
- /// is equal to `index * alphabet_len`. This routine reverses that.
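- ///
- /// For example (illustrative): with an alphabet length of 256, a
- /// premultiplied identifier of 512 corresponds to state index 2.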
- #[cfg(feature = "std")]
- pub fn state_id_to_index(&self, id: S) -> usize {
- if self.premultiplied {
- id.to_usize() / self.alphabet_len()
- } else {
- id.to_usize()
- }
- }
-
- /// Return this DFA's transition table as a slice.
- fn trans(&self) -> &[S] {
- self.trans.as_ref()
- }
-
- /// Create a sparse DFA from the internal representation of a dense DFA.
- #[cfg(feature = "std")]
- pub fn to_sparse_sized<A: StateID>(
- &self,
- ) -> Result<SparseDFA<Vec<u8>, A>> {
- SparseDFA::from_dense_sized(self)
- }
-
- /// Create a new DFA whose match semantics are equivalent to this DFA, but
- /// attempt to use `A` for the representation of state identifiers. If `A`
- /// is insufficient to represent all state identifiers in this DFA, then
- /// this returns an error.
- #[cfg(feature = "std")]
- pub fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<A>, A>> {
- // Check that this DFA can fit into A's representation.
- let mut last_state_id = self.state_count - 1;
- if self.premultiplied {
- last_state_id *= self.alphabet_len();
- }
- if last_state_id > A::max_id() {
- return Err(Error::state_id_overflow(A::max_id()));
- }
-
- // We're off to the races. The new DFA is the same as the old one,
- // but its transition table is truncated.
- let mut new = Repr {
- premultiplied: self.premultiplied,
- anchored: self.anchored,
- start: A::from_usize(self.start.to_usize()),
- state_count: self.state_count,
- max_match: A::from_usize(self.max_match.to_usize()),
- byte_classes: self.byte_classes().clone(),
- trans: vec![dead_id::<A>(); self.trans().len()],
- };
- for (i, id) in new.trans.iter_mut().enumerate() {
- *id = A::from_usize(self.trans()[i].to_usize());
- }
- Ok(new)
- }
-
- /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary.
- ///
- /// If the state identifier representation of this DFA has a size different
- /// from 1, 2, 4 or 8 bytes, then this returns an error. All
- /// implementations of `StateID` provided by this crate satisfy this
- /// requirement.
- #[cfg(feature = "std")]
- pub(crate) fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
- let label = b"rust-regex-automata-dfa\x00";
- assert_eq!(24, label.len());
-
- let trans_size = mem::size_of::<S>() * self.trans().len();
- let size =
- // For human readable label.
- label.len()
- // endianness check, must be equal to 0xFEFF for native endian
- + 2
- // For version number.
- + 2
- // Size of state ID representation, in bytes.
- // Must be 1, 2, 4 or 8.
- + 2
- // For DFA misc options.
- + 2
- // For start state.
- + 8
- // For state count.
- + 8
- // For max match state.
- + 8
- // For byte class map.
- + 256
- // For transition table.
- + trans_size;
- // sanity check, this can be updated if need be
- assert_eq!(312 + trans_size, size);
- // This must always pass. It checks that the transition table is at
- // a properly aligned address.
- assert_eq!(0, (size - trans_size) % 8);
-
- let mut buf = vec![0; size];
- let mut i = 0;
-
- // write label
- for &b in label {
- buf[i] = b;
- i += 1;
- }
- // endianness check
- A::write_u16(&mut buf[i..], 0xFEFF);
- i += 2;
- // version number
- A::write_u16(&mut buf[i..], 1);
- i += 2;
- // size of state ID
- let state_size = mem::size_of::<S>();
- if ![1, 2, 4, 8].contains(&state_size) {
- return Err(Error::serialize(&format!(
- "state size of {} not supported, must be 1, 2, 4 or 8",
- state_size
- )));
- }
- A::write_u16(&mut buf[i..], state_size as u16);
- i += 2;
- // DFA misc options
- let mut options = 0u16;
- if self.premultiplied {
- options |= MASK_PREMULTIPLIED;
- }
- if self.anchored {
- options |= MASK_ANCHORED;
- }
- A::write_u16(&mut buf[i..], options);
- i += 2;
- // start state
- A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
- i += 8;
- // state count
- A::write_u64(&mut buf[i..], self.state_count as u64);
- i += 8;
- // max match state
- A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
- i += 8;
- // byte class map
- for b in (0..256).map(|b| b as u8) {
- buf[i] = self.byte_classes().get(b);
- i += 1;
- }
- // transition table
- for &id in self.trans() {
- write_state_id_bytes::<A, _>(&mut buf[i..], id);
- i += state_size;
- }
- assert_eq!(size, i, "expected to consume entire buffer");
-
- Ok(buf)
- }
-}
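-
-// A hedged summary of the byte layout produced by `to_bytes` above, derived
-// from its size computation (offsets in bytes; multi-byte fields use the
-// chosen byte order `A`):
-//
-//     0..24     label: b"rust-regex-automata-dfa\x00"
-//     24..26    endianness check (0xFEFF)
-//     26..28    version (currently 1)
-//     28..30    state id size in bytes (1, 2, 4 or 8)
-//     30..32    misc option bit flags (premultiplied, anchored)
-//     32..40    start state id
-//     40..48    state count
-//     48..56    max match state id
-//     56..312   byte class map (256 bytes)
-//     312..     transition table, one entry per state per alphabet letter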
-
-impl<'a, S: StateID> Repr<&'a [S], S> {
- /// The implementation for deserializing a DFA from raw bytes.
- unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [S], S> {
- assert_eq!(
- 0,
- buf.as_ptr() as usize % mem::align_of::<S>(),
- "DenseDFA starting at address {} is not aligned to {} bytes",
- buf.as_ptr() as usize,
- mem::align_of::<S>()
- );
-
- // skip over label
- match buf.iter().position(|&b| b == b'\x00') {
- None => panic!("could not find label"),
- Some(i) => buf = &buf[i + 1..],
- }
-
- // check that current endianness is same as endianness of DFA
- let endian_check = NativeEndian::read_u16(buf);
- buf = &buf[2..];
- if endian_check != 0xFEFF {
- panic!(
- "endianness mismatch, expected 0xFEFF but got 0x{:X}. \
- are you trying to load a DenseDFA serialized with a \
- different endianness?",
- endian_check,
- );
- }
-
- // check that the version number is supported
- let version = NativeEndian::read_u16(buf);
- buf = &buf[2..];
- if version != 1 {
- panic!(
- "expected version 1, but found unsupported version {}",
- version,
- );
- }
-
- // read size of state
- let state_size = NativeEndian::read_u16(buf) as usize;
- if state_size != mem::size_of::<S>() {
- panic!(
- "state size of DenseDFA ({}) does not match \
- requested state size ({})",
- state_size,
- mem::size_of::<S>(),
- );
- }
- buf = &buf[2..];
-
- // read miscellaneous options
- let opts = NativeEndian::read_u16(buf);
- buf = &buf[2..];
-
- // read start state
- let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
- buf = &buf[8..];
-
- // read state count
- let state_count = NativeEndian::read_u64(buf) as usize;
- buf = &buf[8..];
-
- // read max match state
- let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
- buf = &buf[8..];
-
- // read byte classes
- let byte_classes = ByteClasses::from_slice(&buf[..256]);
- buf = &buf[256..];
-
- let len = state_count * byte_classes.alphabet_len();
- let len_bytes = len * state_size;
- assert!(
- buf.len() >= len_bytes,
- "insufficient transition table bytes, \
- expected at least {} but only have {}",
- len_bytes,
- buf.len()
- );
- assert_eq!(
- 0,
- buf.as_ptr() as usize % mem::align_of::<S>(),
- "DenseDFA transition table is not properly aligned"
- );
-
- // SAFETY: This is the only actually unsafe operation in this entire
- // routine. The key things we need to worry about here are alignment
- // and size. The two asserts above should cover both conditions.
- let trans = slice::from_raw_parts(buf.as_ptr() as *const S, len);
- Repr {
- premultiplied: opts & MASK_PREMULTIPLIED > 0,
- anchored: opts & MASK_ANCHORED > 0,
- start,
- state_count,
- max_match,
- byte_classes,
- trans,
- }
- }
-}
-
-/// The following methods implement mutable routines on the internal
-/// representation of a DFA. As such, we must fix the first type parameter to
-/// a `Vec<S>` since a generic `T: AsRef<[S]>` does not permit mutation. We
-/// can get away with this because these methods are internal to the crate and
-/// are exclusively used during construction of the DFA.
-#[cfg(feature = "std")]
-impl<S: StateID> Repr<Vec<S>, S> {
- pub fn premultiply(&mut self) -> Result<()> {
- if self.premultiplied || self.state_count <= 1 {
- return Ok(());
- }
-
- let alpha_len = self.alphabet_len();
- premultiply_overflow_error(
- S::from_usize(self.state_count - 1),
- alpha_len,
- )?;
-
- for id in (0..self.state_count).map(S::from_usize) {
- for (_, next) in self.get_state_mut(id).iter_mut() {
- *next = S::from_usize(next.to_usize() * alpha_len);
- }
- }
- self.premultiplied = true;
- self.start = S::from_usize(self.start.to_usize() * alpha_len);
- self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len);
- Ok(())
- }
-
- /// Minimize this DFA using Hopcroft's algorithm.
- ///
- /// This cannot be called on a premultiplied DFA.
- pub fn minimize(&mut self) {
- assert!(!self.premultiplied, "can't minimize premultiplied DFA");
-
- Minimizer::new(self).run();
- }
-
- /// Set the start state of this DFA.
- ///
- /// Note that a start state cannot be set on a premultiplied DFA. Instead,
- /// DFAs should first be completely constructed and then premultiplied.
- pub fn set_start_state(&mut self, start: S) {
- assert!(!self.premultiplied, "can't set start on premultiplied DFA");
- assert!(start.to_usize() < self.state_count, "invalid start state");
-
- self.start = start;
- }
-
- /// Set the maximum state identifier that could possibly correspond to a
- /// match state.
- ///
- /// Callers must uphold the invariant that any state identifier less than
- /// or equal to the identifier given is either a match state or the special
- /// dead state (which always has identifier 0 and whose transitions all
- /// lead back to itself).
- ///
- /// This cannot be called on a premultiplied DFA.
- pub fn set_max_match_state(&mut self, id: S) {
- assert!(!self.premultiplied, "can't set match on premultiplied DFA");
- assert!(id.to_usize() < self.state_count, "invalid max match state");
-
- self.max_match = id;
- }
-
- /// Add the given transition to this DFA. Both the `from` and `to` states
- /// must already exist.
- ///
- /// This cannot be called on a premultiplied DFA.
- pub fn add_transition(&mut self, from: S, byte: u8, to: S) {
- assert!(!self.premultiplied, "can't add trans to premultiplied DFA");
- assert!(from.to_usize() < self.state_count, "invalid from state");
- assert!(to.to_usize() < self.state_count, "invalid to state");
-
- let class = self.byte_classes().get(byte);
- let offset = from.to_usize() * self.alphabet_len() + class as usize;
- self.trans[offset] = to;
- }
-
- /// Add an empty state (a state where all transitions lead to the dead state)
- /// and return its identifier. The identifier returned is guaranteed to
- /// not point to any other existing state.
- ///
- /// If adding a state would exhaust the state identifier space (given by
- /// `S`), then this returns an error. In practice, this means that the
- /// state identifier representation chosen is too small.
- ///
- /// This cannot be called on a premultiplied DFA.
- pub fn add_empty_state(&mut self) -> Result<S> {
- assert!(!self.premultiplied, "can't add state to premultiplied DFA");
-
- let id = if self.state_count == 0 {
- S::from_usize(0)
- } else {
- next_state_id(S::from_usize(self.state_count - 1))?
- };
- let alphabet_len = self.alphabet_len();
- self.trans.extend(iter::repeat(dead_id::<S>()).take(alphabet_len));
- // This `checked_add` should never panic, since `state_count` is a
- // usize; the transition table would have run out of room long before
- // `state_count` could overflow.
- self.state_count = self.state_count.checked_add(1).unwrap();
- Ok(id)
- }
-
- /// Return a mutable representation of the state corresponding to the given
- /// id. This is useful for implementing routines that manipulate DFA states
- /// (e.g., swapping states).
- ///
- /// This cannot be called on a premultiplied DFA.
- pub fn get_state_mut(&mut self, id: S) -> StateMut<S> {
- assert!(!self.premultiplied, "can't get state in premultiplied DFA");
-
- let alphabet_len = self.alphabet_len();
- let offset = id.to_usize() * alphabet_len;
- StateMut {
- transitions: &mut self.trans[offset..offset + alphabet_len],
- }
- }
-
- /// Swap the two states given in the transition table.
- ///
- /// This routine does not do anything to check the correctness of this
- /// swap. Callers must ensure that other states pointing to id1 and id2 are
- /// updated appropriately.
- ///
- /// This cannot be called on a premultiplied DFA.
- pub fn swap_states(&mut self, id1: S, id2: S) {
- assert!(!self.premultiplied, "can't swap states in premultiplied DFA");
-
- let o1 = id1.to_usize() * self.alphabet_len();
- let o2 = id2.to_usize() * self.alphabet_len();
- for b in 0..self.alphabet_len() {
- self.trans.swap(o1 + b, o2 + b);
- }
- }
-
- /// Truncate the states in this DFA to the given count.
- ///
- /// This routine does not do anything to check the correctness of this
- /// truncation. Callers must ensure that other states pointing to truncated
- /// states are updated appropriately.
- ///
- /// This cannot be called on a premultiplied DFA.
- pub fn truncate_states(&mut self, count: usize) {
- assert!(!self.premultiplied, "can't truncate in premultiplied DFA");
-
- let alphabet_len = self.alphabet_len();
- self.trans.truncate(count * alphabet_len);
- self.state_count = count;
- }
-
- /// This routine shuffles all match states in this DFA---according to the
- /// given map---to the beginning of the DFA such that every non-match state
- /// appears after every match state. (With one exception: the special dead
- /// state remains as the first state.) The given map should have length
- /// exactly equal to the number of states in this DFA.
- ///
- /// The purpose of doing this shuffling is to avoid the need to store
- /// additional state to determine whether a state is a match state or not.
- /// It also enables a single conditional in the core matching loop instead
- /// of two.
- ///
- /// This updates `self.max_match` to point to the last matching state as
- /// well as `self.start` if the starting state was moved.
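- ///
- /// As a hedged worked example (D = dead, M = match, N = non-match): a
- /// state ordering of `D, N, M, N, M` is shuffled to `D, M, M, N, N`, and
- /// `max_match` is then the identifier of the second `M`.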
- pub fn shuffle_match_states(&mut self, is_match: &[bool]) {
- assert!(
- !self.premultiplied,
- "cannot shuffle match states of premultiplied DFA"
- );
- assert_eq!(self.state_count, is_match.len());
-
- if self.state_count <= 1 {
- return;
- }
-
- let mut first_non_match = 1;
- while first_non_match < self.state_count && is_match[first_non_match] {
- first_non_match += 1;
- }
-
- let mut swaps: Vec<S> = vec![dead_id(); self.state_count];
- let mut cur = self.state_count - 1;
- while cur > first_non_match {
- if is_match[cur] {
- self.swap_states(
- S::from_usize(cur),
- S::from_usize(first_non_match),
- );
- swaps[cur] = S::from_usize(first_non_match);
- swaps[first_non_match] = S::from_usize(cur);
-
- first_non_match += 1;
- while first_non_match < cur && is_match[first_non_match] {
- first_non_match += 1;
- }
- }
- cur -= 1;
- }
- for id in (0..self.state_count).map(S::from_usize) {
- for (_, next) in self.get_state_mut(id).iter_mut() {
- if swaps[next.to_usize()] != dead_id() {
- *next = swaps[next.to_usize()];
- }
- }
- }
- if swaps[self.start.to_usize()] != dead_id() {
- self.start = swaps[self.start.to_usize()];
- }
- self.max_match = S::from_usize(first_non_match - 1);
- }
-}
-
-#[cfg(feature = "std")]
-impl<T: AsRef<[S]>, S: StateID> fmt::Debug for Repr<T, S> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- fn state_status<T: AsRef<[S]>, S: StateID>(
- dfa: &Repr<T, S>,
- id: S,
- ) -> &'static str {
- if id == dead_id() {
- if dfa.is_match_state(id) {
- "D*"
- } else {
- "D "
- }
- } else if id == dfa.start_state() {
- if dfa.is_match_state(id) {
- ">*"
- } else {
- "> "
- }
- } else {
- if dfa.is_match_state(id) {
- " *"
- } else {
- " "
- }
- }
- }
-
- writeln!(f, "DenseDFA(")?;
- for (id, state) in self.states() {
- let status = state_status(self, id);
- writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?;
- }
- writeln!(f, ")")?;
- Ok(())
- }
-}
-
-/// An iterator over all states in a DFA.
-///
-/// This iterator yields a tuple for each state. The first element of the
-/// tuple corresponds to a state's identifier, and the second element
-/// corresponds to the state itself (comprised of its transitions).
-///
-/// If this DFA is premultiplied, then the state identifiers are in turn
-/// premultiplied as well, making them usable without additional modification.
-///
- /// `'a` corresponds to the lifetime of the original DFA, `T` corresponds to
-/// the type of the transition table itself and `S` corresponds to the state
-/// identifier representation.
-#[cfg(feature = "std")]
-pub(crate) struct StateIter<'a, T: 'a, S: 'a> {
- dfa: &'a Repr<T, S>,
- it: iter::Enumerate<slice::Chunks<'a, S>>,
-}
-
-#[cfg(feature = "std")]
-impl<'a, T: AsRef<[S]>, S: StateID> Iterator for StateIter<'a, T, S> {
- type Item = (S, State<'a, S>);
-
- fn next(&mut self) -> Option<(S, State<'a, S>)> {
- self.it.next().map(|(id, chunk)| {
- let state = State { transitions: chunk };
- let id = if self.dfa.premultiplied {
- id * self.dfa.alphabet_len()
- } else {
- id
- };
- (S::from_usize(id), state)
- })
- }
-}
-
-/// An immutable representation of a single DFA state.
-///
- /// `'a` corresponds to the lifetime of a DFA's transition table and `S`
-/// corresponds to the state identifier representation.
-#[cfg(feature = "std")]
-pub(crate) struct State<'a, S: 'a> {
- transitions: &'a [S],
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> State<'a, S> {
- /// Return an iterator over all transitions in this state. This yields
- /// a number of transitions equivalent to the alphabet length of the
- /// corresponding DFA.
- ///
- /// Each transition is represented by a tuple. The first element is
- /// the input byte for that transition and the second element is the
- /// transition itself.
- pub fn transitions(&self) -> StateTransitionIter<S> {
- StateTransitionIter { it: self.transitions.iter().enumerate() }
- }
-
- /// Return an iterator over a sparse representation of the transitions in
- /// this state. Only non-dead transitions are returned.
- ///
- /// The "sparse" representation in this case corresponds to a sequence of
- /// triples. The first two elements of the triple comprise an inclusive
- /// byte range while the last element corresponds to the transition taken
- /// for all bytes in the range.
- ///
- /// This is somewhat more condensed than the classical sparse
- /// representation (where you have an element for every non-dead
- /// transition), but in practice, checking if a byte is in a range is very
- /// cheap and using ranges tends to conserve quite a bit more space.
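- ///
- /// For example (illustrative): a state that maps `b'a'` through `b'z'`
- /// to state `5` and every other byte to the dead state yields exactly
- /// one triple, `(b'a', b'z', 5)`.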
- pub fn sparse_transitions(&self) -> StateSparseTransitionIter<S> {
- StateSparseTransitionIter { dense: self.transitions(), cur: None }
- }
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> fmt::Debug for State<'a, S> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let mut transitions = vec![];
- for (start, end, next_id) in self.sparse_transitions() {
- let line = if start == end {
- format!("{} => {}", escape(start), next_id.to_usize())
- } else {
- format!(
- "{}-{} => {}",
- escape(start),
- escape(end),
- next_id.to_usize(),
- )
- };
- transitions.push(line);
- }
- write!(f, "{}", transitions.join(", "))?;
- Ok(())
- }
-}
-
-/// An iterator over all transitions in a single DFA state. This yields
-/// a number of transitions equivalent to the alphabet length of the
-/// corresponding DFA.
-///
-/// Each transition is represented by a tuple. The first element is the input
- /// byte for that transition and the second element is the transition itself.
-#[cfg(feature = "std")]
-#[derive(Debug)]
-pub(crate) struct StateTransitionIter<'a, S: 'a> {
- it: iter::Enumerate<slice::Iter<'a, S>>,
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> Iterator for StateTransitionIter<'a, S> {
- type Item = (u8, S);
-
- fn next(&mut self) -> Option<(u8, S)> {
- self.it.next().map(|(i, &id)| (i as u8, id))
- }
-}
-
-/// An iterator over all transitions in a single DFA state using a sparse
-/// representation.
-///
-/// Each transition is represented by a triple. The first two elements of the
-/// triple comprise an inclusive byte range while the last element corresponds
-/// to the transition taken for all bytes in the range.
-#[cfg(feature = "std")]
-#[derive(Debug)]
-pub(crate) struct StateSparseTransitionIter<'a, S: 'a> {
- dense: StateTransitionIter<'a, S>,
- cur: Option<(u8, u8, S)>,
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> Iterator for StateSparseTransitionIter<'a, S> {
- type Item = (u8, u8, S);
-
- fn next(&mut self) -> Option<(u8, u8, S)> {
- while let Some((b, next)) = self.dense.next() {
- let (prev_start, prev_end, prev_next) = match self.cur {
- Some(t) => t,
- None => {
- self.cur = Some((b, b, next));
- continue;
- }
- };
- if prev_next == next {
- self.cur = Some((prev_start, b, prev_next));
- } else {
- self.cur = Some((b, b, next));
- if prev_next != dead_id() {
- return Some((prev_start, prev_end, prev_next));
- }
- }
- }
- if let Some((start, end, next)) = self.cur.take() {
- if next != dead_id() {
- return Some((start, end, next));
- }
- }
- None
- }
-}
-
-/// A mutable representation of a single DFA state.
-///
- /// `'a` corresponds to the lifetime of a DFA's transition table and `S`
-/// corresponds to the state identifier representation.
-#[cfg(feature = "std")]
-pub(crate) struct StateMut<'a, S: 'a> {
- transitions: &'a mut [S],
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> StateMut<'a, S> {
- /// Return an iterator over all transitions in this state. This yields
- /// a number of transitions equivalent to the alphabet length of the
- /// corresponding DFA.
- ///
- /// Each transition is represented by a tuple. The first element is the
- /// input byte for that transition and the second element is a mutable
- /// reference to the transition itself.
- pub fn iter_mut(&mut self) -> StateTransitionIterMut<S> {
- StateTransitionIterMut { it: self.transitions.iter_mut().enumerate() }
- }
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- fmt::Debug::fmt(&State { transitions: self.transitions }, f)
- }
-}
-
-/// A mutable iterator over all transitions in a DFA state.
-///
-/// Each transition is represented by a tuple. The first element is the
-/// input byte for that transition and the second element is a mutable
-/// reference to the transition itself.
-#[cfg(feature = "std")]
-#[derive(Debug)]
-pub(crate) struct StateTransitionIterMut<'a, S: 'a> {
- it: iter::Enumerate<slice::IterMut<'a, S>>,
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> Iterator for StateTransitionIterMut<'a, S> {
- type Item = (u8, &'a mut S);
-
- fn next(&mut self) -> Option<(u8, &'a mut S)> {
- self.it.next().map(|(i, id)| (i as u8, id))
- }
-}
-
-/// A builder for constructing a deterministic finite automaton from regular
-/// expressions.
-///
-/// This builder permits configuring several aspects of the construction
-/// process such as case insensitivity, Unicode support and various options
-/// that impact the size of the generated DFA. In some cases, options (like
-/// performing DFA minimization) can come with a substantial additional cost.
-///
-/// This builder always constructs a *single* DFA. As such, this builder can
-/// only be used to construct regexes that either detect the presence of a
-/// match or find the end location of a match. A single DFA cannot produce both
-/// the start and end of a match. For that information, use a
-/// [`Regex`](struct.Regex.html), which can be similarly configured using
-/// [`RegexBuilder`](struct.RegexBuilder.html).
-#[cfg(feature = "std")]
-#[derive(Clone, Debug)]
-pub struct Builder {
- parser: ParserBuilder,
- nfa: nfa::Builder,
- anchored: bool,
- minimize: bool,
- premultiply: bool,
- byte_classes: bool,
- reverse: bool,
- longest_match: bool,
-}
-
-#[cfg(feature = "std")]
-impl Builder {
- /// Create a new DenseDFA builder with the default configuration.
- pub fn new() -> Builder {
- let mut nfa = nfa::Builder::new();
- // This is enabled by default, but we set it here anyway. Since we're
- // building a DFA, shrinking the NFA is always a good idea.
- nfa.shrink(true);
- Builder {
- parser: ParserBuilder::new(),
- nfa,
- anchored: false,
- minimize: false,
- premultiply: true,
- byte_classes: true,
- reverse: false,
- longest_match: false,
- }
- }
-
- /// Build a DFA from the given pattern.
- ///
- /// If there was a problem parsing or compiling the pattern, then an error
- /// is returned.
- pub fn build(&self, pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> {
- self.build_with_size::<usize>(pattern)
- }
-
- /// Build a DFA from the given pattern using a specific representation for
- /// the DFA's state IDs.
- ///
- /// If there was a problem parsing or compiling the pattern, then an error
- /// is returned.
- ///
- /// The representation of state IDs is determined by the `S` type
- /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
- /// or `usize`, where `usize` is the default used for `build`. The purpose
- /// of specifying a representation for state IDs is to reduce the memory
- /// footprint of a DFA.
- ///
- /// When using this routine, the chosen state ID representation will be
- /// used throughout determinization and minimization, if minimization
- /// was requested. Even if the minimized DFA would fit into the chosen
- /// state ID representation, this still returns an error if the initial
- /// determinized DFA cannot fit. To get a minimized DFA with a
- /// smaller state ID representation, first build it with a bigger state ID
- /// representation, and then shrink the size of the DFA using one of its
- /// conversion routines, such as
- /// [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
- pub fn build_with_size<S: StateID>(
- &self,
- pattern: &str,
- ) -> Result<DenseDFA<Vec<S>, S>> {
- self.build_from_nfa(&self.build_nfa(pattern)?)
- }
-
- /// An internal only (for now) API for building a dense DFA directly from
- /// an NFA.
- pub(crate) fn build_from_nfa<S: StateID>(
- &self,
- nfa: &NFA,
- ) -> Result<DenseDFA<Vec<S>, S>> {
- if self.longest_match && !self.anchored {
- return Err(Error::unsupported_longest_match());
- }
-
- let mut dfa = if self.byte_classes {
- Determinizer::new(nfa)
- .with_byte_classes()
- .longest_match(self.longest_match)
- .build()
- } else {
- Determinizer::new(nfa).longest_match(self.longest_match).build()
- }?;
- if self.minimize {
- dfa.minimize();
- }
- if self.premultiply {
- dfa.premultiply()?;
- }
- Ok(dfa.into_dense_dfa())
- }
-
- /// Builds an NFA from the given pattern.
- pub(crate) fn build_nfa(&self, pattern: &str) -> Result<NFA> {
- let hir = self.parser.build().parse(pattern).map_err(Error::syntax)?;
- Ok(self.nfa.build(&hir)?)
- }
-
- /// Set whether matching must be anchored at the beginning of the input.
- ///
- /// When enabled, a match must begin at the start of the input. When
- /// disabled, the DFA will act as if the pattern started with a `.*?`,
- /// which enables a match to appear anywhere.
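- ///
- /// For example (illustrative): with anchoring disabled, the pattern `abc`
- /// is found inside `xxabcxx`; with anchoring enabled it is not, since no
- /// match begins at offset 0.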
- ///
- /// By default this is disabled.
- pub fn anchored(&mut self, yes: bool) -> &mut Builder {
- self.anchored = yes;
- self.nfa.anchored(yes);
- self
- }
-
- /// Enable or disable the case insensitive flag by default.
- ///
- /// By default this is disabled. It may alternatively be selectively
- /// enabled in the regular expression itself via the `i` flag.
- pub fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
- self.parser.case_insensitive(yes);
- self
- }
-
- /// Enable verbose mode in the regular expression.
- ///
- /// When enabled, verbose mode permits insignificant whitespace in many
- /// places in the regular expression, as well as comments. Comments are
- /// started using `#` and continue until the end of the line.
- ///
- /// By default, this is disabled. It may be selectively enabled in the
- /// regular expression by using the `x` flag regardless of this setting.
- pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
- self.parser.ignore_whitespace(yes);
- self
- }
-
- /// Enable or disable the "dot matches any character" flag by default.
- ///
- /// By default this is disabled. It may alternatively be selectively
- /// enabled in the regular expression itself via the `s` flag.
- pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
- self.parser.dot_matches_new_line(yes);
- self
- }
-
- /// Enable or disable the "swap greed" flag by default.
- ///
- /// By default this is disabled. It may alternatively be selectively
- /// enabled in the regular expression itself via the `U` flag.
- pub fn swap_greed(&mut self, yes: bool) -> &mut Builder {
- self.parser.swap_greed(yes);
- self
- }
-
- /// Enable or disable the Unicode flag (`u`) by default.
- ///
- /// By default this is **enabled**. It may alternatively be selectively
- /// disabled in the regular expression itself via the `u` flag.
- ///
- /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
- /// default), a regular expression will fail to parse if Unicode mode is
- /// disabled and a sub-expression could possibly match invalid UTF-8.
- pub fn unicode(&mut self, yes: bool) -> &mut Builder {
- self.parser.unicode(yes);
- self
- }
-
- /// When enabled, the builder will permit the construction of a regular
- /// expression that may match invalid UTF-8.
- ///
- /// When disabled (the default), the builder is guaranteed to produce a
- /// regex that will only ever match valid UTF-8 (otherwise, the builder
- /// will return an error).
- pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
- self.parser.allow_invalid_utf8(yes);
- self.nfa.allow_invalid_utf8(yes);
- self
- }
-
- /// Set the nesting limit used for the regular expression parser.
- ///
- /// The nesting limit controls how deep the abstract syntax tree is allowed
- /// to be. If the AST exceeds the given limit (e.g., with too many nested
- /// groups), then an error is returned by the parser.
- ///
- /// The purpose of this limit is to act as a heuristic to prevent stack
- /// overflow when building a finite automaton from a regular expression's
- /// abstract syntax tree. In particular, construction currently uses
- /// recursion. In the future, the implementation may stop using recursion
- /// and this option will no longer be necessary.
- ///
- /// This limit is not checked until the entire AST is parsed. Therefore,
- /// if callers want to put a limit on the amount of heap space used, then
- /// they should impose a limit on the length, in bytes, of the concrete
- /// pattern string. In particular, this is viable since the parser will
- /// limit itself to heap space proportional to the length of the pattern
- /// string.
- ///
- /// Note that a nest limit of `0` will return a nest limit error for most
- /// patterns but not all. For example, a nest limit of `0` permits `a` but
- /// not `ab`, since `ab` requires a concatenation AST item, which results
- /// in a nest depth of `1`. In general, a nest limit is not something that
- /// manifests in an obvious way in the concrete syntax; therefore, it
- /// should not be used in a granular way.
- pub fn nest_limit(&mut self, limit: u32) -> &mut Builder {
- self.parser.nest_limit(limit);
- self
- }
-
- /// Minimize the DFA.
- ///
- /// When enabled, the DFA built will be minimized such that it is as small
- /// as possible.
- ///
- /// Whether one enables minimization or not depends on the types of costs
- /// you're willing to pay and how much you care about its benefits. In
- /// particular, minimization has worst case `O(n * k * log n)` time and `O(k * n)`
- /// space, where `n` is the number of DFA states and `k` is the alphabet
- /// size. In practice, minimization can be quite costly in terms of both
- /// space and time, so it should only be done if you're willing to wait
- /// longer to produce a DFA. In general, you might want a minimal DFA in
- /// the following circumstances:
- ///
- /// 1. You would like to optimize for the size of the automaton. This can
- /// manifest in one of two ways. Firstly, if you're converting the
- /// DFA into Rust code (or a table embedded in the code), then a minimal
- /// DFA will translate into a corresponding reduction in code size, and
- /// thus, also the final compiled binary size. Secondly, if you are
- /// building many DFAs and putting them on the heap, you'll be able to
- /// fit more if they are smaller. Note though that building a minimal
- /// DFA itself requires additional space; you only realize the space
- /// savings once the minimal DFA is constructed (at which point, the
- /// space used for minimization is freed).
- /// 2. You've observed that a smaller DFA results in faster match
- /// performance. Naively, this isn't guaranteed since there is no
- /// inherent difference between matching with a bigger-than-minimal
- /// DFA and a minimal DFA. However, a smaller DFA may make use of your
- /// CPU's cache more efficiently.
- /// 3. You are trying to establish an equivalence between regular
- /// languages. The standard method for this is to build a minimal DFA
- /// for each language and then compare them. If the DFAs are equivalent
- /// (up to state renaming), then the languages are equivalent.
- ///
- /// This option is disabled by default.
- pub fn minimize(&mut self, yes: bool) -> &mut Builder {
- self.minimize = yes;
- self
- }
-
- /// Premultiply state identifiers in the DFA's transition table.
- ///
- /// When enabled, state identifiers are premultiplied to point to their
- /// corresponding row in the DFA's transition table. That is, given the
- /// `i`th state, its corresponding premultiplied identifier is `i * k`
- /// where `k` is the alphabet size of the DFA. (The alphabet size is at
- /// most 256, but is in practice smaller if byte classes is enabled.)
- ///
- /// When state identifiers are not premultiplied, then the identifier of
- /// the `i`th state is `i`.
- ///
- /// The advantage of premultiplying state identifiers is that it saves
- /// a multiplication instruction per byte when searching with the DFA.
- /// This has been observed to lead to a 20% performance benefit in
- /// micro-benchmarks.
- ///
- /// The primary disadvantage of premultiplying state identifiers is
- /// that they require a larger integer size to represent. For example,
- /// if your DFA has 200 states, then its premultiplied form requires
- /// 16 bits to represent every possible state identifier, whereas its
- /// non-premultiplied form only requires 8 bits.
- ///
- /// This option is enabled by default.
- pub fn premultiply(&mut self, yes: bool) -> &mut Builder {
- self.premultiply = yes;
- self
- }
-
- /// Shrink the size of the DFA's alphabet by mapping bytes to their
- /// equivalence classes.
- ///
- /// When enabled, each DFA will use a map from all possible bytes to their
- /// corresponding equivalence class. Each equivalence class represents a
- /// set of bytes that does not discriminate between a match and a non-match
- /// in the DFA. For example, the pattern `[ab]+` has at least two
- /// equivalence classes: a set containing `a` and `b` and a set containing
- /// every byte except for `a` and `b`. `a` and `b` are in the same
- /// equivalence classes because they never discriminate between a match
- /// and a non-match.
- ///
- /// The advantage of this map is that the size of the transition table can
- /// be reduced drastically from `#states * 256 * sizeof(id)` to
- /// `#states * k * sizeof(id)` where `k` is the number of equivalence
- /// classes. As a result, total space usage can decrease substantially.
- /// Moreover, since a smaller alphabet is used, compilation becomes faster
- /// as well.
- ///
- /// The disadvantage of this map is that every byte searched must be
- /// passed through this map before it can be used to determine the next
- /// transition. This has a small match time performance cost.
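- ///
- /// As a hedged arithmetic example: a DFA with 1,000 states and `u16`
- /// state identifiers needs `1000 * 256 * 2 = 512,000` bytes without byte
- /// classes, but only `1000 * 5 * 2 = 10,000` bytes if the pattern induces
- /// just 5 equivalence classes.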
- ///
- /// This option is enabled by default.
- pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
- self.byte_classes = yes;
- self
- }
-
- /// Reverse the DFA.
- ///
- /// A DFA reversal is performed by reversing all of the concatenated
- /// sub-expressions in the original pattern, recursively. The resulting
- /// DFA can be used to match the pattern starting from the end of a string
- /// instead of the beginning of a string.
- ///
- /// Generally speaking, a reversed DFA is most useful for finding the start
- /// of a match, since a single forward DFA is only capable of finding the
- /// end of a match. This start of match handling is done for you
- /// automatically if you build a [`Regex`](struct.Regex.html).
- pub fn reverse(&mut self, yes: bool) -> &mut Builder {
- self.reverse = yes;
- self.nfa.reverse(yes);
- self
- }
-
- /// Find the longest possible match.
- ///
- /// This is distinct from the default leftmost-first match semantics in
- /// that it treats all NFA states as having equivalent priority. In other
- /// words, the longest possible match is always found and it is not
- /// possible to implement non-greedy match semantics when this is set. That
- /// is, `a+` and `a+?` are equivalent when this is enabled.
- ///
- /// In particular, a practical issue with this option at the moment is that
- /// it prevents unanchored searches from working correctly, since
- /// unanchored searches are implemented by prepending a non-greedy `.*?`
- /// to the beginning of the pattern. As stated above, non-greedy match
- /// semantics aren't supported. Therefore, if this option is enabled and
- /// an unanchored search is requested, then building a DFA will return an
- /// error.
- ///
- /// This option is principally useful when building a reverse DFA for
- /// finding the start of a match. If you are building a regex with
- /// [`RegexBuilder`](struct.RegexBuilder.html), then this is handled for
- /// you automatically. The reason why this is necessary for start of match
- /// handling is because we want to find the earliest possible starting
- /// position of a match to satisfy leftmost-first match semantics. When
- /// matching in reverse, this means finding the longest possible match,
- /// hence, this option.
- ///
- /// By default this is disabled.
- pub fn longest_match(&mut self, yes: bool) -> &mut Builder {
- // There is prior art in RE2 that shows how this can support unanchored
- // searches. Instead of treating all NFA states as having equivalent
- // priority, we instead group NFA states into sets, and treat members
- // of each set as having equivalent priority, but having greater
- // priority than all following members of different sets. We then
- // essentially assign a higher priority to everything over the prefix
- // `.*?`.
- self.longest_match = yes;
- self
- }
-
- /// Apply best effort heuristics to shrink the NFA at the expense of more
- /// time/memory.
- ///
- /// This may be exposed in the future, but for now is exported for use in
- /// the `regex-automata-debug` tool.
- #[doc(hidden)]
- pub fn shrink(&mut self, yes: bool) -> &mut Builder {
- self.nfa.shrink(yes);
- self
- }
-}
-
-#[cfg(feature = "std")]
-impl Default for Builder {
- fn default() -> Builder {
- Builder::new()
- }
-}
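-
-// A hedged usage sketch of the builder above (illustrative only; the `?`
-// assumes a surrounding function returning this crate's `Result`):
-//
-//     let dfa = Builder::new()
-//         .anchored(true)
-//         .minimize(true)
-//         .build(r"foo[0-9]+")?;
-//     // The result is a `DenseDFA<Vec<usize>, usize>` ready for searching.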
-
-/// Return the given byte as its escaped string form.
-#[cfg(feature = "std")]
-fn escape(b: u8) -> String {
- use std::ascii;
-
- String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
-}
-
-#[cfg(all(test, feature = "std"))]
-mod tests {
- use super::*;
-
- #[test]
- fn errors_when_converting_to_smaller_dfa() {
- let pattern = r"\w{10}";
- let dfa = Builder::new()
- .byte_classes(false)
- .anchored(true)
- .premultiply(false)
- .build_with_size::<u16>(pattern)
- .unwrap();
- assert!(dfa.to_u8().is_err());
- }
-
- #[test]
- fn errors_when_determinization_would_overflow() {
- let pattern = r"\w{10}";
-
- let mut builder = Builder::new();
- builder.byte_classes(false).anchored(true).premultiply(false);
- // using u16 is fine
- assert!(builder.build_with_size::<u16>(pattern).is_ok());
- // ... but u8 results in overflow (because there are >256 states)
- assert!(builder.build_with_size::<u8>(pattern).is_err());
- }
-
- #[test]
- fn errors_when_premultiply_would_overflow() {
- let pattern = r"[a-z]";
-
- let mut builder = Builder::new();
- builder.byte_classes(false).anchored(true).premultiply(false);
- // without premultiplication is OK
- assert!(builder.build_with_size::<u8>(pattern).is_ok());
- // ... but with premultiplication overflows u8
- builder.premultiply(true);
- assert!(builder.build_with_size::<u8>(pattern).is_err());
- }
-
- // let data = ::std::fs::read_to_string("/usr/share/dict/words").unwrap();
- // let mut words: Vec<&str> = data.lines().collect();
- // println!("{} words", words.len());
- // words.sort_by(|w1, w2| w1.len().cmp(&w2.len()).reverse());
- // let pattern = words.join("|");
- // print_automata_counts(&pattern);
- // print_automata(&pattern);
-
- // print_automata(r"[01]*1[01]{5}");
- // print_automata(r"X(.?){0,8}Y");
- // print_automata_counts(r"\p{alphabetic}");
- // print_automata(r"a*b+|cdefg");
- // print_automata(r"(..)*(...)*");
-
- // let pattern = r"\p{any}*?\p{Other_Uppercase}";
- // let pattern = r"\p{any}*?\w+";
- // print_automata_counts(pattern);
- // print_automata_counts(r"(?-u:\w)");
-
- // let pattern = r"\p{Greek}";
- // let pattern = r"zZzZzZzZzZ";
- // let pattern = grapheme_pattern();
- // let pattern = r"\p{Ideographic}";
- // let pattern = r"\w{10}"; // 51784 --> 41264
- // let pattern = r"\w"; // 5182
- // let pattern = r"a*";
- // print_automata(pattern);
- // let (_, _, dfa) = build_automata(pattern);
-}
diff --git a/vendor/regex-automata/src/determinize.rs b/vendor/regex-automata/src/determinize.rs
deleted file mode 100644
index cf0c28585..000000000
--- a/vendor/regex-automata/src/determinize.rs
+++ /dev/null
@@ -1,286 +0,0 @@
-use std::collections::HashMap;
-use std::mem;
-use std::rc::Rc;
-
-use dense;
-use error::Result;
-use nfa::{self, NFA};
-use sparse_set::SparseSet;
-use state_id::{dead_id, StateID};
-
-type DFARepr<S> = dense::Repr<Vec<S>, S>;
-
-/// A determinizer converts an NFA to a DFA.
-///
-/// This determinizer follows the typical powerset construction, where each
- /// DFA state corresponds to an ordered set of NFA states. In the worst case,
- /// there is one DFA state for every possible subset of NFA states. In practice,
-/// this only happens in certain conditions, typically when there are bounded
-/// repetitions.
-///
-/// The type variable `S` refers to the chosen state identifier representation
-/// used for the DFA.
-///
-/// The lifetime variable `'a` refers to the lifetime of the NFA being
-/// converted to a DFA.
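-///
-/// As a hedged sketch of the powerset construction driven by `build` below
-/// (pseudocode only; `step` stands for following each NFA state's byte
-/// transitions):
-///
-///     start = epsilon_closure(nfa.start)
-///     worklist = [start]
-///     while worklist is not empty:
-///         set = worklist.pop()
-///         for byte in alphabet:
-///             next = epsilon_closure(step(set, byte))
-///             if next is new:
-///                 worklist.push(next)
-///             dfa.add_transition(set, byte, next)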
-#[derive(Debug)]
-pub(crate) struct Determinizer<'a, S: StateID> {
- /// The NFA we're converting into a DFA.
- nfa: &'a NFA,
- /// The DFA we're building.
- dfa: DFARepr<S>,
- /// Each DFA state being built is defined as an *ordered* set of NFA
- /// states, along with a flag indicating whether the state is a match
- /// state or not.
- ///
- /// This is never empty. The first state is always a dummy state such that
- /// a state id == 0 corresponds to a dead state.
- builder_states: Vec<Rc<State>>,
- /// A cache of DFA states that already exist and can be easily looked up
- /// via ordered sets of NFA states.
- cache: HashMap<Rc<State>, S>,
- /// Scratch space for a stack of NFA states to visit, for depth first
- /// visiting without recursion.
- stack: Vec<nfa::StateID>,
- /// Scratch space for storing an ordered sequence of NFA states, for
- /// amortizing allocation.
- scratch_nfa_states: Vec<nfa::StateID>,
- /// Whether to build a DFA that finds the longest possible match.
- longest_match: bool,
-}
-
-/// An intermediate representation for a DFA state during determinization.
-#[derive(Debug, Eq, Hash, PartialEq)]
-struct State {
- /// Whether this state is a match state or not.
- is_match: bool,
- /// An ordered sequence of NFA states that make up this DFA state.
- nfa_states: Vec<nfa::StateID>,
-}
-
-impl<'a, S: StateID> Determinizer<'a, S> {
- /// Create a new determinizer for converting the given NFA to a DFA.
- pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
- let dead = Rc::new(State::dead());
- let mut cache = HashMap::default();
- cache.insert(dead.clone(), dead_id());
-
- Determinizer {
- nfa,
- dfa: DFARepr::empty().anchored(nfa.is_anchored()),
- builder_states: vec![dead],
- cache,
- stack: vec![],
- scratch_nfa_states: vec![],
- longest_match: false,
- }
- }
-
- /// Instruct the determinizer to use equivalence classes as the transition
- /// alphabet instead of all possible byte values.
- pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
- let byte_classes = self.nfa.byte_classes().clone();
- self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
- .anchored(self.nfa.is_anchored());
- self
- }
-
- /// Instruct the determinizer to build a DFA that recognizes the longest
- /// possible match instead of the leftmost first match. This is useful when
- /// constructing reverse DFAs for finding the start of a match.
- pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
- self.longest_match = yes;
- self
- }
-
- /// Build the DFA. If there was a problem constructing the DFA (e.g., if
- /// the chosen state identifier representation is too small), then an error
- /// is returned.
- pub fn build(mut self) -> Result<DFARepr<S>> {
- let representative_bytes: Vec<u8> =
- self.dfa.byte_classes().representatives().collect();
- let mut sparse = self.new_sparse_set();
- let mut uncompiled = vec![self.add_start(&mut sparse)?];
- while let Some(dfa_id) = uncompiled.pop() {
- for &b in &representative_bytes {
- let (next_dfa_id, is_new) =
- self.cached_state(dfa_id, b, &mut sparse)?;
- self.dfa.add_transition(dfa_id, b, next_dfa_id);
- if is_new {
- uncompiled.push(next_dfa_id);
- }
- }
- }
-
- // At this point, we shuffle the matching states in the final DFA to
- // the beginning. This permits a DFA's match loop to detect a match
- // condition by merely inspecting the current state's identifier, and
- // avoids the need for any additional auxiliary storage.
- let is_match: Vec<bool> =
- self.builder_states.iter().map(|s| s.is_match).collect();
- self.dfa.shuffle_match_states(&is_match);
- Ok(self.dfa)
- }
-
- /// Return the identifier for the next DFA state given an existing DFA
- /// state and an input byte. If the next DFA state already exists, then
- /// return its identifier from the cache. Otherwise, build the state, cache
- /// it and return its identifier.
- ///
- /// The given sparse set is used for scratch space. It must have a capacity
- /// equivalent to the total number of NFA states, but its contents are
- /// otherwise unspecified.
- ///
- /// This routine returns a boolean indicating whether a new state was
- /// built. If a new state is built, then the caller needs to add it to its
- /// frontier of uncompiled DFA states to compute transitions for.
- fn cached_state(
- &mut self,
- dfa_id: S,
- b: u8,
- sparse: &mut SparseSet,
- ) -> Result<(S, bool)> {
- sparse.clear();
- // Compute the set of all reachable NFA states, including epsilons.
- self.next(dfa_id, b, sparse);
- // Build a candidate state and check if it has already been built.
- let state = self.new_state(sparse);
- if let Some(&cached_id) = self.cache.get(&state) {
- // Since we have a cached state, put the constructed state's
- // memory back into our scratch space, so that it can be reused.
- let _ =
- mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
- return Ok((cached_id, false));
- }
- // Nothing was in the cache, so add this state to the cache.
- self.add_state(state).map(|s| (s, true))
- }
-
- /// Compute the set of all reachable NFA states, including the full epsilon
- /// closure, from a DFA state for a single byte of input.
- fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
- next_nfa_states.clear();
- for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
- let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
- match *self.nfa.state(nfa_id) {
- nfa::State::Union { .. }
- | nfa::State::Fail
- | nfa::State::Match => {}
- nfa::State::Range { range: ref r } => {
- if r.start <= b && b <= r.end {
- self.epsilon_closure(r.next, next_nfa_states);
- }
- }
- nfa::State::Sparse { ref ranges } => {
- for r in ranges.iter() {
- if r.start > b {
- break;
- } else if r.start <= b && b <= r.end {
- self.epsilon_closure(r.next, next_nfa_states);
- break;
- }
- }
- }
- }
- }
- }
-
- /// Compute the epsilon closure for the given NFA state.
- fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
- if !self.nfa.state(start).is_epsilon() {
- set.insert(start);
- return;
- }
-
- self.stack.push(start);
- while let Some(mut id) = self.stack.pop() {
- loop {
- if set.contains(id) {
- break;
- }
- set.insert(id);
- match *self.nfa.state(id) {
- nfa::State::Range { .. }
- | nfa::State::Sparse { .. }
- | nfa::State::Fail
- | nfa::State::Match => break,
- nfa::State::Union { ref alternates } => {
- id = match alternates.get(0) {
- None => break,
- Some(&id) => id,
- };
- self.stack.extend(alternates[1..].iter().rev());
- }
- }
- }
- }
- }
-
- /// Compute the initial DFA state and return its identifier.
- ///
- /// The sparse set given is used for scratch space, and must have capacity
- /// equal to the total number of NFA states. Its contents are unspecified.
- fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
- sparse.clear();
- self.epsilon_closure(self.nfa.start(), sparse);
- let state = self.new_state(&sparse);
- let id = self.add_state(state)?;
- self.dfa.set_start_state(id);
- Ok(id)
- }
-
- /// Add the given state to the DFA and make it available in the cache.
- ///
- /// The state initially has no transitions. That is, it transitions to the
- /// dead state for all possible inputs.
- fn add_state(&mut self, state: State) -> Result<S> {
- let id = self.dfa.add_empty_state()?;
- let rstate = Rc::new(state);
- self.builder_states.push(rstate.clone());
- self.cache.insert(rstate, id);
- Ok(id)
- }
-
- /// Convert the given set of ordered NFA states to a DFA state.
- fn new_state(&mut self, set: &SparseSet) -> State {
- let mut state = State {
- is_match: false,
- nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
- };
- state.nfa_states.clear();
-
- for &id in set {
- match *self.nfa.state(id) {
- nfa::State::Range { .. } => {
- state.nfa_states.push(id);
- }
- nfa::State::Sparse { .. } => {
- state.nfa_states.push(id);
- }
- nfa::State::Fail => {
- break;
- }
- nfa::State::Match => {
- state.is_match = true;
- if !self.longest_match {
- break;
- }
- }
- nfa::State::Union { .. } => {}
- }
- }
- state
- }
-
- /// Create a new sparse set with enough capacity to hold all NFA states.
- fn new_sparse_set(&self) -> SparseSet {
- SparseSet::new(self.nfa.len())
- }
-}
-
-impl State {
- /// Create a new empty dead state.
- fn dead() -> State {
- State { nfa_states: vec![], is_match: false }
- }
-}
diff --git a/vendor/regex-automata/src/dfa.rs b/vendor/regex-automata/src/dfa.rs
deleted file mode 100644
index 43de3461f..000000000
--- a/vendor/regex-automata/src/dfa.rs
+++ /dev/null
@@ -1,363 +0,0 @@
-use state_id::StateID;
-
-/// A trait describing the interface of a deterministic finite automaton (DFA).
-///
-/// Every DFA has exactly one start state and at least one dead state (which
-/// may be the same, as in the case of an empty DFA). In all cases, a state
-/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
-/// always returns `true`.
-///
-/// Every DFA also has zero or more match states, such that
-/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
-/// a match state.
-///
-/// In general, users of this trait likely will only need to use the search
-/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
-/// methods are lower level and are used for walking the transitions of a DFA
-/// manually. In particular, the aforementioned search routines are implemented
-/// generically in terms of the lower level transition walking routines.
-pub trait DFA {
- /// The representation used for state identifiers in this DFA.
- ///
- /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
- type ID: StateID;
-
- /// Return the identifier of this DFA's start state.
- fn start_state(&self) -> Self::ID;
-
- /// Returns true if and only if the given identifier corresponds to a match
- /// state.
- fn is_match_state(&self, id: Self::ID) -> bool;
-
- /// Returns true if and only if the given identifier corresponds to a dead
- /// state. When a DFA enters a dead state, it is impossible to leave and
- /// thus can never lead to a match.
- fn is_dead_state(&self, id: Self::ID) -> bool;
-
- /// Returns true if and only if the given identifier corresponds to either
- /// a dead state or a match state, such that one of `is_match_state(id)`
- /// or `is_dead_state(id)` must return true.
- ///
- /// Depending on the implementation of the DFA, this routine can be used
- /// to save a branch in the core matching loop. Nevertheless,
- /// `is_match_state(id) || is_dead_state(id)` is always a valid
- /// implementation.
- fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
-
- /// Returns true if and only if this DFA is anchored.
- ///
- /// When a DFA is anchored, it is only allowed to report matches that
- /// start at index `0`.
- fn is_anchored(&self) -> bool;
-
- /// Given the current state that this DFA is in and the next input byte,
- /// this method returns the identifier of the next state. The identifier
- /// returned is always valid, but it may correspond to a dead state.
- fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
-
- /// Like `next_state`, but its implementation may look up the next state
- /// without memory safety checks such as bounds checks. As such, callers
- /// must ensure that the given identifier corresponds to a valid DFA
- /// state. Implementors must, in turn, ensure that this routine is safe
- /// for all valid state identifiers and for all possible `u8` values.
- unsafe fn next_state_unchecked(
- &self,
- current: Self::ID,
- input: u8,
- ) -> Self::ID;
-
- /// Returns true if and only if the given bytes match this DFA.
- ///
- /// This routine may short-circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if a DFA enters
- /// a match state or a dead state, then this routine will return `true` or
- /// `false`, respectively, without inspecting any future input.
- ///
- /// # Example
- ///
- /// This example shows how to use this method with a
- /// [`DenseDFA`](enum.DenseDFA.html).
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
- /// assert_eq!(true, dfa.is_match(b"foo12345bar"));
- /// assert_eq!(false, dfa.is_match(b"foobar"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- #[inline]
- fn is_match(&self, bytes: &[u8]) -> bool {
- self.is_match_at(bytes, 0)
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Example
- ///
- /// This example shows how to use this method with a
- /// [`DenseDFA`](enum.DenseDFA.html).
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = DenseDFA::new("foo[0-9]+")?;
- /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
- ///
- /// // Normally, the end of the leftmost first match here would be 3,
- /// // but the shortest match semantics detect a match earlier.
- /// let dfa = DenseDFA::new("abc|a")?;
- /// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- #[inline]
- fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
- self.shortest_match_at(bytes, 0)
- }
-
- /// Returns the end offset of the longest match. If no match exists,
- /// then `None` is returned.
- ///
- /// Implementors of this trait are not required to implement any particular
- /// match semantics (such as leftmost-first), which are instead manifest in
- /// the DFA's topology itself.
- ///
- /// In particular, this method must continue searching even after it
- /// enters a match state. The search should only terminate once it has
- /// reached the end of the input or when it has entered a dead state. Upon
- /// termination, the position of the last byte seen while still in a match
- /// state is returned.
- ///
- /// # Example
- ///
- /// This example shows how to use this method with a
- /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
- /// "leftmost first" match semantics.
- ///
- /// Leftmost first match semantics corresponds to the match with the
- /// smallest starting offset, but where the end offset is determined by
- /// preferring earlier branches in the original regular expression. For
- /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
- /// will match `Samwise` in `Samwise`.
- ///
- /// Generally speaking, the "leftmost first" match is how most backtracking
- /// regular expressions tend to work. This is in contrast to POSIX-style
- /// regular expressions that yield "leftmost longest" matches. Namely,
- /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
- /// leftmost longest semantics.
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = DenseDFA::new("foo[0-9]+")?;
- /// assert_eq!(Some(8), dfa.find(b"foo12345"));
- ///
- /// // Even though a match is found after reading the first byte (`a`),
- /// // the leftmost first match semantics demand that we find the earliest
- /// // match that prefers earlier parts of the pattern over later parts.
- /// let dfa = DenseDFA::new("abc|a")?;
- /// assert_eq!(Some(3), dfa.find(b"abc"));
- /// # Ok(()) }; example().unwrap()
- /// ```
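- ///
- /// And a minimal sketch of the `Sam|Samwise` comparison described above,
- /// using the same API as the example just shown (leftmost first prefers
- /// the earlier branch):
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = DenseDFA::new("Sam|Samwise")?;
- /// // The `Sam` branch wins, so the match ends at offset 3.
- /// assert_eq!(Some(3), dfa.find(b"Samwise"));
- /// # Ok(()) }; example().unwrap()
- /// ```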
- #[inline]
- fn find(&self, bytes: &[u8]) -> Option<usize> {
- self.find_at(bytes, 0)
- }
-
- /// Returns the start offset of the longest match in reverse, by searching
- /// from the end of the input towards the start of the input. If no match
- /// exists, then `None` is returned. In other words, this has the same
- /// match semantics as `find`, but in reverse.
- ///
- /// # Example
- ///
- /// This example shows how to use this method with a
- /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
- /// is principally useful when used in conjunction with the
- /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
- /// configuration knob. In general, it's unlikely to be correct to use both
- /// `find` and `rfind` with the same DFA since any particular DFA will only
- /// support searching in one direction.
- ///
- /// ```
- /// use regex_automata::{dense, DFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
- /// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- #[inline]
- fn rfind(&self, bytes: &[u8]) -> Option<usize> {
- self.rfind_at(bytes, bytes.len())
- }
-
- /// Returns the same as `is_match`, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, if the DFA is anchored, then
- /// a match can only occur when `start == 0`.
- #[inline]
- fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
- if self.is_anchored() && start > 0 {
- return false;
- }
-
- let mut state = self.start_state();
- if self.is_match_or_dead_state(state) {
- return self.is_match_state(state);
- }
- for &b in bytes[start..].iter() {
- state = unsafe { self.next_state_unchecked(state, b) };
- if self.is_match_or_dead_state(state) {
- return self.is_match_state(state);
- }
- }
- false
- }
-
- /// Returns the same as `shortest_match`, but starts the search at the
- /// given offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, if the DFA is anchored, then
- /// a match can only occur when `start == 0`.
- #[inline]
- fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- if self.is_anchored() && start > 0 {
- return None;
- }
-
- let mut state = self.start_state();
- if self.is_match_or_dead_state(state) {
- return if self.is_dead_state(state) { None } else { Some(start) };
- }
- for (i, &b) in bytes[start..].iter().enumerate() {
- state = unsafe { self.next_state_unchecked(state, b) };
- if self.is_match_or_dead_state(state) {
- return if self.is_dead_state(state) {
- None
- } else {
- Some(start + i + 1)
- };
- }
- }
- None
- }
-
- /// Returns the same as `find`, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, if the DFA is anchored, then
- /// a match can only occur when `start == 0`.
- #[inline]
- fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- if self.is_anchored() && start > 0 {
- return None;
- }
-
- let mut state = self.start_state();
- let mut last_match = if self.is_dead_state(state) {
- return None;
- } else if self.is_match_state(state) {
- Some(start)
- } else {
- None
- };
- for (i, &b) in bytes[start..].iter().enumerate() {
- state = unsafe { self.next_state_unchecked(state, b) };
- if self.is_match_or_dead_state(state) {
- if self.is_dead_state(state) {
- return last_match;
- }
- last_match = Some(start + i + 1);
- }
- }
- last_match
- }
-
- /// Returns the same as `rfind`, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, if the DFA is anchored, then
- /// a match can only occur when `start == bytes.len()`.
- #[inline(never)]
- fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- if self.is_anchored() && start < bytes.len() {
- return None;
- }
-
- let mut state = self.start_state();
- let mut last_match = if self.is_dead_state(state) {
- return None;
- } else if self.is_match_state(state) {
- Some(start)
- } else {
- None
- };
- for (i, &b) in bytes[..start].iter().enumerate().rev() {
- state = unsafe { self.next_state_unchecked(state, b) };
- if self.is_match_or_dead_state(state) {
- if self.is_dead_state(state) {
- return last_match;
- }
- last_match = Some(i);
- }
- }
- last_match
- }
-}
-
-impl<'a, T: DFA> DFA for &'a T {
- type ID = T::ID;
-
- #[inline]
- fn start_state(&self) -> Self::ID {
- (**self).start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: Self::ID) -> bool {
- (**self).is_match_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
- (**self).is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: Self::ID) -> bool {
- (**self).is_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- (**self).is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
- (**self).next_state(current, input)
- }
-
- #[inline]
- unsafe fn next_state_unchecked(
- &self,
- current: Self::ID,
- input: u8,
- ) -> Self::ID {
- (**self).next_state_unchecked(current, input)
- }
-}
diff --git a/vendor/regex-automata/src/dfa/accel.rs b/vendor/regex-automata/src/dfa/accel.rs
new file mode 100644
index 000000000..dbfeb7932
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/accel.rs
@@ -0,0 +1,507 @@
+// This module defines some core types for dealing with accelerated DFA states.
+// Briefly, a DFA state can be "accelerated" if all of its transitions except
+// for a few loop back to itself. This directly implies that the only way out
+// of such a state is if a byte corresponding to one of those non-loopback
+// transitions is found. Such states are often found in simple repetitions in
+// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
+// DFA with regex-cli:
+//
+// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
+// dense::DFA(
+// D 000000:
+// Q 000001:
+// *000002:
+// A 000003: \x00-` => 3, a => 5, b-\xFF => 3
+// >000004: \x00-` => 3, a => 4, b-\xFF => 3
+// 000005: \x00-\xFF => 2, EOI => 2
+// )
+//
+// In particular, state 3 is accelerated (shown via the 'A' indicator) since
+// the only way to leave that state once entered is to see an 'a' byte. If
+// there is a long run of non-'a' bytes, then using something like 'memchr'
+// to find the next 'a' byte can be significantly faster than just using the
+// standard byte-at-a-time state machine.
+//
+// Unfortunately, this optimization rarely applies when Unicode is enabled.
+// For example, patterns like '[^a]' don't actually match any byte that isn't
+// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
+// 'a'. This makes the state machine much more complex---far beyond a single
+// state---and removes the ability to easily accelerate it. (Because if the
+// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
+//
+// In practice, we only consider accelerating states that have 3 or fewer
+// non-loop transitions, partly because more needles yield diminishing
+// returns and partly because that's what the memchr crate supports. The
+// structures below hard-code this assumption and provide (de)serialization
+// APIs for use inside a DFA.
+//
+// And finally, note that there is some trickery involved in making it very
+// fast to not only check whether a state is accelerated at search time, but
+// also to access the bytes to search for to implement the acceleration itself.
+// dfa/special.rs provides more detail, but the short story is that all
+// accelerated states appear contiguously in a DFA. This means we can represent
+// the ID space of all accelerated DFA states with a single range. So given
+// a state ID, we can determine whether it's accelerated via
+//
+// min_accel_id <= id <= max_accel_id
+//
+// And find its corresponding accelerator with:
+//
+// accels.get((id - min_accel_id) / dfa_stride)
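+//
+// Putting those two pieces together, a search loop can use acceleration
+// via something like the following sketch (hypothetical names here; the
+// real search routines live in dfa/search.rs and are more involved):
+//
+//     if min_accel_id <= id && id <= max_accel_id {
+//         // The needles for this state are the bytes that lead out of it.
+//         let needles = accels.needles((id - min_accel_id) / dfa_stride);
+//         // Jump ahead to the next candidate instead of stepping the DFA
+//         // one byte at a time.
+//         if let Some(next) = find_fwd(needles, haystack, at) {
+//             at = next;
+//         }
+//     }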
+
+use core::convert::{TryFrom, TryInto};
+
+#[cfg(feature = "alloc")]
+use alloc::{vec, vec::Vec};
+
+use crate::util::bytes::{self, DeserializeError, Endian, SerializeError};
+
+/// The base type used to represent a collection of accelerators.
+///
+/// While an `Accel` is represented as a fixed size array of bytes, a
+/// *collection* of `Accel`s (called `Accels`) is represented internally as a
+/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
+/// fairly low-risk not-safe code, it lets us remove the need for a second type
+/// parameter in the definition of dense::DFA. (Which really wants everything
+/// to be a slice of u32.)
+type AccelTy = u32;
+
+/// The size of the unit of representation for accelerators.
+///
+/// ACCEL_CAP *must* be a multiple of this size.
+const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
+
+/// The maximum length in bytes that a single Accel can be. This is distinct
+/// from the capacity of an accelerator in that the length represents only the
+/// bytes that should be read.
+const ACCEL_LEN: usize = 4;
+
+/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
+/// multiple of 4 (our ID size) and because it gives us a little wiggle room
+/// if we want to support more accel bytes in the future without a breaking
+/// change.
+///
+/// This MUST be a multiple of ACCEL_TY_SIZE.
+const ACCEL_CAP: usize = 8;
+
+/// Search for between 1 and 3 needle bytes in the given haystack, starting the
+/// search at the given position. If `needles` has a length other than 1-3,
+/// then this panics.
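+///
+/// For example (illustrative only): `find_fwd(b"ab", b"zzzazb", 0)` returns
+/// `Some(3)`, the position of the first needle byte at or after `at`.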
+#[inline(always)]
+pub(crate) fn find_fwd(
+ needles: &[u8],
+ haystack: &[u8],
+ at: usize,
+) -> Option<usize> {
+ let bs = needles;
+ let i = match needles.len() {
+ 1 => memchr::memchr(bs[0], &haystack[at..])?,
+ 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
+ 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
+ 0 => panic!("cannot find with empty needles"),
+ n => panic!("invalid needles length: {}", n),
+ };
+ Some(at + i)
+}
+
+/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
+/// starting the search at the given position. If `needles` has a length other
+/// than 1-3, then this panics.
+#[inline(always)]
+pub(crate) fn find_rev(
+ needles: &[u8],
+ haystack: &[u8],
+ at: usize,
+) -> Option<usize> {
+ let bs = needles;
+ match needles.len() {
+ 1 => memchr::memrchr(bs[0], &haystack[..at]),
+ 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
+ 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
+ 0 => panic!("cannot find with empty needles"),
+ n => panic!("invalid needles length: {}", n),
+ }
+}
+
+/// Represents the accelerators for all accelerated states in a dense DFA.
+///
+/// The `A` type parameter represents the type of the underlying bytes.
+/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
+#[derive(Clone)]
+pub(crate) struct Accels<A> {
+ /// A length-prefixed slice of contiguous accelerators. See the top comment
+ /// in this module for more details on how we can jump from a DFA's state
+ /// ID to an accelerator in this list.
+ ///
+ /// The first 4 bytes always correspond to the number of accelerators
+ /// that follow.
+ accels: A,
+}
+
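+// As an illustration (derived from ACCEL_TY_SIZE and ACCEL_CAP above), the
+// serialized form of a collection with two accelerators looks like this:
+//
+//     [count: u32][len b0 b1 b2 0 0 0 0][len b0 b1 b2 0 0 0 0]
+//      4 bytes     8 bytes (ACCEL_CAP)   8 bytes (ACCEL_CAP)
+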
+#[cfg(feature = "alloc")]
+impl Accels<Vec<AccelTy>> {
+ /// Create an empty sequence of accelerators for a DFA.
+ pub fn empty() -> Accels<Vec<AccelTy>> {
+ Accels { accels: vec![0] }
+ }
+
+ /// Add an accelerator to this sequence.
+ ///
+ /// This adds the accelerator to the end of the sequence and therefore
+ /// should be done in correspondence with its state in the DFA.
+ ///
+ /// This panics if this results in more accelerators than AccelTy::MAX.
+ pub fn add(&mut self, accel: Accel) {
+ self.accels.extend_from_slice(&accel.as_accel_tys());
+ let len = self.len();
+ self.set_len(len + 1);
+ }
+
+ /// Set the number of accelerators in this sequence, which is encoded in
+ /// the first 4 bytes of the underlying bytes.
+ fn set_len(&mut self, new_len: usize) {
+ // The only way an accelerator gets added is if a state exists for
+ // it, and if a state exists, then its index is guaranteed to be
+ // representable by a AccelTy by virtue of the guarantees provided by
+ // StateID.
+ let new_len = AccelTy::try_from(new_len).unwrap();
+ self.accels[0] = new_len;
+ }
+}
+
+impl<'a> Accels<&'a [AccelTy]> {
+ /// Deserialize a sequence of accelerators from the given bytes. If there
+ /// was a problem deserializing, then an error is returned.
+ ///
+ /// This is guaranteed to run in constant time. This does not guarantee
+ /// that every accelerator in the returned collection is valid. Thus,
+ /// accessing one may panic, or not-safe code that relies on accelerators
+ /// being correct my result in UB.
+ ///
+ /// Callers may check the validity of every accelerator with the `validate`
+ /// method.
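+ ///
+ /// A hedged usage sketch (illustrative only, since this type is
+ /// crate-internal): deserialize first, then validate before trusting
+ /// any accelerator:
+ ///
+ /// ```ignore
+ /// let (accels, nread) = unsafe { Accels::from_bytes_unchecked(slice)? };
+ /// accels.validate()?;
+ /// ```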
+ pub unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ let (count, _) =
+ bytes::try_read_u32_as_usize(slice, "accelerators count")?;
+ // The accelerator count is part of the accel_tys slice that
+ // we deserialize. This is perhaps a bit idiosyncratic. It would
+ // probably be better to split out the count into a real field.
+
+ let accel_tys_count = bytes::add(
+ bytes::mul(count, 2, "total number of accelerator accel_tys")?,
+ 1,
+ "total number of accel_tys",
+ )?;
+ let accel_tys_len = bytes::mul(
+ ACCEL_TY_SIZE,
+ accel_tys_count,
+ "total number of bytes in accelerators",
+ )?;
+ bytes::check_slice_len(slice, accel_tys_len, "accelerators")?;
+ bytes::check_alignment::<AccelTy>(slice)?;
+ let accel_tys = &slice[..accel_tys_len];
+ slice = &slice[accel_tys_len..];
+ // SAFETY: We've checked the length and alignment above, and since
+ // slice is just bytes, we can safely cast it to &[AccelTy].
+ #[allow(unused_unsafe)]
+ let accels = unsafe {
+ core::slice::from_raw_parts(
+ accel_tys.as_ptr() as *const AccelTy,
+ accel_tys_count,
+ )
+ };
+ Ok((Accels { accels }, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+impl<A: AsRef<[AccelTy]>> Accels<A> {
+ /// Return an owned version of the accelerators.
+ #[cfg(feature = "alloc")]
+ pub fn to_owned(&self) -> Accels<Vec<AccelTy>> {
+ Accels { accels: self.accels.as_ref().to_vec() }
+ }
+
+ /// Return a borrowed version of the accelerators.
+ pub fn as_ref(&self) -> Accels<&[AccelTy]> {
+ Accels { accels: self.accels.as_ref() }
+ }
+
+ /// Return the bytes representing the serialization of the accelerators.
+ pub fn as_bytes(&self) -> &[u8] {
+ let accels = self.accels.as_ref();
+ // SAFETY: This is safe because accels is a just a slice of AccelTy,
+ // and u8 always has a smaller alignment.
+ unsafe {
+ core::slice::from_raw_parts(
+ accels.as_ptr() as *const u8,
+ accels.len() * ACCEL_TY_SIZE,
+ )
+ }
+ }
+
+ /// Returns the memory usage, in bytes, of these accelerators.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent all of the accelerators.
+ ///
+ /// This does **not** include the stack size used by this value.
+ pub fn memory_usage(&self) -> usize {
+ self.as_bytes().len()
+ }
+
+ /// Return the bytes to search for corresponding to the accelerator in this
+ /// sequence at index `i`. If no such accelerator exists, then this panics.
+ ///
+ /// The significance of the index is that it should be in correspondence
+ /// with the index of the corresponding DFA. That is, accelerated DFA
+ /// states are stored contiguously in the DFA and have an ordering implied
+ /// by their respective state IDs. The state's index in that sequence
+ /// corresponds to the index of its corresponding accelerator.
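+ ///
+ /// For example (illustrative arithmetic only): with ACCEL_TY_SIZE = 4 and
+ /// ACCEL_CAP = 8, the accelerator at index `2` begins at byte offset
+ /// `4 + 2 * 8 = 20`, its length is `bytes[20]` and its needles are
+ /// `&bytes[21..21 + len]`.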
+ #[inline(always)]
+ pub fn needles(&self, i: usize) -> &[u8] {
+ if i >= self.len() {
+ panic!("invalid accelerator index {}", i);
+ }
+ let bytes = self.as_bytes();
+ let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
+ let len = bytes[offset] as usize;
+ &bytes[offset + 1..offset + 1 + len]
+ }
+
+ /// Return the total number of accelerators in this sequence.
+ pub fn len(&self) -> usize {
+ // This should never panic since deserialization checks that the
+ // length can fit into a usize.
+ usize::try_from(self.accels.as_ref()[0]).unwrap()
+ }
+
+ /// Return the accelerator in this sequence at index `i`. If no such
+ /// accelerator exists, then this returns None.
+ ///
+ /// See the docs for `needles` on the significance of the index.
+ fn get(&self, i: usize) -> Option<Accel> {
+ if i >= self.len() {
+ return None;
+ }
+ let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
+ let accel = Accel::from_slice(&self.as_bytes()[offset..])
+ .expect("Accels must contain valid accelerators");
+ Some(accel)
+ }
+
+ /// Returns an iterator of accelerators in this sequence.
+ fn iter(&self) -> IterAccels<'_, A> {
+ IterAccels { accels: self, i: 0 }
+ }
+
+ /// Writes these accelerators to the given byte buffer using the indicated
+ /// endianness. If the given buffer is too small, then an error is
+ /// returned. Upon success, the total number of bytes written is returned.
+ /// The number of bytes written is guaranteed to be a multiple of
+ /// ACCEL_TY_SIZE.
+ pub fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ assert_eq!(
+ nwrite % ACCEL_TY_SIZE,
+ 0,
+ "expected accelerator bytes written to be a multiple of {}",
+ ACCEL_TY_SIZE,
+ );
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("accelerators"));
+ }
+
+ // The number of accelerators can never exceed AccelTy::MAX.
+ E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
+ // The actual accelerators are just raw bytes and thus their endianness
+ // is irrelevant. So we can copy them as bytes.
+ dst[ACCEL_TY_SIZE..nwrite]
+ .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
+ Ok(nwrite)
+ }
+
+ /// Validates that every accelerator in this collection can be successfully
+ /// deserialized as a valid accelerator.
+ pub fn validate(&self) -> Result<(), DeserializeError> {
+ for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
+ let _ = Accel::from_slice(chunk)?;
+ }
+ Ok(())
+ }
+
+ /// Returns the total number of bytes written by `write_to`.
+ pub fn write_to_len(&self) -> usize {
+ self.as_bytes().len()
+ }
+}
+
+impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "Accels(")?;
+ let mut list = f.debug_list();
+ for a in self.iter() {
+ list.entry(&a);
+ }
+ list.finish()?;
+ write!(f, ")")
+ }
+}
+
+#[derive(Debug)]
+struct IterAccels<'a, A: AsRef<[AccelTy]>> {
+ accels: &'a Accels<A>,
+ i: usize,
+}
+
+impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
+ type Item = Accel;
+
+ fn next(&mut self) -> Option<Accel> {
+ let accel = self.accels.get(self.i)?;
+ self.i += 1;
+ Some(accel)
+ }
+}
+
+/// Accel represents a structure for determining how to "accelerate" a DFA
+/// state.
+///
+/// Namely, it contains zero or more bytes that must be seen in order for the
+/// DFA to leave the state it is associated with. In practice, the actual range
+/// is 1 to 3 bytes.
+///
+/// The purpose of acceleration is to identify states whose vast majority
+/// of transitions are just loops back to the same state. For example,
+/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
+/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
+/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
+/// looking for the next occurrence of either `a` or `b` instead of explicitly
+/// following transitions. (In this case, `b` transitions to the next state,
+/// whereas `a` would transition to the dead state.)
+#[derive(Clone)]
+pub(crate) struct Accel {
+ /// The first byte is the length. Subsequent bytes are the accelerated
+ /// bytes.
+ ///
+ /// Note that we make every accelerator 8 bytes as a slightly wasteful
+ /// way of making sure alignment is always correct for state ID sizes of
+ /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
+ /// particularly common, especially when Unicode is enabled.
+ bytes: [u8; ACCEL_CAP],
+}
+
+impl Accel {
+ /// Returns an empty accel, where no bytes are accelerated.
+ #[cfg(feature = "alloc")]
+ pub fn new() -> Accel {
+ Accel { bytes: [0; ACCEL_CAP] }
+ }
+
+ /// Returns a verified accelerator derived from the beginning of the given
+ /// slice.
+ ///
+ /// If the slice is not long enough or contains invalid bytes for an
+ /// accelerator, then this returns an error.
+ pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
+ slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
+ let bytes = slice
+ .try_into()
+ .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
+ Accel::from_bytes(bytes)
+ }
+
+ /// Returns a verified accelerator derived from raw bytes.
+ ///
+ /// If the given bytes are invalid, then this returns an error.
+ fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
+ if bytes[0] as usize >= ACCEL_LEN {
+ return Err(DeserializeError::generic(
+ "accelerator bytes cannot have length more than 3",
+ ));
+ }
+ Ok(Accel::from_bytes_unchecked(bytes))
+ }
+
+ /// Returns an accelerator derived from raw bytes.
+ ///
+ /// This does not check whether the given bytes are valid. Invalid bytes
+ /// cannot sacrifice memory safety, but may result in panics or silent
+ /// logic bugs.
+ fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
+ Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
+ }
+
+ /// Attempts to add the given byte to this accelerator. If the accelerator
+ /// is already full then this returns false. Otherwise, returns true.
+ ///
+ /// If the given byte is already in this accelerator, then it panics.
+ #[cfg(feature = "alloc")]
+ pub fn add(&mut self, byte: u8) -> bool {
+ if self.len() >= 3 {
+ return false;
+ }
+ assert!(
+ !self.contains(byte),
+ "accelerator already contains {:?}",
+ crate::util::DebugByte(byte)
+ );
+ self.bytes[self.len() + 1] = byte;
+ self.bytes[0] += 1;
+ true
+ }
+
+ /// Return the number of bytes in this accelerator.
+ pub fn len(&self) -> usize {
+ self.bytes[0] as usize
+ }
+
+ /// Returns true if and only if there are no bytes in this accelerator.
+ #[cfg(feature = "alloc")]
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the slice of bytes to accelerate.
+ ///
+ /// If this accelerator is empty, then this returns an empty slice.
+ fn needles(&self) -> &[u8] {
+ &self.bytes[1..1 + self.len()]
+ }
+
+ /// Returns true if and only if this accelerator will accelerate the given
+ /// byte.
+ #[cfg(feature = "alloc")]
+ fn contains(&self, byte: u8) -> bool {
+ self.needles().contains(&byte)
+ }
+
+ /// Returns the accelerator bytes as an array of AccelTys.
+ #[cfg(feature = "alloc")]
+ fn as_accel_tys(&self) -> [AccelTy; 2] {
+ assert_eq!(ACCEL_CAP, 8);
+ // These unwraps are OK since ACCEL_CAP is set to 8.
+ let first =
+ AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
+ let second =
+ AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
+ [first, second]
+ }
+}
+
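+// A hedged round-trip sketch of the Accel API above (crate-internal, so
+// shown as a comment rather than a doc test):
+//
+//     let mut accel = Accel::new();
+//     assert!(accel.add(b'a'));
+//     assert!(accel.add(b'b'));
+//     assert_eq!(accel.needles(), &[b'a', b'b'][..]);
+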
+impl core::fmt::Debug for Accel {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "Accel(")?;
+ let mut set = f.debug_set();
+ for &b in self.needles() {
+ set.entry(&crate::util::DebugByte(b));
+ }
+ set.finish()?;
+ write!(f, ")")
+ }
+}
diff --git a/vendor/regex-automata/src/dfa/automaton.rs b/vendor/regex-automata/src/dfa/automaton.rs
new file mode 100644
index 000000000..08bd6722a
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/automaton.rs
@@ -0,0 +1,1903 @@
+use crate::{
+ dfa::search,
+ util::{
+ id::{PatternID, StateID},
+ matchtypes::{HalfMatch, MatchError},
+ prefilter,
+ },
+};
+
+/// A trait describing the interface of a deterministic finite automaton (DFA).
+///
+/// The complexity of this trait probably means that it's unlikely for others
+/// to implement it. The primary purpose of the trait is to provide for a way
+/// of abstracting over different types of DFAs. In this crate, that means
+/// dense DFAs and sparse DFAs. (Dense DFAs are fast but memory hungry,
+/// whereas sparse DFAs are slower but come with a smaller memory footprint. But
+/// they otherwise provide exactly equivalent expressive power.) For example, a
+/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) is generic over this trait.
+///
+/// Normally, a DFA's execution model is very simple. You might have a single
+/// start state, zero or more final or "match" states and a function that
+/// transitions from one state to the next given the next byte of input.
+/// Unfortunately, the interface described by this trait is significantly
+/// more complicated than this. The complexity has a number of different
+/// reasons, mostly motivated by performance, functionality or space savings:
+///
+/// * A DFA can search for multiple patterns simultaneously. This
+/// means extra information is returned when a match occurs. Namely,
+/// a match is not just an offset, but an offset plus a pattern ID.
+/// [`Automaton::pattern_count`] returns the number of patterns compiled into
+/// the DFA, [`Automaton::match_count`] returns the total number of patterns
+/// that match in a particular state and [`Automaton::match_pattern`] permits
+/// iterating over the patterns that match in a particular state.
+/// * A DFA can have multiple start states, and the choice of which start
+/// state to use depends on the content of the string being searched and
+/// position of the search, as well as whether the search is an anchored
+/// search for a specific pattern in the DFA. Moreover, computing the start
+/// state also depends on whether you're doing a forward or a reverse search.
+/// [`Automaton::start_state_forward`] and [`Automaton::start_state_reverse`]
+/// are used to compute the start state for forward and reverse searches,
+/// respectively.
+/// * All matches are delayed by one byte to support things like `$` and `\b`
+/// at the end of a pattern. Therefore, every use of a DFA is required to use
+/// [`Automaton::next_eoi_state`]
+/// at the end of the search to compute the final transition.
+/// * For optimization reasons, some states are treated specially. Every
+/// state is either special or not, which can be determined via the
+/// [`Automaton::is_special_state`] method. If it's special, then the state
+/// must be at least one of a few possible types of states. (Note that some
+/// types can overlap, for example, a match state can also be an accel state.
+/// But some types can't. If a state is a dead state, then it can never be any
+/// other type of state.) Those types are:
+/// * A dead state. A dead state means the DFA will never enter a match
+/// state. This can be queried via the [`Automaton::is_dead_state`] method.
+/// * A quit state. A quit state occurs if the DFA had to stop the search
+/// prematurely for some reason. This can be queried via the
+/// [`Automaton::is_quit_state`] method.
+/// * A match state. A match state occurs when a match is found. When a DFA
+/// enters a match state, the search may stop immediately (when looking
+/// for the earliest match), or it may continue to find the leftmost-first
+/// match. This can be queried via the [`Automaton::is_match_state`]
+/// method.
+/// * A start state. A start state is where a search begins. For every
+/// search, there is exactly one start state that is used, however, a
+/// DFA may contain many start states. When the search is in a start
+/// state, it may use a prefilter to quickly skip to candidate matches
+/// without executing the DFA on every byte. This can be queried via the
+/// [`Automaton::is_start_state`] method.
+/// * An accel state. An accel state is a state that is accelerated.
+/// That is, it is a state where _most_ of its transitions loop back to
+/// itself and only a small number of transitions lead to other states.
+/// This kind of state is said to be accelerated because a search routine
+/// can quickly look for the bytes leading out of the state instead of
+/// continuing to execute the DFA on each byte. This can be queried via the
+/// [`Automaton::is_accel_state`] method. And the bytes that lead out of
+/// the state can be queried via the [`Automaton::accelerator`] method.
+///
+/// There are a number of provided methods on this trait that implement
+/// efficient searching (for forwards and backwards) with a DFA using all of
+/// the above features of this trait. In particular, given the complexity of
+/// all these features, implementing a search routine in this trait is not
+/// straightforward. If you need to do this for specialized reasons, then
+/// it's recommended to look at the source of this crate. It is intentionally
+/// well commented to help with this. With that said, it is possible to
+/// somewhat simplify the search routine. For example, handling accelerated
+/// states is strictly optional, since it is always correct to assume that
+/// `Automaton::is_accel_state` returns false. However, one complex part of
+/// writing a search routine using this trait is handling the 1-byte delay of a
+/// match. That is not optional.
+///
+/// # Safety
+///
+/// This trait is unsafe to implement because DFA searching may rely on the
+/// correctness of the implementation for memory safety. For example, DFA
+/// searching may use explicit bounds check elision, which will in turn rely
+/// on the correctness of every function that returns a state ID.
+///
+/// When implementing this trait, one must uphold the documented correctness
+/// guarantees. Otherwise, undefined behavior may occur.
+pub unsafe trait Automaton {
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input.
+ ///
+ /// Implementations must guarantee that the returned ID is always a valid
+ /// ID when `current` refers to a valid ID. Moreover, the transition
+ /// function must be defined for all possible values of `input`.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid ID.
+ /// However, if the caller provides an invalid ID then this must never
+ /// sacrifice memory safety.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a DFA for a given haystack
+ /// by using the `next_state` method.
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense};
+ ///
+ /// let dfa = dense::DFA::new(r"[a-z]+r")?;
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut state = dfa.start_state_forward(
+ /// None, haystack, 0, haystack.len(),
+ /// );
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// state = dfa.next_state(state, b);
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+ /// // special "EOI" transition at the end of the search.
+ /// state = dfa.next_eoi_state(state);
+ /// assert!(dfa.is_match_state(state));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn next_state(&self, current: StateID, input: u8) -> StateID;
+
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input.
+ ///
+ /// Unlike [`Automaton::next_state`], implementations may make this more
+ /// efficient by assuming that the `current` state ID is valid.
+ /// Typically, this manifests by eliding bounds checks.
+ ///
+ /// # Safety
+ ///
+ /// Callers of this method must guarantee that `current` refers to a valid
+ /// state ID. If `current` is not a valid state ID for this automaton, then
+ /// calling this routine may result in undefined behavior.
+ ///
+ /// If `current` is valid, then implementations must guarantee that the ID
+ /// returned is valid for all possible values of `input`.
+ unsafe fn next_state_unchecked(
+ &self,
+ current: StateID,
+ input: u8,
+ ) -> StateID;
+
+ /// Transitions from the current state to the next state for the special
+ /// EOI symbol.
+ ///
+ /// Implementations must guarantee that the returned ID is always a valid
+ /// ID when `current` refers to a valid ID.
+ ///
+ /// This routine must be called at the end of every search in a correct
+ /// implementation of search. Namely, DFAs in this crate delay matches
+ /// by one byte in order to support look-around operators. Thus, after
+ /// reaching the end of a haystack, a search implementation must follow one
+ /// last EOI transition.
+ ///
+ /// It is best to think of EOI as an additional symbol in the alphabet of
+ /// a DFA that is distinct from every other symbol. That is, the alphabet
+ /// of DFAs in this crate has a logical size of 257 instead of 256, where
+ /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+ /// physical alphabet size may be smaller because of alphabet compression
+ /// via equivalence classes, but EOI is always represented somehow in the
+ /// alphabet.)
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid ID.
+ /// However, if the caller provides an invalid ID then this must never
+ /// sacrifice memory safety.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a DFA for a given haystack,
+ /// and then finishing the search with the final EOI transition.
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense};
+ ///
+ /// let dfa = dense::DFA::new(r"[a-z]+r")?;
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut state = dfa.start_state_forward(
+ /// None, haystack, 0, haystack.len(),
+ /// );
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// state = dfa.next_state(state, b);
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk
+ /// // the special "EOI" transition at the end of the search. Without this
+ /// // final transition, the assert below will fail since the DFA will not
+ /// // have entered a match state yet!
+ /// state = dfa.next_eoi_state(state);
+ /// assert!(dfa.is_match_state(state));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn next_eoi_state(&self, current: StateID) -> StateID;
+
+ /// Return the ID of the start state for this DFA when executing a forward
+ /// search.
+ ///
+ /// Unlike typical DFA implementations, the start state for DFAs in this
+ /// crate is dependent on a few different factors:
+ ///
+ /// * The pattern ID, if present. When the underlying DFA has been compiled
+ /// with multiple patterns _and_ the DFA has been configured to compile
+ /// an anchored start state for each pattern, then a pattern ID may be
+ /// specified to execute an anchored search for that specific pattern.
+ /// If `pattern_id` is invalid or if the DFA doesn't have start states
+ /// compiled for each pattern, then implementations must panic. DFAs in
+ /// this crate can be configured to compile start states for each pattern
+ /// via
+ /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern).
+ /// * When `start > 0`, the byte at index `start - 1` may influence the
+ /// start state if the regex uses `^` or `\b`.
+ /// * Similarly, when `start == 0`, it may influence the start state when
+ /// the regex uses `^` or `\A`.
+ /// * Currently, `end` is unused.
+ /// * Whether the search is a forward or reverse search. This routine can
+ /// only be used for forward searches.
+ ///
+ /// # Panics
+ ///
+ /// Implementations must panic if `start..end` is not a valid sub-slice of
+ /// `bytes`. Implementations must also panic if `pattern_id` is non-None
+ /// and does not refer to a valid pattern, or if the DFA was not compiled
+ /// with anchored start states for each pattern.
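+ ///
+ /// # Example
+ ///
+ /// A minimal sketch (mirroring the other examples in these docs):
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense};
+ ///
+ /// let dfa = dense::DFA::new(r"[a-z]+")?;
+ /// let haystack = b"abc";
+ /// // No specific pattern and no surrounding context.
+ /// let state = dfa.start_state_forward(None, haystack, 0, haystack.len());
+ /// // Start states are never match states, since matches are delayed by
+ /// // one byte.
+ /// assert!(!dfa.is_match_state(state));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```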
+ fn start_state_forward(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> StateID;
+
+ /// Return the ID of the start state for this DFA when executing a reverse
+ /// search.
+ ///
+ /// Unlike typical DFA implementations, the start state for DFAs in this
+ /// crate is dependent on a few different factors:
+ ///
+ /// * The pattern ID, if present. When the underlying DFA has been compiled
+ /// with multiple patterns _and_ the DFA has been configured to compile an
+ /// anchored start state for each pattern, then a pattern ID may be
+ /// specified to execute an anchored search for that specific pattern. If
+ /// `pattern_id` is invalid or if the DFA doesn't have start states compiled
+ /// for each pattern, then implementations must panic. DFAs in this crate
+ /// can be configured to compile start states for each pattern via
+ /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern).
+ /// * When `end < bytes.len()`, the byte at index `end` may influence the
+ /// start state if the regex uses `$` or `\b`.
+ /// * Similarly, when `end == bytes.len()`, it may influence the start
+ /// state when the regex uses `$` or `\z`.
+ /// * Currently, `start` is unused.
+ /// * Whether the search is a forward or reverse search. This routine can
+ /// only be used for reverse searches.
+ ///
+ /// # Panics
+ ///
+ /// Implementations must panic if `start..end` is not a valid sub-slice of
+ /// `bytes`. Implementations must also panic if `pattern_id` is non-None
+ /// and does not refer to a valid pattern, or if the DFA was not compiled
+ /// with anchored start states for each pattern.
+ fn start_state_reverse(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> StateID;
+
+ /// Returns true if and only if the given identifier corresponds to a
+ /// "special" state. A special state is one or more of the following:
+ /// a dead state, a quit state, a match state, a start state or an
+ /// accelerated state.
+ ///
+ /// A correct implementation _may_ always return false for states that
+ /// are either start states or accelerated states, since that information
+ /// is only intended to be used for optimization purposes. Correct
+ /// implementations must return true if the state is a dead, quit or match
+ /// state. This is because search routines using this trait must be able
+ /// to rely on `is_special_state` as an indicator that a state may need
+ /// special treatment. (For example, when a search routine sees a dead
+ /// state, it must terminate.)
+ ///
+ /// This routine permits search implementations to use a single branch to
+ /// check whether a state needs special attention before executing the next
+ /// transition. The example below shows how to do this.
+ ///
+ /// # Example
+ ///
+ /// This example shows how `is_special_state` can be used to implement a
+ /// correct search routine with minimal branching. In particular, this
+ /// search routine implements "leftmost" matching, which means that it
+ /// doesn't immediately stop once a match is found. Instead, it continues
+ /// until it reaches a dead state.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch, MatchError, PatternID,
+ /// };
+ ///
+ /// fn find_leftmost_first<A: Automaton>(
+ /// dfa: &A,
+ /// haystack: &[u8],
+ /// ) -> Result<Option<HalfMatch>, MatchError> {
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack. Note that start states can never
+ /// // be match states (since DFAs in this crate delay matches by 1
+ /// // byte), so we don't need to check if the start state is a match.
+ /// let mut state = dfa.start_state_forward(
+ /// None, haystack, 0, haystack.len(),
+ /// );
+ /// let mut last_match = None;
+ /// // Walk all the bytes in the haystack. We can quit early if we see
+ /// // a dead or a quit state. The former means the automaton will
+ /// // never transition to any other state. The latter means that the
+ /// // automaton entered a condition in which its search failed.
+ /// for (i, &b) in haystack.iter().enumerate() {
+ /// state = dfa.next_state(state, b);
+ /// if dfa.is_special_state(state) {
+ /// if dfa.is_match_state(state) {
+ /// last_match = Some(HalfMatch::new(
+ /// dfa.match_pattern(state, 0),
+ /// i,
+ /// ));
+ /// } else if dfa.is_dead_state(state) {
+ /// return Ok(last_match);
+ /// } else if dfa.is_quit_state(state) {
+ /// // It is possible to enter into a quit state after
+ /// // observing a match has occurred. In that case, we
+ /// // should return the match instead of an error.
+ /// if last_match.is_some() {
+ /// return Ok(last_match);
+ /// }
+ /// return Err(MatchError::Quit { byte: b, offset: i });
+ /// }
+ /// // Implementors may also want to check for start or accel
+ /// // states and handle them differently for performance
+ /// // reasons. But it is not necessary for correctness.
+ /// }
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk
+ /// // the special "EOI" transition at the end of the search.
+ /// state = dfa.next_eoi_state(state);
+ /// if dfa.is_match_state(state) {
+ /// last_match = Some(HalfMatch::new(
+ /// dfa.match_pattern(state, 0),
+ /// haystack.len(),
+ /// ));
+ /// }
+ /// Ok(last_match)
+ /// }
+ ///
+ /// // We use a greedy '+' operator to show how the search doesn't just
+ /// // stop once a match is detected. It continues extending the match.
+ /// // Using '[a-z]+?' would also work as expected and stop the search
+ /// // early. Greediness is built into the automaton.
+ /// let dfa = dense::DFA::new(r"[a-z]+")?;
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 10);
+ ///
+ /// // Here's another example that tests our handling of the special EOI
+ /// // transition. This will fail to find a match if we don't call
+ /// // 'next_eoi_state' at the end of the search since the match isn't
+ /// // found until the final byte in the haystack.
+ /// let dfa = dense::DFA::new(r"[0-9]{4}")?;
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 15);
+ ///
+ /// // And note that our search implementation above automatically works
+ /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
+ /// // the appropriate pattern ID for us.
+ /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 1);
+ /// assert_eq!(mat.offset(), 3);
+ /// let mat = find_leftmost_first(&dfa, &haystack[3..])?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 7);
+ /// let mat = find_leftmost_first(&dfa, &haystack[10..])?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 1);
+ /// assert_eq!(mat.offset(), 5);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn is_special_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a dead
+ /// state. When a DFA enters a dead state, it is impossible to leave. That
+ /// is, every transition on a dead state by definition leads back to the
+ /// same dead state.
+ ///
+ /// In practice, the dead state always corresponds to the identifier `0`.
+ /// Moreover, in practice, there is only one dead state.
+ ///
+ /// The existence of a dead state is not strictly required in the classical
+ /// model of finite state machines, where one generally only cares about
+ /// the question of whether an input sequence matches or not. Dead states
+ /// are not needed to answer that question, since one can immediately quit
+ /// as soon as one enters a final or "match" state. However, we don't just
+ /// care about matches but also care about the location of matches, and
+ /// more specifically, care about semantics like "greedy" matching.
+ ///
+ /// For example, given the pattern `a+` and the input `aaaz`, the dead
+ /// state won't be entered until the state machine reaches `z` in the
+ /// input, at which point, the search routine can quit. But without the
+ /// dead state, the search routine wouldn't know when to quit. In a
+ /// classical representation, the search routine would stop after seeing
+ /// the first `a` (which is when the search would enter a match state). But
+ /// this wouldn't implement "greedy" matching where `a+` matches as many
+ /// `a`'s as possible.
+ ///
+ /// # Example
+ ///
+ /// See the example for [`Automaton::is_special_state`] for how to use this
+ /// method correctly.
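+ ///
+ /// Below is a hedged sketch of the `a+` scenario described above. It
+ /// assumes this crate's `dense::Config::anchored` knob, since an
+ /// unanchored DFA keeps looking for later matches instead of entering a
+ /// dead state:
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense};
+ ///
+ /// let dfa = dense::Builder::new()
+ ///     .configure(dense::Config::new().anchored(true))
+ ///     .build(r"a+")?;
+ /// let haystack = b"aaaz";
+ /// let mut state = dfa.start_state_forward(None, haystack, 0, haystack.len());
+ /// let mut dead_at = None;
+ /// for (i, &b) in haystack.iter().enumerate() {
+ ///     state = dfa.next_state(state, b);
+ ///     if dfa.is_dead_state(state) {
+ ///         dead_at = Some(i);
+ ///         break;
+ ///     }
+ /// }
+ /// // The DFA only dies once it sees `z`.
+ /// assert_eq!(Some(3), dead_at);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```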
+ fn is_dead_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a quit
+ /// state. A quit state is like a dead state (it has no transitions other
+ /// than to itself), except it indicates that the DFA failed to complete
+ /// the search. When this occurs, callers can neither accept nor reject that
+ /// a match occurred.
+ ///
+ /// In practice, the quit state always corresponds to the state immediately
+ /// following the dead state. (Which is not usually represented by `1`,
+ /// since state identifiers are pre-multiplied by the state machine's
+ /// alphabet stride, and the alphabet stride varies between DFAs.)
+ ///
+ /// By default, state machines created by this crate will never enter a
+ /// quit state. Since entering a quit state is the only way for a DFA
+ /// in this crate to fail at search time, it follows that the default
+ /// configuration can never produce a match error. Nevertheless, handling
+ /// quit states is necessary to correctly support all configurations in
+ /// this crate.
+ ///
+ /// The typical way in which a quit state can occur is when heuristic
+ /// support for Unicode word boundaries is enabled via the
+ /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary)
+ /// option. But other options, like the lower level
+ /// [`dense::Config::quit`](crate::dfa::dense::Config::quit)
+ /// configuration, can also result in a quit state being entered. The
+ /// purpose of the quit state is to provide a way to execute a fast DFA
+ /// in common cases while delegating to slower routines when the DFA quits.
+ ///
+ /// The default search implementations provided by this crate will return
+ /// a [`MatchError::Quit`](crate::MatchError::Quit) error when a quit state
+ /// is entered.
+ ///
+ /// # Example
+ ///
+ /// See the example for [`Automaton::is_special_state`] for how to use this
+ /// method correctly.
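+ ///
+ /// Additionally, here is a hedged sketch of entering a quit state,
+ /// assuming the `unicode_word_boundary` knob mentioned above (under that
+ /// configuration, non-ASCII bytes become quit bytes):
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense};
+ ///
+ /// let dfa = dense::Builder::new()
+ ///     .configure(dense::Config::new().unicode_word_boundary(true))
+ ///     .build(r"\bfoo\b")?;
+ /// // The non-ASCII bytes of 'β' force the DFA into a quit state, so the
+ /// // search reports an error instead of a match result.
+ /// assert!(dfa.find_leftmost_fwd("β foo".as_bytes()).is_err());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```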
+ fn is_quit_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a
+ /// match state. A match state is also referred to as a "final" state and
+ /// indicates that a match has been found.
+ ///
+ /// If all you care about is whether a particular pattern matches in the
+ /// input sequence, then a search routine can quit early as soon as the
+ /// machine enters a match state. However, if you're looking for the
+ /// standard "leftmost-first" match location, then search _must_ continue
+ /// until either the end of the input or until the machine enters a dead
+ /// state. (Since either condition implies that no other useful work can
+ /// be done.) Namely, when looking for the location of a match, then
+ /// search implementations should record the most recent location in
+ /// which a match state was entered, but otherwise continue executing the
+ /// search as normal. (The search may even leave the match state.) Once
+ /// the termination condition is reached, the most recently recorded match
+ /// location should be returned.
+ ///
+ /// Finally, one additional power given to match states in this crate
+ /// is that they are always associated with a specific pattern in order
+ /// to support multi-DFAs. See [`Automaton::match_pattern`] for more
+ /// details and an example for how to query the pattern associated with a
+ /// particular match state.
+ ///
+ /// # Example
+ ///
+ /// See the example for [`Automaton::is_special_state`] for how to use this
+ /// method correctly.
+ fn is_match_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a
+ /// start state. A start state is a state in which a DFA begins a search.
+ /// All searches begin in a start state. Moreover, since all matches are
+ /// delayed by one byte, a start state can never be a match state.
+ ///
+ /// The main role of a start state is, as mentioned, to be a starting
+ /// point for a DFA. This starting point is determined via one of
+ /// [`Automaton::start_state_forward`] or
+ /// [`Automaton::start_state_reverse`], depending on whether one is doing
+ /// a forward or a reverse search, respectively.
+ ///
+ /// A secondary use of start states is for prefix acceleration. Namely,
+    /// while executing a search, if one detects that the DFA is in a start
+    /// state, then it may be faster to look for the next occurrence of a
+    /// prefix of the pattern, if one exists. Since all matches must begin
+    /// with that prefix, skipping ahead to its occurrences may be much
+    /// faster than executing the DFA byte by byte.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to implement your own search routine that does
+ /// a prefix search whenever the search enters a start state.
+ ///
+ /// Note that you do not need to implement your own search routine to
+ /// make use of prefilters like this. The search routines provided
+ /// by this crate already implement prefilter support via the
+ /// [`Prefilter`](crate::util::prefilter::Prefilter) trait. The various
+ /// `find_*_at` routines on this trait support the `Prefilter` trait
+ /// through [`Scanner`](crate::util::prefilter::Scanner)s. This example is
+ /// meant to show how you might deal with prefilters in a simplified case
+ /// if you are implementing your own search routine.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// MatchError, PatternID,
+ /// dfa::{Automaton, dense},
+ /// HalfMatch,
+ /// };
+ ///
+ /// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> {
+ /// // Would be faster to use the memchr crate, but this is still
+ /// // faster than running through the DFA.
+ /// slice[at..].iter().position(|&b| b == byte).map(|i| at + i)
+ /// }
+ ///
+ /// fn find_leftmost_first<A: Automaton>(
+ /// dfa: &A,
+ /// haystack: &[u8],
+ /// prefix_byte: Option<u8>,
+ /// ) -> Result<Option<HalfMatch>, MatchError> {
+ /// // See the Automaton::is_special_state example for similar code
+ /// // with more comments.
+ ///
+ /// let mut state = dfa.start_state_forward(
+ /// None, haystack, 0, haystack.len(),
+ /// );
+ /// let mut last_match = None;
+ /// let mut pos = 0;
+ /// while pos < haystack.len() {
+ /// let b = haystack[pos];
+ /// state = dfa.next_state(state, b);
+ /// pos += 1;
+ /// if dfa.is_special_state(state) {
+ /// if dfa.is_match_state(state) {
+ /// last_match = Some(HalfMatch::new(
+ /// dfa.match_pattern(state, 0),
+ /// pos - 1,
+ /// ));
+ /// } else if dfa.is_dead_state(state) {
+ /// return Ok(last_match);
+ /// } else if dfa.is_quit_state(state) {
+ /// // It is possible to enter into a quit state after
+ /// // observing a match has occurred. In that case, we
+ /// // should return the match instead of an error.
+ /// if last_match.is_some() {
+ /// return Ok(last_match);
+ /// }
+ /// return Err(MatchError::Quit {
+ /// byte: b, offset: pos - 1,
+ /// });
+ /// } else if dfa.is_start_state(state) {
+ /// // If we're in a start state and know all matches begin
+ /// // with a particular byte, then we can quickly skip to
+ /// // candidate matches without running the DFA through
+    ///                 // every byte in between.
+ /// if let Some(prefix_byte) = prefix_byte {
+ /// pos = match find_byte(haystack, pos, prefix_byte) {
+ /// Some(pos) => pos,
+ /// None => break,
+ /// };
+ /// }
+ /// }
+ /// }
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk
+ /// // the special "EOI" transition at the end of the search.
+ /// state = dfa.next_eoi_state(state);
+ /// if dfa.is_match_state(state) {
+ /// last_match = Some(HalfMatch::new(
+ /// dfa.match_pattern(state, 0),
+ /// haystack.len(),
+ /// ));
+ /// }
+ /// Ok(last_match)
+ /// }
+ ///
+ /// // In this example, it's obvious that all occurrences of our pattern
+ /// // begin with 'Z', so we pass in 'Z'.
+ /// let dfa = dense::DFA::new(r"Z[a-z]+")?;
+ /// let haystack = "123 foobar Zbaz quux".as_bytes();
+ /// let mat = find_leftmost_first(&dfa, haystack, Some(b'Z'))?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 15);
+ ///
+ /// // But note that we don't need to pass in a prefix byte. If we don't,
+ /// // then the search routine does no acceleration.
+ /// let mat = find_leftmost_first(&dfa, haystack, None)?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 15);
+ ///
+ /// // However, if we pass an incorrect byte, then the prefix search will
+    /// // produce incorrect results.
+ /// assert_eq!(find_leftmost_first(&dfa, haystack, Some(b'X'))?, None);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn is_start_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to an
+ /// accelerated state.
+ ///
+ /// An accelerated state is a special optimization
+ /// trick implemented by this crate. Namely, if
+ /// [`dense::Config::accelerate`](crate::dfa::dense::Config::accelerate) is
+ /// enabled (and it is by default), then DFAs generated by this crate will
+    /// tag states meeting certain characteristics as accelerated. A state
+    /// meets these criteria whenever most of its transitions are
+    /// self-transitions, that is, transitions that loop back to the same
+    /// state. When only a small number of transitions aren't
+    /// self-transitions, it follows that
+ /// there are only a small number of bytes that can cause the DFA to leave
+ /// that state. Thus, there is an opportunity to look for those bytes
+ /// using more optimized routines rather than continuing to run through
+ /// the DFA. This trick is similar to the prefilter idea described in
+ /// the documentation of [`Automaton::is_start_state`] with two main
+ /// differences:
+ ///
+ /// 1. It is more limited since acceleration only applies to single bytes.
+ /// This means states are rarely accelerated when Unicode mode is enabled
+ /// (which is enabled by default).
+ /// 2. It can occur anywhere in the DFA, which increases optimization
+ /// opportunities.
+ ///
+ /// Like the prefilter idea, the main downside (and a possible reason to
+ /// disable it) is that it can lead to worse performance in some cases.
+ /// Namely, if a state is accelerated for very common bytes, then the
+ /// overhead of checking for acceleration and using the more optimized
+ /// routines to look for those bytes can cause overall performance to be
+ /// worse than if acceleration wasn't enabled at all.
+ ///
+ /// A simple example of a regex that has an accelerated state is
+ /// `(?-u)[^a]+a`. Namely, the `[^a]+` sub-expression gets compiled down
+ /// into a single state where all transitions except for `a` loop back to
+ /// itself, and where `a` is the only transition (other than the special
+ /// EOI transition) that goes to some other state. Thus, this state can
+ /// be accelerated and implemented more efficiently by calling an
+ /// optimized routine like `memchr` with `a` as the needle. Notice that
+ /// the `(?-u)` to disable Unicode is necessary here, as without it,
+ /// `[^a]` will match any UTF-8 encoding of any Unicode scalar value other
+ /// than `a`. This more complicated expression compiles down to many DFA
+ /// states and the simple acceleration optimization is no longer available.
+ ///
+ /// Typically, this routine is used to guard calls to
+ /// [`Automaton::accelerator`], which returns the accelerated bytes for
+ /// the specified state.
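+    ///
+    /// # Example
+    ///
+    /// This example shows a simplified search loop that uses this method to
+    /// guard calls to [`Automaton::accelerator`]. It is a minimal sketch and
+    /// not this crate's actual search implementation. The skipping below is
+    /// correct because every byte that is not in the accelerator loops back
+    /// to the accelerated state, so scanning past such bytes leaves the
+    /// current state unchanged:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch, MatchError, SyntaxConfig,
+    /// };
+    ///
+    /// fn find_end<A: Automaton>(
+    ///     dfa: &A,
+    ///     haystack: &[u8],
+    /// ) -> Result<Option<HalfMatch>, MatchError> {
+    ///     let mut state = dfa.start_state_forward(
+    ///         None, haystack, 0, haystack.len(),
+    ///     );
+    ///     let mut last_match = None;
+    ///     let mut at = 0;
+    ///     while at < haystack.len() {
+    ///         let b = haystack[at];
+    ///         state = dfa.next_state(state, b);
+    ///         at += 1;
+    ///         if dfa.is_special_state(state) {
+    ///             if dfa.is_match_state(state) {
+    ///                 last_match = Some(HalfMatch::new(
+    ///                     dfa.match_pattern(state, 0),
+    ///                     at - 1,
+    ///                 ));
+    ///             } else if dfa.is_dead_state(state) {
+    ///                 return Ok(last_match);
+    ///             } else if dfa.is_quit_state(state) {
+    ///                 return Err(MatchError::Quit {
+    ///                     byte: b, offset: at - 1,
+    ///                 });
+    ///             } else if dfa.is_accel_state(state) {
+    ///                 // Skip ahead to the next byte that can leave this
+    ///                 // state. (The 'memchr' crate would be faster here.)
+    ///                 let needles = dfa.accelerator(state);
+    ///                 at = match haystack[at..]
+    ///                     .iter()
+    ///                     .position(|b| needles.contains(b))
+    ///                 {
+    ///                     Some(i) => at + i,
+    ///                     None => break,
+    ///                 };
+    ///             }
+    ///         }
+    ///     }
+    ///     state = dfa.next_eoi_state(state);
+    ///     if dfa.is_match_state(state) {
+    ///         last_match = Some(HalfMatch::new(
+    ///             dfa.match_pattern(state, 0),
+    ///             haystack.len(),
+    ///         ));
+    ///     }
+    ///     Ok(last_match)
+    /// }
+    ///
+    /// // The same configuration as the Automaton::accelerator example, so
+    /// // that '[^abc]+' compiles down to an accelerated state. The result
+    /// // is the same whether or not any state is actually accelerated;
+    /// // acceleration only changes how quickly we get there.
+    /// let dfa = dense::Builder::new()
+    ///     .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build("[^abc]+a")?;
+    /// let m = find_end(&dfa, b"xxxxxxxxa")?.unwrap();
+    /// assert_eq!(m.offset(), 9);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```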
+ fn is_accel_state(&self, id: StateID) -> bool;
+
+ /// Returns the total number of patterns compiled into this DFA.
+ ///
+ /// In the case of a DFA that contains no patterns, this must return `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the pattern count for a DFA that never matches:
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense::DFA};
+ ///
+ /// let dfa: DFA<Vec<u32>> = DFA::never_match()?;
+ /// assert_eq!(dfa.pattern_count(), 0);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And another example for a DFA that matches at every position:
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense::DFA};
+ ///
+ /// let dfa: DFA<Vec<u32>> = DFA::always_match()?;
+ /// assert_eq!(dfa.pattern_count(), 1);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And finally, a DFA that was constructed from multiple patterns:
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense::DFA};
+ ///
+ /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(dfa.pattern_count(), 3);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn pattern_count(&self) -> usize;
+
+ /// Returns the total number of patterns that match in this state.
+ ///
+ /// If the given state is not a match state, then implementations may
+ /// panic.
+ ///
+ /// If the DFA was compiled with one pattern, then this must necessarily
+ /// always return `1` for all match states.
+ ///
+ /// Implementations must guarantee that [`Automaton::match_pattern`] can
+ /// be called with indices up to (but not including) the count returned by
+ /// this routine without panicking.
+ ///
+ /// # Panics
+ ///
+ /// Implementations are permitted to panic if the provided state ID does
+ /// not correspond to a match state.
+ ///
+ /// # Example
+ ///
+ /// This example shows a simple instance of implementing overlapping
+ /// matches. In particular, it shows not only how to determine how many
+ /// patterns have matched in a particular state, but also how to access
+ /// which specific patterns have matched.
+ ///
+ /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All)
+ /// when building the DFA. If we used
+ /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
+ /// instead, then the DFA would not be constructed in a way that supports
+ /// overlapping matches. (It would only report a single pattern that
+ /// matches at any particular point in time.)
+ ///
+ /// Another thing to take note of is the patterns used and the order in
+ /// which the pattern IDs are reported. In the example below, pattern `3`
+ /// is yielded first. Why? Because it corresponds to the match that
+ /// appears first. Namely, the `@` symbol is part of `\S+` but not part
+ /// of any of the other patterns. Since the `\S+` pattern has a match that
+ /// starts to the left of any other pattern, its ID is returned before any
+ /// other.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// MatchKind,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[
+ /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
+ /// ])?;
+ /// let haystack = "@bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut state = dfa.start_state_forward(
+ /// None, haystack, 0, haystack.len(),
+ /// );
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// state = dfa.next_state(state, b);
+ /// }
+ /// state = dfa.next_eoi_state(state);
+ ///
+ /// assert!(dfa.is_match_state(state));
+ /// assert_eq!(dfa.match_count(state), 3);
+ /// // The following calls are guaranteed to not panic since `match_count`
+ /// // returned `3` above.
+ /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3);
+ /// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0);
+ /// assert_eq!(dfa.match_pattern(state, 2).as_usize(), 1);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn match_count(&self, id: StateID) -> usize;
+
+ /// Returns the pattern ID corresponding to the given match index in the
+ /// given state.
+ ///
+ /// See [`Automaton::match_count`] for an example of how to use this
+ /// method correctly. Note that if you know your DFA is compiled with a
+ /// single pattern, then this routine is never necessary since it will
+ /// always return a pattern ID of `0` for an index of `0` when `id`
+ /// corresponds to a match state.
+ ///
+ /// Typically, this routine is used when implementing an overlapping
+ /// search, as the example for `Automaton::match_count` does.
+ ///
+ /// # Panics
+ ///
+ /// If the state ID is not a match state or if the match index is out
+ /// of bounds for the given state, then this routine may either panic
+ /// or produce an incorrect result. If the state ID is correct and the
+ /// match index is correct, then this routine must always produce a valid
+ /// `PatternID`.
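+    ///
+    /// # Example
+    ///
+    /// This example is a small sketch confirming the single-pattern case
+    /// described above: in any match state of a single-pattern DFA, index
+    /// `0` yields pattern `0`.
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, dense::DFA};
+    ///
+    /// let dfa = DFA::new("[a-z]+")?;
+    /// let haystack = "abc".as_bytes();
+    ///
+    /// let mut state = dfa.start_state_forward(
+    ///     None, haystack, 0, haystack.len(),
+    /// );
+    /// for &b in haystack {
+    ///     state = dfa.next_state(state, b);
+    /// }
+    /// // Matches are delayed by one byte, so we must walk the special "EOI"
+    /// // transition to observe the match at the end of the haystack.
+    /// state = dfa.next_eoi_state(state);
+    ///
+    /// assert!(dfa.is_match_state(state));
+    /// assert_eq!(dfa.match_count(state), 1);
+    /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 0);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```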
+ fn match_pattern(&self, id: StateID, index: usize) -> PatternID;
+
+ /// Return a slice of bytes to accelerate for the given state, if possible.
+ ///
+ /// If the given state has no accelerator, then an empty slice must be
+ /// returned. If `Automaton::is_accel_state` returns true for the given
+    /// ID, then this routine _must_ return a non-empty slice. (No
+    /// implementation is required to accelerate any of its states, but any
+    /// state it reports as accelerated must have its bytes available here.)
+ ///
+ /// If the given ID is not a valid state ID for this automaton, then
+ /// implementations may panic or produce incorrect results.
+ ///
+ /// See [`Automaton::is_accel_state`] for more details on state
+ /// acceleration.
+ ///
+ /// By default, this method will always return an empty slice.
+ ///
+ /// # Example
+ ///
+ /// This example shows a contrived case in which we build a regex that we
+ /// know is accelerated and extract the accelerator from a state.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson,
+ /// dfa::{Automaton, dense},
+ /// util::id::StateID,
+ /// SyntaxConfig,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// // We disable Unicode everywhere and permit the regex to match
+ /// // invalid UTF-8. e.g., `[^abc]` matches `\xFF`, which is not valid
+ /// // UTF-8.
+ /// .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+ /// // This makes the implicit `(?s:.)*?` prefix added to the regex
+ /// // match through arbitrary bytes instead of being UTF-8 aware. This
+ /// // isn't necessary to get acceleration to work in this case, but
+ /// // it does make the DFA substantially simpler.
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build("[^abc]+a")?;
+ ///
+ /// // Here we just pluck out the state that we know is accelerated.
+ /// // While the stride calculations are something that can be relied
+ /// // on by callers, the specific position of the accelerated state is
+ /// // implementation defined.
+ /// //
+ /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
+ /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`.
+ /// let id = StateID::new(3 * dfa.stride()).unwrap();
+ /// let accelerator = dfa.accelerator(id);
+ /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
+ /// assert_eq!(accelerator, &[b'a', b'b', b'c']);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn accelerator(&self, _id: StateID) -> &[u8] {
+ &[]
+ }
+
+ /// Executes a forward search and returns the end position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state. This is useful for implementing boolean `is_match`-like
+ /// routines, where as little work is done as possible.
+ ///
+ /// See [`Automaton::find_earliest_fwd_at`] for additional functionality,
+ /// such as providing a prefilter, a specific pattern to match and the
+ /// bounds of the search within the haystack. This routine is meant as
+ /// a convenience for common cases where the additional functionality is
+ /// not needed.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates
+ /// how the position returned might differ from what one might expect when
+ /// executing a traditional leftmost search.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let dfa = dense::DFA::new("foo[0-9]+")?;
+ /// // Normally, the end of the leftmost first match here would be 8,
+ /// // corresponding to the end of the input. But the "earliest" semantics
+    /// // of this routine cause it to stop as soon as a match is known, which
+ /// // occurs once 'foo[0-9]' has matched.
+ /// let expected = HalfMatch::must(0, 4);
+ /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"foo12345")?);
+ ///
+ /// let dfa = dense::DFA::new("abc|a")?;
+ /// // Normally, the end of the leftmost first match here would be 3,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let expected = HalfMatch::must(0, 1);
+ /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"abc")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn find_earliest_fwd(
+ &self,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_earliest_fwd_at(None, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a reverse search and returns the start position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state.
+ ///
+ /// Note that while it is not technically necessary to build a reverse
+ /// automaton to use a reverse search, it is likely that you'll want to do
+ /// so. Namely, the typical use of a reverse search is to find the starting
+ /// location of a match once its end is discovered from a forward search. A
+    /// reverse DFA can be built by configuring the intermediate NFA
+ /// to be reversed via
+ /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates
+ /// how the position returned might differ from what one might expect when
+ /// executing a traditional leftmost reverse search.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson,
+ /// dfa::{Automaton, dense},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("[a-z]+[0-9]+")?;
+    /// // Normally, the start of the leftmost first match here would be 0,
+ /// // corresponding to the beginning of the input. But the "earliest"
+ /// // semantics of this routine cause it to stop as soon as a match is
+ /// // known, which occurs once '[a-z][0-9]+' has matched.
+ /// let expected = HalfMatch::must(0, 2);
+ /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"foo12345")?);
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("abc|c")?;
+    /// // Normally, the start of the leftmost first match here would be 0,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let expected = HalfMatch::must(0, 2);
+ /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"abc")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn find_earliest_rev(
+ &self,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_earliest_rev_at(None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a forward search and returns the end position of the leftmost
+ /// match that is found. If no match exists, then `None` is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Notes for implementors
+ ///
+ /// Implementors of this trait are not required to implement any particular
+ /// match semantics (such as leftmost-first), which are instead manifest in
+ /// the DFA's transitions.
+ ///
+ /// In particular, this method must continue searching even after it enters
+ /// a match state. The search should only terminate once it has reached
+ /// the end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// Since this trait provides an implementation for this method by default,
+ /// it's unlikely that one will need to implement this.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`dense::DFA`](crate::dfa::dense::DFA). By default, a dense DFA uses
+ /// "leftmost first" match semantics.
+ ///
+    /// Leftmost first match semantics correspond to the match with the
+ /// smallest starting offset, but where the end offset is determined by
+ /// preferring earlier branches in the original regular expression. For
+ /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+ /// will match `Samwise` in `Samwise`.
+ ///
+ /// Generally speaking, the "leftmost first" match is how most backtracking
+ /// regular expressions tend to work. This is in contrast to POSIX-style
+ /// regular expressions that yield "leftmost longest" matches. Namely,
+ /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+ /// leftmost longest semantics. (This crate does not currently support
+ /// leftmost longest semantics.)
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let dfa = dense::DFA::new("foo[0-9]+")?;
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+    /// // the leftmost first match semantics demand that we continue the
+    /// // search to find the match that prefers earlier parts of the pattern
+    /// // over later parts.
+ /// let dfa = dense::DFA::new("abc|a")?;
+ /// let expected = HalfMatch::must(0, 3);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn find_leftmost_fwd(
+ &self,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_leftmost_fwd_at(None, None, bytes, 0, bytes.len())
+ }
+
+    /// Executes a reverse search and returns the start position of the
+ /// leftmost match that is found. If no match exists, then `None` is
+ /// returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Notes for implementors
+ ///
+ /// Implementors of this trait are not required to implement any particular
+ /// match semantics (such as leftmost-first), which are instead manifest in
+ /// the DFA's transitions.
+ ///
+ /// In particular, this method must continue searching even after it enters
+ /// a match state. The search should only terminate once it has reached
+ /// the end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// Since this trait provides an implementation for this method by default,
+ /// it's unlikely that one will need to implement this.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this routine
+ /// is principally useful when used in conjunction with the
+ /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+ /// configuration. In general, it's unlikely to be correct to use both
+ /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since any
+ /// particular DFA will only support searching in one direction with
+ /// respect to the pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson,
+ /// dfa::{Automaton, dense},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("foo[0-9]+")?;
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"foo12345")?);
+ ///
+ /// // Even though a match is found after reading the last byte (`c`),
+    /// // the leftmost first match semantics demand that we continue the
+    /// // search to find the match that prefers earlier parts of the pattern
+    /// // over later parts.
+ /// let dfa = dense::Builder::new()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("abc|c")?;
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn find_leftmost_rev(
+ &self,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_leftmost_rev_at(None, bytes, 0, bytes.len())
+ }
+
+ /// Executes an overlapping forward search and returns the end position of
+ /// matches as they are found. If no match exists, then `None` is returned.
+ ///
+    /// This routine is principally useful when searching for multiple
+ /// patterns on inputs where multiple patterns may match the same regions
+ /// of text. In particular, callers must preserve the automaton's search
+ /// state from prior calls so that the implementation knows where the last
+ /// match occurred.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run a basic overlapping search with a
+ /// [`dense::DFA`](crate::dfa::dense::DFA). Notice that we build the
+ /// automaton with a `MatchKind::All` configuration. Overlapping searches
+ /// are unlikely to work as one would expect when using the default
+ /// `MatchKind::LeftmostFirst` match semantics, since leftmost-first
+ /// matching is fundamentally incompatible with overlapping searches.
+ /// Namely, overlapping searches need to report matches as they are seen,
+    /// whereas leftmost-first searches will continue searching even after a
+ /// match has been observed in order to find the conventional end position
+ /// of the match. More concretely, leftmost-first searches use dead states
+ /// to terminate a search after a specific match can no longer be extended.
+ /// Overlapping searches instead do the opposite by continuing the search
+ /// to find totally new matches (potentially of other patterns).
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, OverlappingState, dense},
+ /// HalfMatch,
+ /// MatchKind,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(HalfMatch::must(1, 4));
+ /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first; it is therefore the
+    /// // earlier match and is thus reported first.
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn find_overlapping_fwd(
+ &self,
+ bytes: &[u8],
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_overlapping_fwd_at(None, None, bytes, 0, bytes.len(), state)
+ }
+
+ /// Executes a forward search and returns the end position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state. This is useful for implementing boolean `is_match`-like
+ /// routines, where as little work is done as possible.
+ ///
+ /// This is like [`Automaton::find_earliest_fwd`], except it provides some
+ /// additional control over how the search is executed:
+ ///
+ /// * `pre` is a prefilter scanner that, when given, is used whenever the
+ /// DFA enters its starting state. This is meant to speed up searches where
+ /// one or a small number of literal prefixes are known.
+ /// * `pattern_id` specifies a specific pattern in the DFA to run an
+ /// anchored search for. If not given, then a search for any pattern is
+ /// performed. For DFAs built by this crate,
+ /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern)
+ /// must be enabled to use this functionality.
+ /// * `start` and `end` permit searching a specific region of the haystack
+ /// `bytes`. This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `bytes`. (This is because look-around
+    /// operations such as `\b`, `^` and `$` need to take the surrounding
+    /// context into account, which cannot be done if the haystack doesn't
+    /// contain it.)
+ ///
+ /// The examples below demonstrate each of these additional parameters.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine must panic if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It must also panic if the given haystack range is not valid.
+ ///
+ /// # Example: prefilter
+ ///
+ /// This example shows how to provide a prefilter for a pattern where all
+ /// matches start with a `z` byte.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// util::prefilter::{Candidate, Prefilter, Scanner, State},
+ /// HalfMatch,
+ /// };
+ ///
+ /// #[derive(Debug)]
+ /// pub struct ZPrefilter;
+ ///
+ /// impl Prefilter for ZPrefilter {
+ /// fn next_candidate(
+ /// &self,
+ /// _: &mut State,
+ /// haystack: &[u8],
+ /// at: usize,
+ /// ) -> Candidate {
+ /// // Try changing b'z' to b'q' and observe this test fail since
+ /// // the prefilter will skip right over the match.
+    ///         match haystack[at..].iter().position(|&b| b == b'z') {
+ /// None => Candidate::None,
+ /// Some(i) => Candidate::PossibleStartOfMatch(at + i),
+ /// }
+ /// }
+ ///
+ /// fn heap_bytes(&self) -> usize {
+ /// 0
+ /// }
+ /// }
+ ///
+ /// let dfa = dense::DFA::new("z[0-9]{3}")?;
+ /// let haystack = "foobar z123 q123".as_bytes();
+ /// // A scanner executes a prefilter while tracking some state that helps
+ /// // determine whether a prefilter is still "effective" or not.
+ /// let mut scanner = Scanner::new(&ZPrefilter);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 11));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// Some(&mut scanner),
+ /// None,
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi-DFA that permits searching for
+ /// specific patterns.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch,
+ /// PatternID,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+ /// let haystack = "foo123".as_bytes();
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// None,
+ /// None,
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let expected = Some(HalfMatch::must(1, 6));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// None,
+ /// Some(PatternID::must(1)),
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // N.B. We disable Unicode here so that we use a simple ASCII word
+ /// // boundary. Alternatively, we could enable heuristic support for
+ /// // Unicode word boundaries.
+ /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?;
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about the
+ /// // larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `3` instead of `6`.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// None,
+ /// None,
+ /// &haystack[3..6],
+ /// 0,
+ /// haystack[3..6].len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let got = dfa.find_earliest_fwd_at(
+ /// None,
+ /// None,
+ /// haystack,
+ /// 3,
+ /// 6,
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn find_earliest_fwd_at(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_earliest_fwd(pre, self, pattern_id, bytes, start, end)
+ }
+
+ /// Executes a reverse search and returns the start position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state.
+ ///
+ /// This is like [`Automaton::find_earliest_rev`], except it provides some
+ /// additional control over how the search is executed. See the
+ /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+ /// on the additional parameters along with examples of their usage.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine must panic if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It must also panic if the given haystack range is not valid.
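+    ///
+    /// # Example
+    ///
+    /// This example is a brief sketch showing how the `start` and `end`
+    /// bounds change which match is reported when searching in reverse:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("abc")?;
+    /// let haystack = "abcabc".as_bytes();
+    ///
+    /// // A reverse search scans backwards from 'end', so bounding the
+    /// // search at 6 finds the second occurrence...
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// let got = dfa.find_earliest_rev_at(None, haystack, 0, 6)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // ... while bounding it at 3 finds the first.
+    /// let expected = Some(HalfMatch::must(0, 0));
+    /// let got = dfa.find_earliest_rev_at(None, haystack, 0, 3)?;
+    /// assert_eq!(expected, got);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```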
+ #[inline]
+ fn find_earliest_rev_at(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_earliest_rev(self, pattern_id, bytes, start, end)
+ }
+
+ /// Executes a forward search and returns the end position of the leftmost
+ /// match that is found. If no match exists, then `None` is returned.
+ ///
+ /// This is like [`Automaton::find_leftmost_fwd`], except it provides some
+ /// additional control over how the search is executed. See the
+ /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+ /// on the additional parameters along with examples of their usage.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine must panic if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It must also panic if the given haystack range is not valid.
+ #[inline]
+ fn find_leftmost_fwd_at(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_leftmost_fwd(pre, self, pattern_id, bytes, start, end)
+ }
+
+    /// Executes a reverse search and returns the start position of the
+ /// leftmost match that is found. If no match exists, then `None` is
+ /// returned.
+ ///
+ /// This is like [`Automaton::find_leftmost_rev`], except it provides some
+ /// additional control over how the search is executed. See the
+ /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+ /// on the additional parameters along with examples of their usage.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine must panic if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It must also panic if the given haystack range is not valid.
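+    ///
+    /// # Example
+    ///
+    /// This example is a simplified sketch of the typical use of this
+    /// routine: pairing an anchored reverse DFA with a forward search in
+    /// order to recover the start of a match whose end was found by the
+    /// forward search. The configuration shown here is only one way to set
+    /// this up, and the offsets asserted below assume the default
+    /// leftmost-first match semantics.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    /// };
+    ///
+    /// let fwd = dense::DFA::new(r"[a-z]+[0-9]+")?;
+    /// let rev = dense::Builder::new()
+    ///     // Anchoring the reverse search pins the (reverse) match to the
+    ///     // end position discovered by the forward search.
+    ///     .configure(dense::Config::new().anchored(true))
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build(r"[a-z]+[0-9]+")?;
+    ///
+    /// let haystack = "xyz abc123 def".as_bytes();
+    /// // The forward search finds the end of the leftmost match...
+    /// let end = fwd.find_leftmost_fwd(haystack)?.unwrap();
+    /// assert_eq!(end.offset(), 10);
+    /// // ... and the reverse search, bounded by that end position, finds
+    /// // its start.
+    /// let start = rev.find_leftmost_rev_at(
+    ///     None, haystack, 0, end.offset(),
+    /// )?.unwrap();
+    /// assert_eq!(start.offset(), 4);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```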
+ #[inline]
+ fn find_leftmost_rev_at(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_leftmost_rev(self, pattern_id, bytes, start, end)
+ }
+
+ /// Executes an overlapping forward search and returns the end position of
+ /// matches as they are found. If no match exists, then `None` is returned.
+ ///
+    /// This routine is principally useful when searching for multiple
+ /// patterns on inputs where multiple patterns may match the same regions
+ /// of text. In particular, callers must preserve the automaton's search
+ /// state from prior calls so that the implementation knows where the last
+ /// match occurred.
+ ///
+ /// This is like [`Automaton::find_overlapping_fwd`], except it provides
+ /// some additional control over how the search is executed. See the
+ /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+ /// on the additional parameters along with examples of their usage.
+ ///
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should always be set to the end
+ /// of the last match. If more patterns match at the previous location,
+ /// then they will be immediately returned. (This is tracked by the given
+ /// overlapping state.) Otherwise, the search continues at the starting
+ /// position given.
+ ///
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFAs generated by this crate, this only occurs in a non-default
+ /// configuration where quit bytes are used or Unicode word boundaries are
+ /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine must panic if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It must also panic if the given haystack range is not valid.
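+    ///
+    /// # Example
+    ///
+    /// This example (reusing the patterns from the
+    /// [`Automaton::find_overlapping_fwd`] example) shows the effect of
+    /// resetting the overlapping state as described above: instead of
+    /// yielding the next overlapping match, the search starts over and
+    /// reports the first match again.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, OverlappingState, dense},
+    ///     HalfMatch, MatchKind,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let haystack = "@foo".as_bytes();
+    ///
+    /// let mut state = OverlappingState::start();
+    /// let got = dfa.find_overlapping_fwd_at(
+    ///     None, None, haystack, 0, haystack.len(), &mut state,
+    /// )?;
+    /// assert_eq!(Some(HalfMatch::must(1, 4)), got);
+    ///
+    /// // Keeping 'state' would yield the other pattern matching at the
+    /// // same position. Resetting it instead forgets the previous search,
+    /// // so the same match is reported again.
+    /// state = OverlappingState::start();
+    /// let got = dfa.find_overlapping_fwd_at(
+    ///     None, None, haystack, 0, haystack.len(), &mut state,
+    /// )?;
+    /// assert_eq!(Some(HalfMatch::must(1, 4)), got);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```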
+ #[inline]
+ fn find_overlapping_fwd_at(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_overlapping_fwd(
+ pre, self, pattern_id, bytes, start, end, state,
+ )
+ }
+}
+
+unsafe impl<'a, T: Automaton> Automaton for &'a T {
+ #[inline]
+ fn next_state(&self, current: StateID, input: u8) -> StateID {
+ (**self).next_state(current, input)
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(
+ &self,
+ current: StateID,
+ input: u8,
+ ) -> StateID {
+ (**self).next_state_unchecked(current, input)
+ }
+
+ #[inline]
+ fn next_eoi_state(&self, current: StateID) -> StateID {
+ (**self).next_eoi_state(current)
+ }
+
+ #[inline]
+ fn start_state_forward(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> StateID {
+ (**self).start_state_forward(pattern_id, bytes, start, end)
+ }
+
+ #[inline]
+ fn start_state_reverse(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> StateID {
+ (**self).start_state_reverse(pattern_id, bytes, start, end)
+ }
+
+ #[inline]
+ fn is_special_state(&self, id: StateID) -> bool {
+ (**self).is_special_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: StateID) -> bool {
+ (**self).is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_quit_state(&self, id: StateID) -> bool {
+ (**self).is_quit_state(id)
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: StateID) -> bool {
+ (**self).is_match_state(id)
+ }
+
+ #[inline]
+ fn is_start_state(&self, id: StateID) -> bool {
+ (**self).is_start_state(id)
+ }
+
+ #[inline]
+ fn is_accel_state(&self, id: StateID) -> bool {
+ (**self).is_accel_state(id)
+ }
+
+ #[inline]
+ fn pattern_count(&self) -> usize {
+ (**self).pattern_count()
+ }
+
+ #[inline]
+ fn match_count(&self, id: StateID) -> usize {
+ (**self).match_count(id)
+ }
+
+ #[inline]
+ fn match_pattern(&self, id: StateID, index: usize) -> PatternID {
+ (**self).match_pattern(id, index)
+ }
+
+ #[inline]
+ fn accelerator(&self, id: StateID) -> &[u8] {
+ (**self).accelerator(id)
+ }
+
+ #[inline]
+ fn find_earliest_fwd(
+ &self,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_earliest_fwd(bytes)
+ }
+
+ #[inline]
+ fn find_earliest_rev(
+ &self,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_earliest_rev(bytes)
+ }
+
+ #[inline]
+ fn find_leftmost_fwd(
+ &self,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_leftmost_fwd(bytes)
+ }
+
+ #[inline]
+ fn find_leftmost_rev(
+ &self,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_leftmost_rev(bytes)
+ }
+
+ #[inline]
+ fn find_overlapping_fwd(
+ &self,
+ bytes: &[u8],
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_overlapping_fwd(bytes, state)
+ }
+
+ #[inline]
+ fn find_earliest_fwd_at(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_earliest_fwd_at(pre, pattern_id, bytes, start, end)
+ }
+
+ #[inline]
+ fn find_earliest_rev_at(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_earliest_rev_at(pattern_id, bytes, start, end)
+ }
+
+ #[inline]
+ fn find_leftmost_fwd_at(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_leftmost_fwd_at(pre, pattern_id, bytes, start, end)
+ }
+
+ #[inline]
+ fn find_leftmost_rev_at(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self).find_leftmost_rev_at(pattern_id, bytes, start, end)
+ }
+
+ #[inline]
+ fn find_overlapping_fwd_at(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ (**self)
+ .find_overlapping_fwd_at(pre, pattern_id, bytes, start, end, state)
+ }
+}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides no introspection capabilities. The only thing a caller
+/// can do is construct it and pass it around to permit search routines to use
+/// it to track state.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may result in incorrect results.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+ /// The state ID of the state at which the search was in when the call
+ /// terminated. When this is a match state, `last_match` must be set to a
+ /// non-None value.
+ ///
+ /// A `None` value indicates the start state of the corresponding
+ /// automaton. We cannot use the actual ID, since any one automaton may
+ /// have many start states, and which one is in use depends on several
+ /// search-time factors.
+ id: Option<StateID>,
+ /// Information associated with a match when `id` corresponds to a match
+ /// state.
+ last_match: Option<StateMatch>,
+}
+
+/// Internal state about the last match that occurred. This records both the
+/// offset of the match and the match index.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) struct StateMatch {
+ /// The index into the matching patterns for the current match state.
+ pub(crate) match_index: usize,
+ /// The offset in the haystack at which the match occurred. This is used
+ /// when reporting multiple matches at the same offset. That is, when
+ /// an overlapping search runs, the first thing it checks is whether it's
+ /// already in a match state, and if so, whether there are more patterns
+ /// to report as matches in that state. If so, it increments `match_index`
+ /// and returns the pattern and this offset. Once `match_index` exceeds the
+ /// number of matching patterns in the current state, the search continues.
+ pub(crate) offset: usize,
+}
+
+impl OverlappingState {
+ /// Create a new overlapping state that begins at the start state of any
+ /// automaton.
+ pub fn start() -> OverlappingState {
+ OverlappingState { id: None, last_match: None }
+ }
+
+ pub(crate) fn id(&self) -> Option<StateID> {
+ self.id
+ }
+
+ pub(crate) fn set_id(&mut self, id: StateID) {
+ self.id = Some(id);
+ }
+
+ pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
+ self.last_match.as_mut()
+ }
+
+ pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
+ self.last_match = Some(last_match);
+ }
+}
+
+/// Write a prefix "state" indicator for fmt::Debug impls.
+///
+/// Specifically, this tries to succinctly distinguish the different types of
+/// states: dead states, quit states, accelerated states, start states and
+/// match states. It even accounts for the possible overlap of different
+/// state types.
+pub(crate) fn fmt_state_indicator<A: Automaton>(
+ f: &mut core::fmt::Formatter<'_>,
+ dfa: A,
+ id: StateID,
+) -> core::fmt::Result {
+ if dfa.is_dead_state(id) {
+ write!(f, "D")?;
+ if dfa.is_start_state(id) {
+ write!(f, ">")?;
+ } else {
+ write!(f, " ")?;
+ }
+ } else if dfa.is_quit_state(id) {
+ write!(f, "Q ")?;
+ } else if dfa.is_start_state(id) {
+ if dfa.is_accel_state(id) {
+ write!(f, "A>")?;
+ } else {
+ write!(f, " >")?;
+ }
+ } else if dfa.is_match_state(id) {
+ if dfa.is_accel_state(id) {
+ write!(f, "A*")?;
+ } else {
+ write!(f, " *")?;
+ }
+ } else if dfa.is_accel_state(id) {
+ write!(f, "A ")?;
+ } else {
+ write!(f, " ")?;
+ }
+ Ok(())
+}
diff --git a/vendor/regex-automata/src/dfa/dense.rs b/vendor/regex-automata/src/dfa/dense.rs
new file mode 100644
index 000000000..07c135098
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/dense.rs
@@ -0,0 +1,4470 @@
+/*!
+Types and routines specific to dense DFAs.
+
+This module is the home of [`dense::DFA`](DFA).
+
+This module also contains a [`dense::Builder`](Builder) and a
+[`dense::Config`](Config) for configuring and building a dense DFA.
+*/
+
+#[cfg(feature = "alloc")]
+use core::cmp;
+use core::{convert::TryFrom, fmt, iter, mem::size_of, slice};
+
+#[cfg(feature = "alloc")]
+use alloc::{
+ collections::{BTreeMap, BTreeSet},
+ vec,
+ vec::Vec,
+};
+
+#[cfg(feature = "alloc")]
+use crate::{
+ dfa::{
+ accel::Accel, determinize, error::Error, minimize::Minimizer, sparse,
+ },
+ nfa::thompson,
+ util::alphabet::ByteSet,
+ MatchKind,
+};
+use crate::{
+ dfa::{
+ accel::Accels,
+ automaton::{fmt_state_indicator, Automaton},
+ special::Special,
+ DEAD,
+ },
+ util::{
+ alphabet::{self, ByteClasses},
+ bytes::{self, DeserializeError, Endian, SerializeError},
+ id::{PatternID, StateID},
+ start::Start,
+ },
+};
+
+/// The label that is prepended to a serialized DFA.
+const LABEL: &str = "rust-regex-automata-dfa-dense";
+
+/// The format version of serialized dense DFAs. This version is incremented
+/// whenever the format changes. A change is not necessarily a breaking
+/// change, but the version permits good error messages in the case where a
+/// breaking change is made.
+const VERSION: u32 = 2;
+
+/// The configuration used for compiling a dense DFA.
+///
+/// A dense DFA configuration is a simple data object that is typically used
+/// with [`dense::Builder::configure`](self::Builder::configure).
+///
+/// The default configuration guarantees that a search will _never_ return a
+/// [`MatchError`](crate::MatchError) for any haystack or pattern. Setting a
+/// quit byte with [`Config::quit`] or enabling heuristic support for Unicode
+/// word boundaries with [`Config::unicode_word_boundary`] can in turn cause a
+/// search to return an error. See the corresponding configuration options for
+/// more details on when those error conditions arise.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+ // As with other configuration types in this crate, we put all our knobs
+ // in options so that we can distinguish between "default" and "not set."
+ // This makes it possible to easily combine multiple configurations
+ // without default values overwriting explicitly specified values. See the
+ // 'overwrite' method.
+ //
+ // For docs on the fields below, see the corresponding method setters.
+ anchored: Option<bool>,
+ accelerate: Option<bool>,
+ minimize: Option<bool>,
+ match_kind: Option<MatchKind>,
+ starts_for_each_pattern: Option<bool>,
+ byte_classes: Option<bool>,
+ unicode_word_boundary: Option<bool>,
+ quit: Option<ByteSet>,
+ dfa_size_limit: Option<Option<usize>>,
+ determinize_size_limit: Option<Option<usize>>,
+}
+
+#[cfg(feature = "alloc")]
+impl Config {
+ /// Return a new default dense DFA compiler configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Set whether matching must be anchored at the beginning of the input.
+ ///
+ /// When enabled, a match must begin at the start of a search. When
+ /// disabled, the DFA will act as if the pattern started with a `(?s:.)*?`,
+ /// which enables a match to appear anywhere.
+ ///
+ /// Note that if you want to run both anchored and unanchored
+ /// searches without building multiple automatons, you can enable the
+ /// [`Config::starts_for_each_pattern`] configuration instead. This will
+ /// permit unanchored any-pattern searches and pattern-specific anchored
+ /// searches. See the documentation for that configuration for an example.
+ ///
+ /// By default this is disabled.
+ ///
+ /// **WARNING:** this is subtly different than using a `^` at the start of
+ /// your regex. A `^` forces a regex to match exclusively at the start of
+ /// input, regardless of where you begin your search. In contrast, enabling
+ /// this option will allow your regex to match anywhere in your input,
+ /// but the match must start at the beginning of a search. (Most of the
+ /// higher level convenience search routines make "start of input" and
+ /// "start of search" equivalent, but some routines allow treating these as
+ /// orthogonal.)
+ ///
+ /// For example, consider the haystack `aba` and the following searches:
+ ///
+ /// 1. The regex `^a` is compiled with `anchored=false` and searches
+ /// `aba` starting at position `2`. Since `^` requires the match to
+ /// start at the beginning of the input and `2 > 0`, no match is found.
+ /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
+ /// starting at position `2`. This reports a match at `[2, 3]` since
+ /// the match starts where the search started. Since there is no `^`,
+ /// there is no requirement for the match to start at the beginning of
+ /// the input.
+ /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
+ /// starting at position `1`. Since `b` corresponds to position `1` and
+ /// since the regex is anchored, it finds no match.
+ /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
+ /// starting at position `1`. Since the regex is neither anchored nor
+ /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
+ /// prefix that permits it to match anywhere. Thus, it reports a match
+ /// at `[2, 3]`.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates the differences between an anchored search and
+ /// a pattern that begins with `^` (as described in the above warning
+ /// message).
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ ///
+ /// let haystack = "aba".as_bytes();
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().anchored(false)) // default
+ /// .build(r"^a")?;
+ /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
+ /// // No match is found because 2 is not the beginning of the haystack,
+ /// // which is what ^ requires.
+ /// let expected = None;
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().anchored(true))
+ /// .build(r"a")?;
+ /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
+ /// // An anchored search can still match anywhere in the haystack; it just
+ /// // must begin at the start of the search, which is '2' in this case.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().anchored(true))
+ /// .build(r"a")?;
+ /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
+ /// // No match is found since we start searching at offset 1 which
+ /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
+ /// // is found.
+ /// let expected = None;
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().anchored(false)) // default
+ /// .build(r"a")?;
+ /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
+ /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
+ /// // pattern. Even though the search starts at 'b', the 'match anything'
+ /// // prefix allows the search to match 'a'.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn anchored(mut self, yes: bool) -> Config {
+ self.anchored = Some(yes);
+ self
+ }
+
+ /// Enable state acceleration.
+ ///
+ /// When enabled, DFA construction will analyze each state to determine
+ /// whether it is eligible for simple acceleration. Acceleration typically
+ /// occurs when most of a state's transitions loop back to itself, leaving
+ /// only a select few bytes that will exit the state. When this occurs,
+ /// other routines like `memchr` can be used to look for those bytes which
+ /// may be much faster than traversing the DFA.
+ ///
+ /// Callers may elect to disable this if consistent performance is more
+ /// desirable than variable performance. Namely, acceleration can sometimes
+ /// make searching slower than it otherwise would be if the transitions
+ /// that leave accelerated states are traversed frequently.
+ ///
+ /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for
+ /// an example.
+ ///
+ /// This is enabled by default.
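+ ///
+ /// # Example
+ ///
+ /// This example is only a sketch: acceleration is an internal
+ /// optimization, so disabling it may change performance characteristics
+ /// but never the result of a search.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().accelerate(false))
+ /// .build(r"[a-z]+")?;
+ /// let expected = Some(HalfMatch::must(0, 9));
+ /// assert_eq!(expected, dfa.find_leftmost_fwd(b"123foobar")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```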
+ pub fn accelerate(mut self, yes: bool) -> Config {
+ self.accelerate = Some(yes);
+ self
+ }
+
+ /// Minimize the DFA.
+ ///
+ /// When enabled, the DFA built will be minimized such that it is as small
+ /// as possible.
+ ///
+ /// Whether one enables minimization or not depends on the types of costs
+ /// you're willing to pay and how much you care about its benefits. In
+ /// particular, minimization has worst case `O(n * k * log n)` time and `O(k * n)`
+ /// space, where `n` is the number of DFA states and `k` is the alphabet
+ /// size. In practice, minimization can be quite costly in terms of both
+ /// space and time, so it should only be done if you're willing to wait
+ /// longer to produce a DFA. In general, you might want a minimal DFA in
+ /// the following circumstances:
+ ///
+ /// 1. You would like to optimize for the size of the automaton. This can
+ /// manifest in one of two ways. Firstly, if you're converting the
+ /// DFA into Rust code (or a table embedded in the code), then a minimal
+ /// DFA will translate into a corresponding reduction in code size, and
+ /// thus, also the final compiled binary size. Secondly, if you are
+ /// building many DFAs and putting them on the heap, you'll be able to
+ /// fit more if they are smaller. Note though that building a minimal
+ /// DFA itself requires additional space; you only realize the space
+ /// savings once the minimal DFA is constructed (at which point, the
+ /// space used for minimization is freed).
+ /// 2. You've observed that a smaller DFA results in faster match
+ /// performance. Naively, this isn't guaranteed since there is no
+ /// inherent difference between matching with a bigger-than-minimal
+ /// DFA and a minimal DFA. However, a smaller DFA may make use of your
+ /// CPU's cache more efficiently.
+ /// 3. You are trying to establish an equivalence between regular
+ /// languages. The standard method for this is to build a minimal DFA
+ /// for each language and then compare them. If the DFAs are equivalent
+ /// (up to state renaming), then the languages are equivalent.
+ ///
+ /// Typically, minimization only makes sense as an offline process. That
+ /// is, one might minimize a DFA before serializing it to persistent
+ /// storage. In practical terms, minimization can take around an order of
+ /// magnitude more time than compiling the initial DFA via determinization.
+ ///
+ /// This option is disabled by default.
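+ ///
+ /// # Example
+ ///
+ /// This example is a sketch comparing the heap usage of a minimized DFA
+ /// with an unminimized one. Exact sizes aren't stable across releases,
+ /// so we only assert that minimization never yields a bigger DFA.
+ ///
+ /// ```
+ /// use regex_automata::dfa::dense;
+ ///
+ /// let unminimized = dense::Builder::new()
+ /// .configure(dense::Config::new().minimize(false))
+ /// .build(r"foo[0-9]+bar")?;
+ /// let minimized = dense::Builder::new()
+ /// .configure(dense::Config::new().minimize(true))
+ /// .build(r"foo[0-9]+bar")?;
+ /// // A minimal DFA has no more states than its unminimized counterpart.
+ /// assert!(minimized.memory_usage() <= unminimized.memory_usage());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```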
+ pub fn minimize(mut self, yes: bool) -> Config {
+ self.minimize = Some(yes);
+ self
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+ /// match semantics of Perl-like regex engines. That is, when multiple
+ /// patterns would match at the same leftmost position, the pattern that
+ /// appears first in the concrete syntax is chosen.
+ ///
+ /// Currently, the only other kind of match semantics supported is
+ /// [`MatchKind::All`]. This corresponds to classical DFA construction
+ /// where all possible matches are added to the DFA.
+ ///
+ /// Typically, `All` is used when one wants to execute an overlapping
+ /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+ /// sense to use `All` with the various "leftmost" find routines, since the
+ /// leftmost routines depend on the `LeftmostFirst` automata construction
+ /// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA
+ /// as a way to terminate the search and report a match. `LeftmostFirst`
+ /// also supports non-greedy matches using this strategy, whereas `All`
+ /// does not.
+ ///
+ /// # Example: overlapping search
+ ///
+ /// This example shows the typical use of `MatchKind::All`, which is to
+ /// report overlapping matches.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, OverlappingState, dense},
+ /// HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(HalfMatch::must(1, 4));
+ /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+ /// // pattern begins its match before the first; it is therefore an
+ /// // earlier match and is thus reported first.
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: reverse automaton to find start of match
+ ///
+ /// Another example for using `MatchKind::All` is for constructing a
+ /// reverse automaton to find the start of a match. `All` semantics are
+ /// used for this in order to find the longest possible match, which
+ /// corresponds to the leftmost starting position.
+ ///
+ /// Note that if you need the starting position then
+ /// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for
+ /// you, so it's usually not necessary to do this yourself.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// nfa::thompson,
+ /// HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let haystack = "123foobar456".as_bytes();
+ /// let pattern = r"[a-z]+";
+ ///
+ /// let dfa_fwd = dense::DFA::new(pattern)?;
+ /// let dfa_rev = dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .anchored(true)
+ /// .match_kind(MatchKind::All)
+ /// )
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build(pattern)?;
+ /// let expected_fwd = HalfMatch::must(0, 9);
+ /// let expected_rev = HalfMatch::must(0, 3);
+ /// let got_fwd = dfa_fwd.find_leftmost_fwd(haystack)?.unwrap();
+ /// // Here we don't specify the pattern to search for since there's only
+ /// // one pattern and we're doing a leftmost search. But if this were an
+ /// // overlapping search, you'd need to specify the pattern that matched
+ /// // in the forward direction. (Otherwise, you might wind up finding the
+ /// // starting position of a match of some other pattern.) That in turn
+ /// // requires building the reverse automaton with starts_for_each_pattern
+ /// // enabled. Indeed, this is what Regex does internally.
+ /// let got_rev = dfa_rev.find_leftmost_rev_at(
+ /// None, haystack, 0, got_fwd.offset(),
+ /// )?.unwrap();
+ /// assert_eq!(expected_fwd, got_fwd);
+ /// assert_eq!(expected_rev, got_rev);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn match_kind(mut self, kind: MatchKind) -> Config {
+ self.match_kind = Some(kind);
+ self
+ }
+
+ /// Whether to compile a separate start state for each pattern in the
+ /// automaton.
+ ///
+ /// When enabled, a separate **anchored** start state is added for each
+ /// pattern in the DFA. When this start state is used, then the DFA will
+ /// only search for matches for the pattern specified, even if there are
+ /// other patterns in the DFA.
+ ///
+ /// The main downside of this option is that it can potentially increase
+ /// the size of the DFA and/or increase the time it takes to build the DFA.
+ ///
+ /// There are a few reasons one might want to enable this (it's disabled
+ /// by default):
+ ///
+ /// 1. When looking for the start of an overlapping match (using a
+ /// reverse DFA), doing it correctly requires starting the reverse search
+ /// using the starting state of the pattern that matched in the forward
+ /// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex),
+ /// it will automatically enable this option when building the reverse DFA
+ /// internally.
+ /// 2. When you want to use a DFA with multiple patterns to both search
+ /// for matches of any pattern or to search for anchored matches of one
+ /// particular pattern while using the same DFA. (Otherwise, you would need
+ /// to compile a new DFA for each pattern.)
+ /// 3. Since the start states added for each pattern are anchored, if you
+ /// compile an unanchored DFA with one pattern while also enabling this
+ /// option, then you can use the same DFA to perform anchored or unanchored
+ /// searches. The latter you get with the standard search APIs. The former
+ /// you get from the various `_at` search methods that allow you to
+ /// specify a pattern ID to search for.
+ ///
+ /// By default this is disabled.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this option to permit the same DFA to
+ /// run both anchored and unanchored searches for a single pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch, PatternID,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().starts_for_each_pattern(true))
+ /// .build(r"foo[0-9]+")?;
+ /// let haystack = b"quux foo123";
+ ///
+ /// // Here's a normal unanchored search. Notice that we use 'None' for the
+ /// // pattern ID. Since the DFA was built as an unanchored machine, it
+ /// // uses its default unanchored starting state.
+ /// let expected = HalfMatch::must(0, 11);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+ /// None, None, haystack, 0, haystack.len(),
+ /// )?);
+ /// // But now if we explicitly specify the pattern to search ('0' being
+ /// // the only pattern in the DFA), then it will use the starting state
+ /// // for that specific pattern, which is always anchored. Since the
+ /// // pattern doesn't have a match at the beginning of the haystack, we
+ /// // find nothing.
+ /// assert_eq!(None, dfa.find_leftmost_fwd_at(
+ /// None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
+ /// )?);
+ /// // And finally, an anchored search is not the same as putting a '^' at
+ /// // the beginning of the pattern. An anchored search can only match at
+ /// // the beginning of the *search*, which we can change:
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+ /// None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
+ /// )?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
+ self.starts_for_each_pattern = Some(yes);
+ self
+ }
+
+ /// Whether to attempt to shrink the size of the DFA's alphabet or not.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging a generated DFA.
+ ///
+ /// When enabled, the DFA will use a map from all possible bytes to their
+ /// corresponding equivalence class. Each equivalence class represents a
+ /// set of bytes that does not discriminate between a match and a non-match
+ /// in the DFA. For example, the pattern `[ab]+` has at least two
+ /// equivalence classes: a set containing `a` and `b` and a set containing
+ /// every byte except for `a` and `b`. `a` and `b` are in the same
+ /// equivalence classes because they never discriminate between a match
+ /// and a non-match.
+ ///
+ /// The advantage of this map is that the size of the transition table
+ /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to
+ /// `#states * k * sizeof(StateID)` where `k` is the number of equivalence
+ /// classes (rounded up to the nearest power of 2). As a result, total
+ /// space usage can decrease substantially. Moreover, since a smaller
+ /// alphabet is used, DFA compilation becomes faster as well.
+ ///
+ /// **WARNING:** This is only useful for debugging DFAs. Disabling this
+ /// does not yield any speed advantages. Namely, even when this is
+ /// disabled, a byte class map is still used while searching. The only
+ /// difference is that every byte will be forced into its own distinct
+ /// equivalence class. This is useful for debugging the actual generated
+ /// transitions because it lets one see the transitions defined on actual
+ /// bytes instead of the equivalence classes.
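+ ///
+ /// # Example
+ ///
+ /// This example is a sketch of the effect this option has on the DFA's
+ /// alphabet: with byte classes disabled, every byte (plus the special EOI
+ /// sentinel) gets its own equivalence class, so the alphabet is as big as
+ /// it can possibly be.
+ ///
+ /// ```
+ /// use regex_automata::dfa::dense;
+ ///
+ /// let with_classes = dense::DFA::new(r"[a-z]+")?;
+ /// let without_classes = dense::Builder::new()
+ /// .configure(dense::Config::new().byte_classes(false))
+ /// .build(r"[a-z]+")?;
+ /// // Grouping bytes into equivalence classes yields a much smaller
+ /// // alphabet than one class per byte.
+ /// assert!(with_classes.alphabet_len() < without_classes.alphabet_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```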
+ pub fn byte_classes(mut self, yes: bool) -> Config {
+ self.byte_classes = Some(yes);
+ self
+ }
+
+ /// Heuristically enable Unicode word boundaries.
+ ///
+ /// When set, this will attempt to implement Unicode word boundaries as if
+ /// they were ASCII word boundaries. This only works when the search input
+ /// is ASCII only. If a non-ASCII byte is observed while searching, then a
+ /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
+ ///
+ /// A possible alternative to enabling this option is to simply use an
+ /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
+ /// option is if you absolutely need Unicode support. This option lets one
+ /// use a fast search implementation (a DFA) for some potentially very
+ /// common cases, while providing the option to fall back to some other
+ /// regex engine to handle the general case when an error is returned.
+ ///
+ /// If the pattern provided has no Unicode word boundary in it, then this
+ /// option has no effect. (That is, quitting on a non-ASCII byte only
+ /// occurs when this option is enabled _and_ a Unicode word boundary is
+ /// present in the pattern.)
+ ///
+ /// This is almost equivalent to setting all non-ASCII bytes to be quit
+ /// bytes. The only difference is that this will cause non-ASCII bytes to
+ /// be quit bytes _only_ when a Unicode word boundary is present in the
+ /// pattern.
+ ///
+ /// When enabling this option, callers _must_ be prepared to handle
+ /// a [`MatchError`](crate::MatchError) error during search.
+ /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds
+ /// to using the `try_` suite of methods. Alternatively, if
+ /// callers can guarantee that their input is ASCII only, then a
+ /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
+ /// returned while searching.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to heuristically enable Unicode word boundaries
+ /// in a pattern. It also shows what happens when a search comes across a
+ /// non-ASCII byte.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch, MatchError, MatchKind,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().unicode_word_boundary(true))
+ /// .build(r"\b[0-9]+\b")?;
+ ///
+ /// // The match occurs before the search ever observes the snowman
+ /// // character, so no error occurs.
+ /// let haystack = "foo 123 ☃".as_bytes();
+ /// let expected = Some(HalfMatch::must(0, 7));
+ /// let got = dfa.find_leftmost_fwd(haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // Notice that this search fails, even though the snowman character
+ /// // occurs after the ending match offset. This is because search
+ /// // routines read one byte past the end of the search to account for
+ /// // look-around, and indeed, this is required here to determine whether
+ /// // the trailing \b matches.
+ /// let haystack = "foo 123☃".as_bytes();
+ /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
+ /// let got = dfa.find_leftmost_fwd(haystack);
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
+ // We have a separate option for this instead of just setting the
+ // appropriate quit bytes here because we don't want to set quit bytes
+ // for every regex. We only want to set them when the regex contains a
+ // Unicode word boundary.
+ self.unicode_word_boundary = Some(yes);
+ self
+ }
+
+ /// Add a "quit" byte to the DFA.
+ ///
+ /// When a quit byte is seen during search time, then search will return
+ /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
+ /// offset at which the search stopped.
+ ///
+ /// A quit byte will always overrule any other aspects of a regex. For
+ /// example, if the `x` byte is added as a quit byte and the regex `\w` is
+ /// used, then observing `x` will cause the search to quit immediately
+ /// despite the fact that `x` is in the `\w` class.
+ ///
+ /// This mechanism is primarily useful for heuristically enabling certain
+ /// features like Unicode word boundaries in a DFA. Namely, if the input
+ /// to search is ASCII, then a Unicode word boundary can be implemented
+ /// via an ASCII word boundary with no change in semantics. Thus, a DFA
+ /// can attempt to match a Unicode word boundary but give up as soon as it
+ /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
+ /// to be quit bytes, then Unicode word boundaries will be permitted when
+ /// building DFAs. Of course, callers should enable
+ /// [`Config::unicode_word_boundary`] if they want this behavior instead.
+ /// (The advantage being that non-ASCII quit bytes will only be added if a
+ /// Unicode word boundary is in the pattern.)
+ ///
+ /// When enabling this option, callers _must_ be prepared to handle a
+ /// [`MatchError`](crate::MatchError) error during search. When using a
+ /// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the
+ /// `try_` suite of methods.
+ ///
+ /// By default, there are no quit bytes set.
+ ///
+ /// # Panics
+ ///
+ /// This panics if heuristic Unicode word boundaries are enabled and any
+ /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
+ /// Unicode word boundaries requires setting every non-ASCII byte to a quit
+ /// byte. So if the caller attempts to undo any of that, then this will
+ /// panic.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to cause a search to terminate if it sees a
+ /// `\n` byte. This could be useful if, for example, you wanted to prevent
+ /// a user supplied pattern from matching across a line boundary.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch, MatchError,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ ///
+ /// let haystack = "foo\nbar".as_bytes();
+ /// // Normally this would produce a match, since \p{any} contains '\n'.
+ /// // But since we instructed the automaton to enter a quit state if a
+ /// // '\n' is observed, this produces a match error instead.
+ /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+ /// let got = dfa.find_leftmost_fwd(haystack).unwrap_err();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn quit(mut self, byte: u8, yes: bool) -> Config {
+ if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
+ panic!(
+ "cannot set non-ASCII byte to be non-quit when \
+ Unicode word boundaries are enabled"
+ );
+ }
+ if self.quit.is_none() {
+ self.quit = Some(ByteSet::empty());
+ }
+ if yes {
+ self.quit.as_mut().unwrap().add(byte);
+ } else {
+ self.quit.as_mut().unwrap().remove(byte);
+ }
+ self
+ }
+
+ /// Set a size limit on the total heap used by a DFA.
+ ///
+ /// This size limit is expressed in bytes and is applied during
+ /// determinization of an NFA into a DFA. If the heap usage of the DFA
+ /// itself (and only the DFA) exceeds this configured limit, then
+ /// determinization is stopped and an error is returned.
+ ///
+ /// This limit does not apply to auxiliary storage used during
+ /// determinization that isn't part of the generated DFA.
+ ///
+ /// This limit is only applied during determinization. Currently, there is
+ /// no way to postpone this check to after minimization if minimization
+ /// was enabled.
+ ///
+ /// The total limit on heap used during determinization is the sum of the
+ /// DFA and determinization size limits.
+ ///
+ /// The default is no limit.
+ ///
+ /// # Example
+ ///
+ /// This example shows a DFA that fails to build because of a configured
+ /// size limit. This particular example also serves as a cautionary tale
+ /// demonstrating just how big DFAs with large Unicode character classes
+ /// can get.
+ ///
+ /// ```
+ /// use regex_automata::dfa::{dense, Automaton};
+ ///
+ /// // 3MB isn't enough!
+ /// dense::Builder::new()
+ /// .configure(dense::Config::new().dfa_size_limit(Some(3_000_000)))
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 4MB probably is!
+ /// // (Note that DFA sizes aren't necessarily stable between releases.)
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().dfa_size_limit(Some(4_000_000)))
+ /// .build(r"\w{20}")?;
+ /// let haystack = "A".repeat(20).into_bytes();
+ /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// While one needs a little more than 3MB to represent `\w{20}`, it
+ /// turns out that you only need a little more than 4KB to represent
+ /// `(?-u:\w{20})`. So only use Unicode if you need it!
+ pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config {
+ self.dfa_size_limit = Some(bytes);
+ self
+ }
+
+ /// Set a size limit on the total heap used by determinization.
+ ///
+ /// This size limit is expressed in bytes and is applied during
+ /// determinization of an NFA into a DFA. If the heap used for auxiliary
+ /// storage during determinization (memory that is not in the DFA but
+ /// necessary for building the DFA) exceeds this configured limit, then
+ /// determinization is stopped and an error is returned.
+ ///
+ /// This limit does not apply to heap used by the DFA itself.
+ ///
+ /// The total limit on heap used during determinization is the sum of the
+ /// DFA and determinization size limits.
+ ///
+ /// The default is no limit.
+ ///
+ /// # Example
+ ///
+ /// This example shows a DFA that fails to build because of a
+ /// configured size limit on the amount of heap space used by
+ /// determinization. This particular example complements the example for
+ /// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode
+ /// potentially make DFAs themselves big, but it also results in more
+ /// auxiliary storage during determinization. (Although auxiliary storage
+ /// still tends to be smaller than the DFA itself.)
+ ///
+ /// ```
+ /// use regex_automata::dfa::{dense, Automaton};
+ ///
+ /// // 300KB isn't enough!
+ /// dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .determinize_size_limit(Some(300_000))
+ /// )
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 400KB probably is!
+ /// // (Note that auxiliary storage sizes aren't necessarily stable between
+ /// // releases.)
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .determinize_size_limit(Some(400_000))
+ /// )
+ /// .build(r"\w{20}")?;
+ /// let haystack = "A".repeat(20).into_bytes();
+ /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn determinize_size_limit(mut self, bytes: Option<usize>) -> Config {
+ self.determinize_size_limit = Some(bytes);
+ self
+ }
+
+ /// Returns whether this configuration has enabled anchored searches.
+ pub fn get_anchored(&self) -> bool {
+ self.anchored.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration has enabled simple state
+ /// acceleration.
+ pub fn get_accelerate(&self) -> bool {
+ self.accelerate.unwrap_or(true)
+ }
+
+ /// Returns whether this configuration has enabled the expensive process
+ /// of minimizing a DFA.
+ pub fn get_minimize(&self) -> bool {
+ self.minimize.unwrap_or(false)
+ }
+
+ /// Returns the match semantics set in this configuration.
+ pub fn get_match_kind(&self) -> MatchKind {
+ self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+ }
+
+ /// Returns whether this configuration has enabled anchored starting states
+ /// for every pattern in the DFA.
+ pub fn get_starts_for_each_pattern(&self) -> bool {
+ self.starts_for_each_pattern.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration has enabled byte classes or not.
+ /// This is typically a debugging-oriented option, as disabling it confers
+ /// no speed benefit.
+ pub fn get_byte_classes(&self) -> bool {
+ self.byte_classes.unwrap_or(true)
+ }
+
+ /// Returns whether this configuration has enabled heuristic Unicode word
+ /// boundary support. When enabled, it is possible for a search to return
+ /// an error.
+ pub fn get_unicode_word_boundary(&self) -> bool {
+ self.unicode_word_boundary.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration will instruct the DFA to enter a
+ /// quit state whenever the given byte is seen during a search. When at
+ /// least one byte has this enabled, it is possible for a search to return
+ /// an error.
+ pub fn get_quit(&self, byte: u8) -> bool {
+ self.quit.map_or(false, |q| q.contains(byte))
+ }
+
+ /// Returns the DFA size limit of this configuration if one was set.
+ /// The size limit is the total number of bytes on the heap that a DFA is
+ /// permitted to use. If the DFA exceeds this limit during construction,
+ /// then construction is stopped and an error is returned.
+ pub fn get_dfa_size_limit(&self) -> Option<usize> {
+ self.dfa_size_limit.unwrap_or(None)
+ }
+
+ /// Returns the determinization size limit of this configuration if one
+ /// was set. The size limit is the total number of bytes on the heap that
+ /// determinization is permitted to use. If determinization exceeds this
+ /// limit during construction, then construction is stopped and an error is
+ /// returned.
+ ///
+ /// This is different from the DFA size limit in that this only applies to
+ /// the auxiliary storage used during determinization. Once determinization
+ /// is complete, this memory is freed.
+ ///
+ /// The limit on the total heap memory used is the sum of the DFA and
+ /// determinization size limits.
+ pub fn get_determinize_size_limit(&self) -> Option<usize> {
+ self.determinize_size_limit.unwrap_or(None)
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(self, o: Config) -> Config {
+ Config {
+ anchored: o.anchored.or(self.anchored),
+ accelerate: o.accelerate.or(self.accelerate),
+ minimize: o.minimize.or(self.minimize),
+ match_kind: o.match_kind.or(self.match_kind),
+ starts_for_each_pattern: o
+ .starts_for_each_pattern
+ .or(self.starts_for_each_pattern),
+ byte_classes: o.byte_classes.or(self.byte_classes),
+ unicode_word_boundary: o
+ .unicode_word_boundary
+ .or(self.unicode_word_boundary),
+ quit: o.quit.or(self.quit),
+ dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
+ determinize_size_limit: o
+ .determinize_size_limit
+ .or(self.determinize_size_limit),
+ }
+ }
+}
+
+/// A builder for constructing a deterministic finite automaton from regular
+/// expressions.
+///
+/// This builder provides two main things:
+///
+/// 1. It provides a few different `build` routines for actually constructing
+/// a DFA from different kinds of inputs. The most convenient is
+/// [`Builder::build`], which builds a DFA directly from a pattern string. The
+/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
+/// from an NFA.
+/// 2. The builder permits configuring a number of things.
+/// [`Builder::configure`] is used with [`Config`] to configure aspects of
+/// the DFA and the construction process itself. [`Builder::syntax`] and
+/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
+/// construction, respectively. The syntax and thompson configurations only
+/// apply when building from a pattern string.
+///
+/// This builder always constructs a *single* DFA. As such, this builder
+/// can only be used to construct regexes that either detect the presence
+/// of a match or find the end location of a match. A single DFA cannot
+/// produce both the start and end of a match. For that information, use a
+/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured
+/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to
+/// use a DFA directly is if the end location of a match is enough for your use
+/// case. Namely, a `Regex` will construct two DFAs instead of one, since a
+/// second reverse DFA is needed to find the start of a match.
+///
+/// Note that if you want to build a sparse DFA, you must first build a dense
+/// DFA and convert that to a sparse DFA. There is no way to build a sparse
+/// DFA without first building a dense DFA.
+///
+/// # Example
+///
+/// This example shows how to build a minimized DFA that completely disables
+/// Unicode. That is:
+///
+/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
+/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
+/// (instead of any UTF-8 encoding of a Unicode scalar value except for
+/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
+/// * The pattern itself is permitted to match invalid UTF-8. For example,
+/// things like `[^a]` that match any byte except for `a` are permitted.
+/// * Unanchored patterns can search through invalid UTF-8. That is, for
+/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
+/// `(?s:.)*?`.
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, dense},
+/// nfa::thompson,
+/// HalfMatch, SyntaxConfig,
+/// };
+///
+/// let dfa = dense::Builder::new()
+/// .configure(dense::Config::new().minimize(true))
+/// .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo[^b]ar.*")?;
+///
+/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
+/// let expected = Some(HalfMatch::must(0, 10));
+/// let got = dfa.find_leftmost_fwd(haystack)?;
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ thompson: thompson::Builder,
+}
+
+#[cfg(feature = "alloc")]
+impl Builder {
+ /// Create a new dense DFA builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ config: Config::default(),
+ thompson: thompson::Builder::new(),
+ }
+ }
+
+ /// Build a DFA from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build(&self, pattern: &str) -> Result<OwnedDFA, Error> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a DFA from the given patterns.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
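+ ///
+ /// # Example
+ ///
+ /// This example is a sketch showing that the pattern ID of a match
+ /// corresponds to the index of its pattern in the slice given.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .build_many(&[r"[0-9]+", r"[a-z]+"])?;
+ /// // '[a-z]+' is the pattern at index 1, so its match reports pattern
+ /// // ID 1.
+ /// let expected = Some(HalfMatch::must(1, 3));
+ /// assert_eq!(expected, dfa.find_leftmost_fwd(b"foo12345")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```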
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<OwnedDFA, Error> {
+ let nfa = self.thompson.build_many(patterns).map_err(Error::nfa)?;
+ self.build_from_nfa(&nfa)
+ }
+
+ /// Build a DFA from the given NFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a DFA if you already have an NFA in
+ /// hand.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// nfa::thompson,
+ /// HalfMatch,
+ /// };
+ ///
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // This shows how to set non-default options for building an NFA.
+ /// let nfa = thompson::Builder::new()
+ /// .configure(thompson::Config::new().shrink(false))
+ /// .build(r"[0-9]+")?;
+ /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.find_leftmost_fwd(haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_nfa(
+ &self,
+ nfa: &thompson::NFA,
+ ) -> Result<OwnedDFA, Error> {
+ let mut quit = self.config.quit.unwrap_or(ByteSet::empty());
+ if self.config.get_unicode_word_boundary()
+ && nfa.has_word_boundary_unicode()
+ {
+ for b in 0x80..=0xFF {
+ quit.add(b);
+ }
+ }
+ let classes = if !self.config.get_byte_classes() {
+ // DFAs will always use the equivalence class map, but enabling
+ // this option is useful for debugging. Namely, this will cause all
+ // transitions to be defined over their actual bytes instead of an
+ // opaque equivalence class identifier. The former is much easier
+ // to grok as a human.
+ ByteClasses::singletons()
+ } else {
+ let mut set = nfa.byte_class_set().clone();
+ // It is important to distinguish any "quit" bytes from all other
+ // bytes. Otherwise, a non-quit byte may end up in the same class
+ // as a quit byte, and thus cause the DFA to stop when it shouldn't.
+ if !quit.is_empty() {
+ set.add_set(&quit);
+ }
+ set.byte_classes()
+ };
+
+ let mut dfa = DFA::initial(
+ classes,
+ nfa.pattern_len(),
+ self.config.get_starts_for_each_pattern(),
+ )?;
+ determinize::Config::new()
+ .anchored(self.config.get_anchored())
+ .match_kind(self.config.get_match_kind())
+ .quit(quit)
+ .dfa_size_limit(self.config.get_dfa_size_limit())
+ .determinize_size_limit(self.config.get_determinize_size_limit())
+ .run(nfa, &mut dfa)?;
+ if self.config.get_minimize() {
+ dfa.minimize();
+ }
+ if self.config.get_accelerate() {
+ dfa.accelerate();
+ }
+ Ok(dfa)
+ }
+
+ /// Apply the given dense DFA configuration options to this builder.
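+ ///
+ /// Calling this more than once merges the configurations: options set in
+ /// a later call overwrite options set in an earlier call, while options
+ /// left unset are unaffected.
+ ///
+ /// # Example
+ ///
+ /// This example is a sketch of that merging behavior.
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense};
+ ///
+ /// let mut builder = dense::Builder::new();
+ /// builder.configure(dense::Config::new().anchored(true).minimize(true));
+ /// // This overwrites 'anchored' but leaves 'minimize' enabled.
+ /// builder.configure(dense::Config::new().anchored(false));
+ /// let dfa = builder.build(r"[0-9]+")?;
+ /// // An unanchored search is permitted to skip the leading letters.
+ /// assert!(dfa.find_leftmost_fwd(b"abc123")?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```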
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`SyntaxConfig`](crate::SyntaxConfig).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// These settings only apply when constructing a DFA directly from a
+ /// pattern.
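+ ///
+ /// # Example
+ ///
+ /// This example is a sketch of one common use: enabling case insensitive
+ /// matching. (The `case_insensitive` knob mirrors the corresponding
+ /// option in `regex-syntax`.)
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, SyntaxConfig};
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .syntax(SyntaxConfig::new().case_insensitive(true))
+ /// .build(r"foo")?;
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, dfa.find_leftmost_fwd(b"FOO")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```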
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::SyntaxConfig,
+ ) -> &mut Builder {
+ self.thompson.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like whether the DFA should match the regex
+ /// in reverse or if additional time should be spent shrinking the size of
+ /// the NFA.
+ ///
+ /// These settings only apply when constructing a DFA directly from a
+ /// pattern.
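+ ///
+ /// # Example
+ ///
+ /// This example is a sketch showing how to build a DFA that matches the
+ /// regex in reverse. Searching backwards with such a DFA reports the
+ /// offset at which a match *starts*.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// nfa::thompson,
+ /// HalfMatch,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build(r"foo[0-9]+")?;
+ /// // The reported offset is where the match begins: position 0.
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.find_leftmost_rev(b"foo12345")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```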
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.thompson.configure(config);
+ self
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+/// A convenience alias for an owned DFA. We use this particular instantiation
+/// a lot in this crate, so it's worth giving it a name. This instantiation
+/// is commonly used for mutable APIs on the DFA while building it. The main
+/// reason for making DFAs generic is no_std support, and more generally,
+/// making it possible to load a DFA from an arbitrary slice of bytes.
+#[cfg(feature = "alloc")]
+pub(crate) type OwnedDFA = DFA<Vec<u32>>;
+
+/// A dense table-based deterministic finite automaton (DFA).
+///
+/// All dense DFAs have one or more start states, zero or more match states
+/// and a transition table that maps the current state and the current byte
+/// of input to the next state. A DFA can use this information to implement
+/// fast searching. In particular, the use of a dense DFA generally makes the
+/// trade off that match speed is the most valuable characteristic, even if
+/// building the DFA may take significant time *and* space. (More concretely,
+/// building a DFA takes time and space that is exponential in the size of the
+/// pattern in the worst case.) As such, the processing of every byte of input
+/// is done with a small constant number of operations that does not vary with
+/// the pattern, its size or the size of the alphabet. If your needs don't line
+/// up with this trade off, then a dense DFA may not be an adequate solution to
+/// your problem.
+///
+/// In contrast, a [`sparse::DFA`] makes the opposite
+/// trade off: it uses less space but will execute a variable number of
+/// instructions per byte at match time, which makes it slower for matching.
+/// (Note that space usage is still exponential in the size of the pattern in
+/// the worst case.)
+///
+/// A DFA can be built using the default configuration via the
+/// [`DFA::new`] constructor. Otherwise, one can
+/// configure various aspects via [`dense::Builder`](Builder).
+///
+/// A single DFA fundamentally supports the following operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of a match.
+/// 3. In the case of a DFA with multiple patterns, which pattern matched is
+/// reported as well.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of
+/// a match, *two* DFAs are required. This functionality is provided by a
+/// [`Regex`](crate::dfa::regex::Regex).
+///
+/// # Type parameters
+///
+/// A `DFA` has one type parameter, `T`, which is used to represent state IDs,
+/// pattern IDs and accelerators. `T` is typically a `Vec<u32>` or a `&[u32]`.
+///
+/// # The `Automaton` trait
+///
+/// This type implements the [`Automaton`] trait, which means it can be used
+/// for searching. For example:
+///
+/// ```
+/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let expected = HalfMatch::must(0, 8);
+/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct DFA<T> {
+ /// The transition table for this DFA. This includes the transitions
+ /// themselves, along with the stride, number of states and the equivalence
+ /// class mapping.
+ tt: TransitionTable<T>,
+ /// The set of starting state identifiers for this DFA. The starting state
+ /// IDs act as pointers into the transition table. The specific starting
+ /// state chosen for each search is dependent on the context in which the
+ /// search begins.
+ st: StartTable<T>,
+ /// The set of match states and the patterns that match for each
+ /// corresponding match state.
+ ///
+ /// This structure is technically only needed because of support for
+ /// multi-regexes. Namely, multi-regexes require answering not just whether
+ /// a match exists, but _which_ patterns match. So we need to store the
+ /// matching pattern IDs for each match state. We do this even when there
+ /// is only one pattern for the sake of simplicity. In practice, this uses
+ /// up very little space for the case of one pattern.
+ ms: MatchStates<T>,
+ /// Information about which states are "special." Special states are states
+ /// that are dead, quit, matching, starting or accelerated. For more info,
+ /// see the docs for `Special`.
+ special: Special,
+ /// The accelerators for this DFA.
+ ///
+ /// If a state is accelerated, then there exist only a small number of
+ /// bytes that can cause the DFA to leave the state. This permits searching
+ /// to use optimized routines to find those specific bytes instead of using
+ /// the transition table.
+ ///
+ /// All accelerated states exist in a contiguous range in the DFA's
+ /// transition table. See dfa/special.rs for more details on how states are
+ /// arranged.
+ accels: Accels<T>,
+}
+
+#[cfg(feature = "alloc")]
+impl OwnedDFA {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding DFA.
+ ///
+ /// If you want a non-default configuration, then use the
+ /// [`dense::Builder`](Builder) to set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ ///
+ /// let dfa = dense::DFA::new("foo[0-9]+bar")?;
+ /// let expected = HalfMatch::must(0, 11);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<OwnedDFA, Error> {
+ Builder::new().build(pattern)
+ }
+
+ /// Parse the given regular expressions using a default configuration and
+ /// return the corresponding multi-DFA.
+ ///
+ /// If you want a non-default configuration, then use the
+ /// [`dense::Builder`](Builder) to set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ ///
+ /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+ /// let expected = HalfMatch::must(1, 3);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<OwnedDFA, Error> {
+ Builder::new().build_many(patterns)
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl OwnedDFA {
+ /// Create a new DFA that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ ///
+ /// let dfa = dense::DFA::always_match()?;
+ ///
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<OwnedDFA, Error> {
+ let nfa = thompson::NFA::always_match();
+ Builder::new().build_from_nfa(&nfa)
+ }
+
+ /// Create a new DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense};
+ ///
+ /// let dfa = dense::DFA::never_match()?;
+ /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
+ /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<OwnedDFA, Error> {
+ let nfa = thompson::NFA::never_match();
+ Builder::new().build_from_nfa(&nfa)
+ }
+
+ /// Create an initial DFA with the given equivalence classes, pattern count
+ /// and whether anchored starting states are enabled for each pattern. An
+ /// initial DFA can be further mutated via determinization.
+ fn initial(
+ classes: ByteClasses,
+ pattern_count: usize,
+ starts_for_each_pattern: bool,
+ ) -> Result<OwnedDFA, Error> {
+ let start_pattern_count =
+ if starts_for_each_pattern { pattern_count } else { 0 };
+ Ok(DFA {
+ tt: TransitionTable::minimal(classes),
+ st: StartTable::dead(start_pattern_count)?,
+ ms: MatchStates::empty(pattern_count),
+ special: Special::new(),
+ accels: Accels::empty(),
+ })
+ }
+}
+
+impl<T: AsRef<[u32]>> DFA<T> {
+ /// Cheaply return a borrowed version of this dense DFA. Specifically,
+ /// the DFA returned always uses `&[u32]` for its transition table.
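+ ///
+ /// # Example
+ ///
+ /// This example is a sketch showing that a borrowed DFA searches exactly
+ /// like the DFA it borrows from.
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense};
+ ///
+ /// let owned = dense::DFA::new(r"foo[0-9]+")?;
+ /// let borrowed: dense::DFA<&[u32]> = owned.as_ref();
+ /// assert_eq!(
+ /// owned.find_leftmost_fwd(b"foo12345")?,
+ /// borrowed.find_leftmost_fwd(b"foo12345")?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```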
+ pub fn as_ref(&self) -> DFA<&'_ [u32]> {
+ DFA {
+ tt: self.tt.as_ref(),
+ st: self.st.as_ref(),
+ ms: self.ms.as_ref(),
+ special: self.special,
+ accels: self.accels(),
+ }
+ }
+
+ /// Return an owned version of this dense DFA. Specifically, the DFA
+ /// returned always uses `Vec<u32>` for its transition table.
+ ///
+ /// Effectively, this returns a dense DFA whose transition table lives on
+ /// the heap.
+ #[cfg(feature = "alloc")]
+ pub fn to_owned(&self) -> OwnedDFA {
+ DFA {
+ tt: self.tt.to_owned(),
+ st: self.st.to_owned(),
+ ms: self.ms.to_owned(),
+ special: self.special,
+ accels: self.accels().to_owned(),
+ }
+ }
+
+ /// Returns true only if this DFA has starting states for each pattern.
+ ///
+ /// When a DFA has starting states for each pattern, then a search with the
+ /// DFA can be configured to only look for anchored matches of a specific
+ /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
+ /// can accept a non-None `pattern_id` if and only if this method returns
+ /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
+ ///
+ /// Note that if the DFA has no patterns, this always returns false.
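+ ///
+ /// # Example
+ ///
+ /// This example is a sketch: the result simply reflects whether
+ /// [`Config::starts_for_each_pattern`] was enabled at build time.
+ ///
+ /// ```
+ /// use regex_automata::dfa::dense;
+ ///
+ /// let dfa = dense::DFA::new(r"[a-z]+")?;
+ /// assert!(!dfa.has_starts_for_each_pattern());
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().starts_for_each_pattern(true))
+ /// .build(r"[a-z]+")?;
+ /// assert!(dfa.has_starts_for_each_pattern());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```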
+ pub fn has_starts_for_each_pattern(&self) -> bool {
+ self.st.patterns > 0
+ }
+
+ /// Returns the total number of elements in the alphabet for this DFA.
+ ///
+ /// That is, this returns the total number of transitions that each state
+ /// in this DFA must have. Typically, a normal byte oriented DFA would
+ /// always have an alphabet size of 256, corresponding to the number of
+ /// unique values in a single byte. However, this implementation has two
+ /// peculiarities that impact the alphabet length:
+ ///
+ /// * Every state has a special "EOI" transition that is only followed
+ /// after the end of some haystack is reached. This EOI transition is
+ /// necessary to account for one byte of look-ahead when implementing
+ /// things like `\b` and `$`.
+ /// * Bytes are grouped into equivalence classes such that no two bytes in
+ /// the same class can distinguish a match from a non-match. For example,
+ /// in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the
+ /// same equivalence class. This leads to a massive space savings.
+ ///
+ /// Note though that the alphabet length does _not_ necessarily equal the
+ /// total stride space taken up by a single DFA state in the transition
+ /// table. Namely, for performance reasons, the stride is always the
+ /// smallest power of two that is greater than or equal to the alphabet
+ /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
+ /// often more useful. The alphabet length is typically useful only for
+ /// informational purposes.
+ pub fn alphabet_len(&self) -> usize {
+ self.tt.alphabet_len()
+ }
+
+ /// Returns the total stride for every state in this DFA, expressed as the
+ /// exponent of a power of 2. The stride is the amount of space each state
+ /// takes up in the transition table, expressed as a number of transitions.
+ /// (Unused transitions map to dead states.)
+ ///
+ /// The stride of a DFA is always equivalent to the smallest power of 2
+ /// that is greater than or equal to the DFA's alphabet length. This
+ /// definition uses extra space, but permits faster translation between
+ /// premultiplied state identifiers and contiguous indices (by using shifts
+ /// instead of relying on integer division).
+ ///
+ /// For example, if the DFA's stride is 16 transitions, then its `stride2`
+ /// is `4` since `2^4 = 16`.
+ ///
+ /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
+ /// while the maximum `stride2` value is `9` (corresponding to a stride of
+ /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
+ /// when accounting for the special EOI transition. However, an alphabet
+ /// length of that size is exceptionally rare since the alphabet is shrunk
+ /// into equivalence classes.
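+ ///
+ /// # Example
+ ///
+ /// This example is a sketch of the relationship between the alphabet
+ /// length, the stride and `stride2` described above.
+ ///
+ /// ```
+ /// use regex_automata::dfa::dense;
+ ///
+ /// let dfa = dense::DFA::new(r"[a-z]+")?;
+ /// // The stride is always 2^stride2...
+ /// assert_eq!(dfa.stride(), 1 << dfa.stride2());
+ /// // ... and is the smallest power of 2 that fits the alphabet.
+ /// assert!(dfa.stride() >= dfa.alphabet_len());
+ /// assert!(dfa.stride() / 2 < dfa.alphabet_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```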
+ pub fn stride2(&self) -> usize {
+ self.tt.stride2
+ }
+
+ /// Returns the total stride for every state in this DFA. This corresponds
+ /// to the total number of transitions used by each state in this DFA's
+ /// transition table.
+ ///
+ /// Please see [`DFA::stride2`] for more information. In particular, this
+ /// returns the stride as the number of transitions, whereas `stride2`
+ /// returns it as the exponent of a power of 2.
+ pub fn stride(&self) -> usize {
+ self.tt.stride()
+ }
+
+ /// Returns the "universal" start state for this DFA.
+ ///
+ /// A universal start state occurs only when all of the starting states
+ /// for this DFA are precisely the same. This occurs when there are no
+ /// look-around assertions at the beginning (or end for a reverse DFA) of
+ /// the pattern.
+ ///
+ /// Using this as a starting state for a DFA without a universal starting
+ /// state has unspecified behavior. This condition is not checked, so the
+ /// caller must guarantee it themselves.
+ pub(crate) fn universal_start_state(&self) -> StateID {
+ // We choose 'NonWordByte' for no particular reason, other than
+ // the fact that this is the 'main' starting configuration used in
+ // determinization. But in essence, it doesn't really matter.
+ //
+ // Also, we might consider exposing this routine, but it seems
+ // a little tricky to use correctly. Maybe if we also expose a
+ // 'has_universal_start_state' method?
+ self.st.start(Start::NonWordByte, None)
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent this DFA.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<dense::DFA>()`.
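+ ///
+ /// # Example
+ ///
+ /// This example is a sketch: exact numbers aren't stable across releases,
+ /// so we only show how to query the heap usage.
+ ///
+ /// ```
+ /// use regex_automata::dfa::dense;
+ ///
+ /// let dfa = dense::DFA::new(r"foo[0-9]+bar")?;
+ /// // Even a small DFA uses some heap for its transition table.
+ /// assert!(dfa.memory_usage() > 0);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```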
+ pub fn memory_usage(&self) -> usize {
+ self.tt.memory_usage()
+ + self.st.memory_usage()
+ + self.ms.memory_usage()
+ + self.accels.memory_usage()
+ }
+}
+
+/// Routines for converting a dense DFA to other representations, such as
+/// sparse DFAs or raw bytes suitable for persistent storage.
+impl<T: AsRef<[u32]>> DFA<T> {
+ /// Convert this dense DFA to a sparse DFA.
+ ///
+ /// If a `StateID` is too small to represent all states in the sparse
+ /// DFA, then this returns an error. In most cases, if a dense DFA is
+ /// constructible with `StateID`, then a sparse DFA will be as well.
+ /// However, it is not guaranteed.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ ///
+ /// let dense = dense::DFA::new("foo[0-9]+")?;
+ /// let sparse = dense.to_sparse()?;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), sparse.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, Error> {
+ sparse::DFA::from_dense(self)
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
+ /// format. Upon success, the `Vec<u8>` and the initial padding length are
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+ /// an address that does not have the same alignment as `u32`. The padding
+ /// corresponds to the number of leading bytes written to the returned
+ /// `Vec<u8>`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_little_endian would work on a little endian target.
+ /// let (buf, _) = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) {
+ self.to_bytes::<bytes::LE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
+ /// format. Upon success, the `Vec<u8>` and the initial padding length are
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+ /// an address that does not have the same alignment as `u32`. The padding
+ /// corresponds to the number of leading bytes written to the returned
+ /// `Vec<u8>`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_big_endian would work on a big endian target.
+ /// let (buf, _) = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) {
+ self.to_bytes::<bytes::BE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
+ /// format. Upon success, the `Vec<u8>` and the initial padding length are
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+ /// an address that does not have the same alignment as `u32`. The padding
+ /// corresponds to the number of leading bytes written to the returned
+ /// `Vec<u8>`.
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the target you're compiling the DFA for matches the
+ /// endianness of the target on which you're compiling the DFA. This is
+ /// the case, for example, when serialization and deserialization happen
+ /// in the same process or on the same machine. Otherwise, when
+ /// serializing a DFA for use in a portable environment, you'll almost
+ /// certainly want to serialize _both_ a little endian and a big endian
+ /// version and then load the correct one based on the target's
+ /// configuration.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let (buf, _) = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) {
+ self.to_bytes::<bytes::NE>()
+ }
+
+ /// The implementation of the public `to_bytes` serialization methods,
+ /// which is generic over endianness.
+ #[cfg(feature = "alloc")]
+ fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) {
+ let len = self.write_to_len();
+ let (mut buf, padding) = bytes::alloc_aligned_buffer::<u32>(len);
+ // This should always succeed since the only possible serialization
+ // error is providing a buffer that's too small, but we've ensured that
+ // `buf` is big enough here.
+ self.as_ref().write_to::<E>(&mut buf[padding..]).unwrap();
+ (buf, padding)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in little endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike the various `to_bytes_*` routines, this does not write
+ /// any padding. Callers are responsible for handling alignment correctly.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_little_endian would work on a little endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_little_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.as_ref().write_to::<bytes::LE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in big endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike the various `to_bytes_*` routines, this does not write
+ /// any padding. Callers are responsible for handling alignment correctly.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_big_endian would work on a big endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_big_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.as_ref().write_to::<bytes::BE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in native endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the target you're compiling the DFA for matches the
+ /// endianness of the target on which you're compiling the DFA. This is
+ /// the case, for example, when serialization and deserialization happen
+ /// in the same process or on the same machine. Otherwise, when
+ /// serializing a DFA for use in a portable environment, you'll almost
+ /// certainly want to serialize _both_ a little endian and a big endian
+ /// version and then load the correct one based on the target's
+ /// configuration.
+ ///
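+ /// For instance, here is a sketch of matching the serialized form to
+ /// the compilation target with conditional compilation (the pattern
+ /// shown is for illustration only):
+ ///
+ /// ```
+ /// use regex_automata::dfa::dense::DFA;
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ /// let mut buf = vec![0; dfa.write_to_len()];
+ /// #[cfg(target_endian = "little")]
+ /// let written = dfa.write_to_little_endian(&mut buf)?;
+ /// #[cfg(target_endian = "big")]
+ /// let written = dfa.write_to_big_endian(&mut buf)?;
+ /// assert_eq!(written, dfa.write_to_len());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///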
+ /// Note that unlike the various `to_bytes_*` routines, this does not write
+ /// any padding. Callers are responsible for handling alignment correctly.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_native_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.as_ref().write_to::<bytes::NE>(dst)
+ }
+
+ /// Return the total number of bytes required to serialize this DFA.
+ ///
+ /// This is useful for determining the size of the buffer required to pass
+ /// to one of the serialization routines:
+ ///
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// Passing a buffer smaller than the size returned by this method will
+ /// result in a serialization error. Serialization routines are guaranteed
+ /// to succeed when the buffer is big enough.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to dynamically allocate enough room to serialize
+ /// a DFA.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let mut buf = vec![0; original_dfa.write_to_len()];
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Note that this example isn't actually guaranteed to work! In
+ /// particular, if `buf` is not aligned to a 4-byte boundary, then the
+ /// `DFA::from_bytes` call will fail. If you need this to work, then you
+ /// either need to deal with adding some initial padding yourself, or use
+ /// one of the `to_bytes` methods, which will do it for you.
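+ ///
+ /// For example, here is a minimal sketch of handling the padding
+ /// yourself by over-allocating and searching for the first 4-byte
+ /// aligned offset into the buffer (`align_offset` is used purely for
+ /// illustration; it is permitted to fail in exotic cases):
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ /// // Over-allocate by 3 bytes so that an aligned sub-slice of the
+ /// // required length always exists.
+ /// let mut buf = vec![0u8; original_dfa.write_to_len() + 3];
+ /// let off = buf.as_ptr().align_offset(4);
+ /// let written = original_dfa.write_to_native_endian(&mut buf[off..])?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[off..off + written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```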
+ pub fn write_to_len(&self) -> usize {
+ bytes::write_label_len(LABEL)
+ + bytes::write_endianness_check_len()
+ + bytes::write_version_len()
+ + size_of::<u32>() // unused, intended for future flexibility
+ + self.tt.write_to_len()
+ + self.st.write_to_len()
+ + self.ms.write_to_len()
+ + self.special.write_to_len()
+ + self.accels.write_to_len()
+ }
+}
+
+impl<'a> DFA<&'a [u32]> {
+ /// Safely deserialize a DFA with a specific state identifier
+ /// representation. Upon success, this returns both the deserialized DFA
+ /// and the number of bytes read from the given slice. Namely, the contents
+ /// of the slice beyond the DFA are not read.
+ ///
+ /// Deserializing a DFA using this routine will never allocate heap memory.
+ /// For safety purposes, the DFA's transition table will be verified such
+ /// that every transition points to a valid state. If this verification is
+ /// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
+ /// will always execute in constant time.
+ ///
+ /// The bytes given must be generated by one of the serialization APIs
+ /// of a `DFA` using a semver compatible release of this crate. Those
+ /// include:
+ ///
+ /// * [`DFA::to_bytes_little_endian`]
+ /// * [`DFA::to_bytes_big_endian`]
+ /// * [`DFA::to_bytes_native_endian`]
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// The `to_bytes` methods allocate and return a `Vec<u8>` for you, along
+ /// with handling alignment correctly. The `write_to` methods do not
+ /// allocate and write to an existing slice (which may be on the stack).
+ /// Since deserialization always uses the native endianness of the target
+ /// platform, the serialization API you use should match the endianness of
+ /// the target platform. (It's often a good idea to generate serialized
+ /// DFAs for both forms of endianness and then load the correct one based
+ /// on the target's endianness at runtime.)
+ ///
+ /// # Errors
+ ///
+ /// Generally speaking, it's easier to state the conditions in which an
+ /// error is _not_ returned. All of the following must be true:
+ ///
+ /// * The bytes given must be produced by one of the serialization APIs
+ /// on this DFA, as mentioned above.
+ /// * The endianness of the target platform matches the endianness used to
+ /// serialize the provided DFA.
+ /// * The slice given must have the same alignment as `u32`.
+ ///
+ /// If any of the above are not true, then an error will be returned.
+ ///
+ /// # Panics
+ ///
+ /// This routine will never panic for any input.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize a DFA to raw bytes, deserialize it
+ /// and then use it for searching.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let (bytes, _) = initial.to_bytes_native_endian();
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: dealing with alignment and padding
+ ///
+ /// In the above example, we used the `to_bytes_native_endian` method to
+ /// serialize a DFA, but we ignored part of its return value corresponding
+ /// to padding added to the beginning of the serialized DFA. This is OK
+ /// because deserialization will skip this initial padding. What matters
+ /// is that the address immediately following the padding has an alignment
+ /// that matches `u32`. That is, the following is an equivalent but
+ /// alternative way to write the above example:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// // Serialization returns the number of leading padding bytes added to
+ /// // the returned Vec<u8>.
+ /// let (bytes, pad) = initial.to_bytes_native_endian();
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This padding is necessary because Rust's standard library does
+ /// not expose any safe and robust way of creating a `Vec<u8>` with a
+ /// guaranteed alignment other than 1. Now, in practice, the underlying
+ /// allocator is likely to provide a `Vec<u8>` that meets our alignment
+ /// requirements, which means `pad` is zero most of the time.
+ ///
+ /// The purpose of exposing the padding like this is flexibility for the
+ /// caller. For example, if one wants to embed a serialized DFA into a
+ /// compiled program, then it's important to guarantee that it starts at a
+ /// `u32`-aligned address. The simplest way to do this is to discard the
+ /// padding bytes and set it up so that the serialized DFA itself begins at
+ /// a properly aligned address. We can show this in two parts. The first
+ /// part is serializing the DFA to a file:
+ ///
+ /// ```no_run
+ /// use regex_automata::dfa::{Automaton, dense::DFA};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let (bytes, pad) = dfa.to_bytes_big_endian();
+ /// // Write the contents of the DFA *without* the initial padding.
+ /// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?;
+ ///
+ /// // Do it again, but this time for little endian.
+ /// let (bytes, pad) = dfa.to_bytes_little_endian();
+ /// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?;
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And now the second part is embedding the DFA into the compiled program
+ /// and deserializing it at runtime on first use. We use conditional
+ /// compilation to choose the correct endianness.
+ ///
+ /// ```no_run
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+ ///
+ /// type S = u32;
+ /// type DFA = dense::DFA<&'static [S]>;
+ ///
+ /// fn get_foo() -> &'static DFA {
+ /// use std::cell::Cell;
+ /// use std::mem::MaybeUninit;
+ /// use std::sync::Once;
+ ///
+ /// // This struct with a generic B is used to permit unsizing
+ /// // coercions, specifically, where B winds up being a [u8]. We also
+ /// // need repr(C) to guarantee that _align comes first, which forces
+ /// // a correct alignment.
+ /// #[repr(C)]
+ /// struct Aligned<B: ?Sized> {
+ /// _align: [S; 0],
+ /// bytes: B,
+ /// }
+ ///
+ /// # const _: &str = stringify! {
+ /// // This assignment is made possible (implicitly) via the
+ /// // CoerceUnsized trait.
+ /// static ALIGNED: &Aligned<[u8]> = &Aligned {
+ /// _align: [],
+ /// #[cfg(target_endian = "big")]
+ /// bytes: *include_bytes!("foo.bigendian.dfa"),
+ /// #[cfg(target_endian = "little")]
+ /// bytes: *include_bytes!("foo.littleendian.dfa"),
+ /// };
+ /// # };
+ /// # static ALIGNED: &Aligned<[u8]> = &Aligned {
+ /// # _align: [],
+ /// # bytes: [],
+ /// # };
+ ///
+ /// struct Lazy(Cell<MaybeUninit<DFA>>);
+ /// // SAFETY: This is safe because DFA impls Sync.
+ /// unsafe impl Sync for Lazy {}
+ ///
+ /// static INIT: Once = Once::new();
+ /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
+ ///
+ /// INIT.call_once(|| {
+ /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
+ /// .expect("serialized DFA should be valid");
+ /// // SAFETY: This is guaranteed to only execute once, and all
+ /// // we do with the pointer is write the DFA to it.
+ /// unsafe {
+ /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
+ /// }
+ /// });
+ /// // SAFETY: DFA is guaranteed to be initialized via INIT and is
+ /// // stored in static memory.
+ /// unsafe {
+ /// let dfa = (*DFA.0.as_ptr()).as_ptr();
+ /// std::mem::transmute::<*const DFA, &'static DFA>(dfa)
+ /// }
+ /// }
+ ///
+ /// let dfa = get_foo();
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
+ /// ```
+ ///
+ /// Alternatively, consider using
+ /// [`lazy_static`](https://crates.io/crates/lazy_static)
+ /// or
+ /// [`once_cell`](https://crates.io/crates/once_cell),
+ /// which will guarantee safety for you. You will still need to use the
+ /// `Aligned` trick above to force correct alignment, but this is safe to
+ /// do and `from_bytes` will return an error if you get it wrong.
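+ ///
+ /// A minimal sketch of the `once_cell` variant might look like the
+ /// following (marked `ignore` since `once_cell` is not a dependency of
+ /// this crate and the DFA files here are hypothetical):
+ ///
+ /// ```ignore
+ /// use once_cell::sync::Lazy;
+ /// use regex_automata::dfa::dense;
+ ///
+ /// #[repr(C)]
+ /// struct Aligned<B: ?Sized> {
+ /// _align: [u32; 0],
+ /// bytes: B,
+ /// }
+ ///
+ /// static ALIGNED: &Aligned<[u8]> = &Aligned {
+ /// _align: [],
+ /// #[cfg(target_endian = "big")]
+ /// bytes: *include_bytes!("foo.bigendian.dfa"),
+ /// #[cfg(target_endian = "little")]
+ /// bytes: *include_bytes!("foo.littleendian.dfa"),
+ /// };
+ ///
+ /// static DFA: Lazy<dense::DFA<&'static [u32]>> = Lazy::new(|| {
+ /// dense::DFA::from_bytes(&ALIGNED.bytes)
+ /// .expect("serialized DFA should be valid")
+ /// .0
+ /// });
+ /// ```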
+ pub fn from_bytes(
+ slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
+ // SAFETY: This is safe because we validate the transition table, the
+ // start state ID list, the match states and the accelerators below.
+ // If any validation fails, then we return an error.
+ let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
+ dfa.tt.validate()?;
+ dfa.st.validate(&dfa.tt)?;
+ dfa.ms.validate(&dfa)?;
+ dfa.accels.validate()?;
+ // N.B. dfa.special doesn't have a way to do unchecked deserialization,
+ // so it has already been validated.
+ Ok((dfa, nread))
+ }
+
+ /// Deserialize a DFA with a specific state identifier representation in
+ /// constant time by skipping validation of the transition table and
+ /// other data inside the DFA.
+ ///
+ /// This is just like [`DFA::from_bytes`], except it can potentially return
+ /// a DFA that exhibits undefined behavior if its transition table contains
+ /// invalid state identifiers.
+ ///
+ /// This routine is useful if you need to deserialize a DFA cheaply
+ /// and cannot afford the transition table validation performed by
+ /// `from_bytes`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let (bytes, _) = initial.to_bytes_native_endian();
+ /// // SAFETY: This is guaranteed to be safe since the bytes given come
+ /// // directly from a compatible serialization routine.
+ /// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub unsafe fn from_bytes_unchecked(
+ slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
+ let mut nr = 0;
+
+ nr += bytes::skip_initial_padding(slice);
+ bytes::check_alignment::<StateID>(&slice[nr..])?;
+ nr += bytes::read_label(&slice[nr..], LABEL)?;
+ nr += bytes::read_endianness_check(&slice[nr..])?;
+ nr += bytes::read_version(&slice[nr..], VERSION)?;
+
+ let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
+ nr += size_of::<u32>();
+
+ let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?;
+ nr += nread;
+
+ let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
+ nr += nread;
+
+ let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?;
+ nr += nread;
+
+ let (special, nread) = Special::from_bytes(&slice[nr..])?;
+ nr += nread;
+ special.validate_state_count(tt.count(), tt.stride2)?;
+
+ let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?;
+ nr += nread;
+
+ Ok((DFA { tt, st, ms, special, accels }, nr))
+ }
+
+ /// The implementation of the public `write_to` serialization methods,
+ /// which is generic over endianness.
+ ///
+ /// This is defined only for `&[u32]` to reduce binary size/compilation time.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("dense DFA"));
+ }
+ dst = &mut dst[..nwrite];
+
+ let mut nw = 0;
+ nw += bytes::write_label(LABEL, &mut dst[nw..])?;
+ nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
+ nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
+ nw += {
+ // Currently unused, intended for future flexibility
+ E::write_u32(0, &mut dst[nw..]);
+ size_of::<u32>()
+ };
+ nw += self.tt.write_to::<E>(&mut dst[nw..])?;
+ nw += self.st.write_to::<E>(&mut dst[nw..])?;
+ nw += self.ms.write_to::<E>(&mut dst[nw..])?;
+ nw += self.special.write_to::<E>(&mut dst[nw..])?;
+ nw += self.accels.write_to::<E>(&mut dst[nw..])?;
+ Ok(nw)
+ }
+}
+
+/// The following methods implement mutable routines on the internal
+/// representation of a DFA. As such, we must fix the type parameter to
+/// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We
+/// can get away with this because these methods are internal to the crate and
+/// are exclusively used during construction of the DFA.
+#[cfg(feature = "alloc")]
+impl OwnedDFA {
+ /// Set a start state of this DFA.
+ pub(crate) fn set_start_state(
+ &mut self,
+ index: Start,
+ pattern_id: Option<PatternID>,
+ id: StateID,
+ ) {
+ assert!(self.tt.is_valid(id), "invalid start state");
+ self.st.set_start(index, pattern_id, id);
+ }
+
+ /// Set the given transition in this DFA. Both the `from` and `to` states
+ /// must already exist.
+ pub(crate) fn set_transition(
+ &mut self,
+ from: StateID,
+ byte: alphabet::Unit,
+ to: StateID,
+ ) {
+ self.tt.set(from, byte, to);
+ }
+
+ /// Add an empty state (a state where all transitions lead to a dead state)
+ /// and return its identifier. The identifier returned is guaranteed to
+ /// not point to any other existing state.
+ ///
+ /// If adding a state would exceed `StateID::LIMIT`, then this returns an
+ /// error.
+ pub(crate) fn add_empty_state(&mut self) -> Result<StateID, Error> {
+ self.tt.add_empty_state()
+ }
+
+ /// Swap the two states given in the transition table.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// swap. Callers must ensure that other states pointing to id1 and id2 are
+ /// updated appropriately.
+ pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ self.tt.swap(id1, id2);
+ }
+
+ /// Truncate the states in this DFA to the given count.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// truncation. Callers must ensure that other states pointing to truncated
+ /// states are updated appropriately.
+ pub(crate) fn truncate_states(&mut self, count: usize) {
+ self.tt.truncate(count);
+ }
+
+ /// Return a mutable representation of the state corresponding to the given
+ /// id. This is useful for implementing routines that manipulate DFA states
+ /// (e.g., swapping states).
+ pub(crate) fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
+ self.tt.state_mut(id)
+ }
+
+ /// Minimize this DFA in place using Hopcroft's algorithm.
+ pub(crate) fn minimize(&mut self) {
+ Minimizer::new(self).run();
+ }
+
+ /// Updates the match state pattern ID map to use the one provided.
+ ///
+ /// This is useful when it's convenient to manipulate matching states
+ /// (and their corresponding pattern IDs) as a map. In particular, the
+ /// representation used by a DFA for this map is not amenable to mutation,
+ /// so if things need to be changed (like when shuffling states), it's
+ /// often easier to work with the map form.
+ pub(crate) fn set_pattern_map(
+ &mut self,
+ map: &BTreeMap<StateID, Vec<PatternID>>,
+ ) -> Result<(), Error> {
+ self.ms = self.ms.new_with_map(map)?;
+ Ok(())
+ }
+
+ /// Find states that have a small number of non-loop transitions and mark
+ /// them as candidates for acceleration during search.
+ pub(crate) fn accelerate(&mut self) {
+ // Dead and quit states can never be accelerated.
+ if self.state_count() <= 2 {
+ return;
+ }
+
+ // Go through every state and record their accelerator, if possible.
+ let mut accels = BTreeMap::new();
+ // Count the number of accelerated match, start and non-match/start
+ // states.
+ let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0);
+ for state in self.states() {
+ if let Some(accel) = state.accelerate(self.byte_classes()) {
+ accels.insert(state.id(), accel);
+ if self.is_match_state(state.id()) {
+ cmatch += 1;
+ } else if self.is_start_state(state.id()) {
+ cstart += 1;
+ } else {
+ assert!(!self.is_dead_state(state.id()));
+ assert!(!self.is_quit_state(state.id()));
+ cnormal += 1;
+ }
+ }
+ }
+ // If no states were able to be accelerated, then we're done.
+ if accels.is_empty() {
+ return;
+ }
+ let original_accels_len = accels.len();
+
+ // A remapper keeps track of state ID changes. Once we're done
+ // shuffling, the remapper is used to rewrite all transitions in the
+ // DFA based on the new positions of states.
+ let mut remapper = Remapper::from_dfa(self);
+
+ // As we swap states, if they are match states, we need to swap their
+ // pattern ID lists too (for multi-regexes). We do this by converting
+ // the lists to an easily swappable map, and then convert back to
+ // MatchStates once we're done.
+ let mut new_matches = self.ms.to_map(self);
+
+ // There is at least one state that gets accelerated, so these are
+ // guaranteed to get set to sensible values below.
+ self.special.min_accel = StateID::MAX;
+ self.special.max_accel = StateID::ZERO;
+ let update_special_accel =
+ |special: &mut Special, accel_id: StateID| {
+ special.min_accel = cmp::min(special.min_accel, accel_id);
+ special.max_accel = cmp::max(special.max_accel, accel_id);
+ };
+
+ // Start by shuffling match states. Any match states that are
+ // accelerated get moved to the end of the match state range.
+ if cmatch > 0 && self.special.matches() {
+ // N.B. special.{min,max}_match do not need updating, since the
+ // range/number of match states does not change. Only the ordering
+ // of match states may change.
+ let mut next_id = self.special.max_match;
+ let mut cur_id = next_id;
+ while cur_id >= self.special.min_match {
+ if let Some(accel) = accels.remove(&cur_id) {
+ accels.insert(next_id, accel);
+ update_special_accel(&mut self.special, next_id);
+
+ // No need to do any actual swapping for equivalent IDs.
+ if cur_id != next_id {
+ remapper.swap(self, cur_id, next_id);
+
+ // Swap pattern IDs for match states.
+ let cur_pids = new_matches.remove(&cur_id).unwrap();
+ let next_pids = new_matches.remove(&next_id).unwrap();
+ new_matches.insert(cur_id, next_pids);
+ new_matches.insert(next_id, cur_pids);
+ }
+ next_id = self.tt.prev_state_id(next_id);
+ }
+ cur_id = self.tt.prev_state_id(cur_id);
+ }
+ }
+
+ // This is where it gets tricky. Without acceleration, start states
+ // normally come right after match states. But we want accelerated
+ // states to be a single contiguous range (to make it very fast
+ // to determine whether a state *is* accelerated), while also keeping
+ // match and starting states as contiguous ranges for the same reason.
+ // So what we do here is shuffle states such that it looks like this:
+ //
+ // DQMMMMAAAAASSSSSSNNNNNNN
+ // | |
+ // |---------|
+ // accelerated states
+ //
+ // Where:
+ // D - dead state
+ // Q - quit state
+ // M - match state (may be accelerated)
+ // A - normal state that is accelerated
+ // S - start state (may be accelerated)
+ // N - normal state that is NOT accelerated
+ //
+ // We implement this by shuffling states, which is done by a sequence
+ // of pairwise swaps. We start by looking at all normal states to be
+ // accelerated. When we find one, we swap it with the earliest starting
+ // state, and then swap that with the earliest normal state. This
+ // preserves the contiguous property.
+ //
+ // Once we're done looking for accelerated normal states, now we look
+ // for accelerated starting states by moving them to the beginning
+ // of the starting state range (just like we moved accelerated match
+ // states to the end of the matching state range).
+ //
+ // For a more detailed/different perspective on this, see the docs
+ // in dfa/special.rs.
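+ //
+ // As a concrete illustration (one letter per state, as above): if
+ // the current layout is D Q M M S1 S2 N1 N2 and only N2 is
+ // accelerated, then N2 is first swapped into S1's slot, the start
+ // range shifts right by one, and the layout becomes
+ // D Q M M N2 S2 S1 N1. The relative order of the start states may
+ // change, which is fine: the remapper rewrites the start table
+ // afterward.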
+ if cnormal > 0 {
+ // Our next available starting and normal states for swapping.
+ let mut next_start_id = self.special.min_start;
+ let mut cur_id = self.from_index(self.state_count() - 1);
+ // This is guaranteed to exist since cnormal > 0.
+ let mut next_norm_id =
+ self.tt.next_state_id(self.special.max_start);
+ while cur_id >= next_norm_id {
+ if let Some(accel) = accels.remove(&cur_id) {
+ remapper.swap(self, next_start_id, cur_id);
+ remapper.swap(self, next_norm_id, cur_id);
+ // Keep our accelerator map updated with new IDs if the
+ // states we swapped were also accelerated.
+ if let Some(accel2) = accels.remove(&next_norm_id) {
+ accels.insert(cur_id, accel2);
+ }
+ if let Some(accel2) = accels.remove(&next_start_id) {
+ accels.insert(next_norm_id, accel2);
+ }
+ accels.insert(next_start_id, accel);
+ update_special_accel(&mut self.special, next_start_id);
+ // Our start range shifts one to the right now.
+ self.special.min_start =
+ self.tt.next_state_id(self.special.min_start);
+ self.special.max_start =
+ self.tt.next_state_id(self.special.max_start);
+ next_start_id = self.tt.next_state_id(next_start_id);
+ next_norm_id = self.tt.next_state_id(next_norm_id);
+ }
+ // This is pretty tricky, but if our 'next_norm_id' state also
+ // happened to be accelerated, then the result is that it is
+ // now in the position of cur_id, so we need to consider it
+ // again. This loop is still guaranteed to terminate though,
+ // because when accels contains cur_id, we're guaranteed to
+ // increment next_norm_id even if cur_id remains unchanged.
+ if !accels.contains_key(&cur_id) {
+ cur_id = self.tt.prev_state_id(cur_id);
+ }
+ }
+ }
+ // Just like we did for match states, but we want to move accelerated
+ // start states to the beginning of the range instead of the end.
+ if cstart > 0 {
+ // N.B. special.{min,max}_start do not need updating, since the
+ // range/number of start states does not change at this point. Only
+ // the ordering of start states may change.
+ let mut next_id = self.special.min_start;
+ let mut cur_id = next_id;
+ while cur_id <= self.special.max_start {
+ if let Some(accel) = accels.remove(&cur_id) {
+ remapper.swap(self, cur_id, next_id);
+ accels.insert(next_id, accel);
+ update_special_accel(&mut self.special, next_id);
+ next_id = self.tt.next_state_id(next_id);
+ }
+ cur_id = self.tt.next_state_id(cur_id);
+ }
+ }
+
+ // Remap all transitions in our DFA and assert some things.
+ remapper.remap(self);
+ // This unwrap is OK because acceleration never changes the number of
+ // match states or patterns in those match states. Since acceleration
+ // runs after the pattern map has been set at least once, we know that
+ // our match states cannot error.
+ self.set_pattern_map(&new_matches).unwrap();
+ self.special.set_max();
+ self.special.validate().expect("special state ranges should validate");
+ self.special
+ .validate_state_count(self.state_count(), self.stride2())
+ .expect(
+ "special state ranges should be consistent with state count",
+ );
+ assert_eq!(
+ self.special.accel_len(self.stride()),
+ // We record the number of accelerated states initially detected
+ // since the accels map is itself mutated in the process above.
+ // If mutated incorrectly, its size may change, and thus can't be
+ // trusted as a source of truth of how many accelerated states we
+ // expected there to be.
+ original_accels_len,
+ "mismatch with expected number of accelerated states",
+ );
+
+ // And finally record our accelerators. We kept our accels map updated
+ // as we shuffled states above, so the accelerators should now
+ // correspond to a contiguous range in the state ID space. (Which we
+ // assert.)
+ let mut prev: Option<StateID> = None;
+ for (id, accel) in accels {
+ assert!(prev.map_or(true, |p| self.tt.next_state_id(p) == id));
+ prev = Some(id);
+ self.accels.add(accel);
+ }
+ }
+
+ /// Shuffle the states in this DFA so that starting states, match
+ /// states and accelerated states are all contiguous.
+ ///
+ /// See dfa/special.rs for more details.
+ pub(crate) fn shuffle(
+ &mut self,
+ mut matches: BTreeMap<StateID, Vec<PatternID>>,
+ ) -> Result<(), Error> {
+ // The determinizer always adds a quit state and it is always second.
+ self.special.quit_id = self.from_index(1);
+ // If all we have are the dead and quit states, then we're done and
+ // the DFA will never produce a match.
+ if self.state_count() <= 2 {
+ self.special.set_max();
+ return Ok(());
+ }
+
+ // Collect all our start states into a convenient set and confirm there
+ // is no overlap with match states. In the classical DFA construction,
+ // start states can be match states. But because of look-around, we
+ // delay all matches by a byte, which prevents start states from being
+ // match states.
+ let mut is_start: BTreeSet<StateID> = BTreeSet::new();
+ for (start_id, _, _) in self.starts() {
+ // While there's nothing theoretically wrong with setting a start
+ // state to a dead ID (indeed, it could be an optimization!), the
+ // shuffling code below assumes that start states aren't dead. If
+ // this assumption is violated, the dead state could be shuffled
+ // to a new location, which must never happen. So if we do want
+ // to allow start states to be dead, then this assert should be
+ // removed and the code below fixed.
+ //
+ // N.B. Minimization can cause start states to be dead, but that
+ // happens after states are shuffled, so it's OK. Also, start
+ // states are dead for the DFA that never matches anything, but
+ // in that case, there are no states to shuffle.
+ assert_ne!(start_id, DEAD, "start state cannot be dead");
+ assert!(
+ !matches.contains_key(&start_id),
+ "{:?} is both a start and a match state, which is not allowed",
+ start_id,
+ );
+ is_start.insert(start_id);
+ }
+
+ // We implement shuffling by a sequence of pairwise swaps of states.
+ // Since we have a number of things referencing states via their
+ // IDs and swapping them changes their IDs, we need to record every
+ // swap we make so that we can remap IDs. The remapper handles this
+ // book-keeping for us.
+ let mut remapper = Remapper::from_dfa(self);
+
+ // Shuffle matching states.
+ if matches.is_empty() {
+ self.special.min_match = DEAD;
+ self.special.max_match = DEAD;
+ } else {
+ // The determinizer guarantees that the first two states are the
+ // dead and quit states, respectively. We want our match states to
+ // come right after quit.
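+ //
+ // E.g., with three match states, the layout after this loop is
+ // D Q M M M ..., with min_match == from_index(2) and max_match ==
+ // from_index(4).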
+ let mut next_id = self.from_index(2);
+ let mut new_matches = BTreeMap::new();
+ self.special.min_match = next_id;
+ for (id, pids) in matches {
+ remapper.swap(self, next_id, id);
+ new_matches.insert(next_id, pids);
+ // If we swapped a start state, then update our set.
+ if is_start.contains(&next_id) {
+ is_start.remove(&next_id);
+ is_start.insert(id);
+ }
+ next_id = self.tt.next_state_id(next_id);
+ }
+ matches = new_matches;
+ self.special.max_match = cmp::max(
+ self.special.min_match,
+ self.tt.prev_state_id(next_id),
+ );
+ }
+
+ // Shuffle starting states.
+ {
+ let mut next_id = self.from_index(2);
+ if self.special.matches() {
+ next_id = self.tt.next_state_id(self.special.max_match);
+ }
+ self.special.min_start = next_id;
+ for id in is_start {
+ remapper.swap(self, next_id, id);
+ next_id = self.tt.next_state_id(next_id);
+ }
+ self.special.max_start = cmp::max(
+ self.special.min_start,
+ self.tt.prev_state_id(next_id),
+ );
+ }
+
+ // Finally remap all transitions in our DFA.
+ remapper.remap(self);
+ self.set_pattern_map(&matches)?;
+ self.special.set_max();
+ self.special.validate().expect("special state ranges should validate");
+ self.special
+ .validate_state_count(self.state_count(), self.stride2())
+ .expect(
+ "special state ranges should be consistent with state count",
+ );
+ Ok(())
+ }
+}
+
+/// A variety of generic internal methods for accessing DFA internals.
+impl<T: AsRef<[u32]>> DFA<T> {
+ /// Return the byte classes used by this DFA.
+ pub(crate) fn byte_classes(&self) -> &ByteClasses {
+ &self.tt.classes
+ }
+
+ /// Return the info about special states.
+ pub(crate) fn special(&self) -> &Special {
+ &self.special
+ }
+
+ /// Return the info about special states as a mutable borrow.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn special_mut(&mut self) -> &mut Special {
+ &mut self.special
+ }
+
+ /// Returns an iterator over all states in this DFA.
+ ///
+ /// This iterator yields a tuple for each state. The first element of the
+ /// tuple corresponds to a state's identifier, and the second element
+ /// corresponds to the state itself (comprised of its transitions).
+ pub(crate) fn states(&self) -> StateIter<'_, T> {
+ self.tt.states()
+ }
+
+ /// Return the total number of states in this DFA. Every DFA built by
+ /// this crate has at least two states: the dead state and the quit
+ /// state.
+ pub(crate) fn state_count(&self) -> usize {
+ self.tt.count()
+ }
+
+ /// Return a slice of all pattern IDs for the given match state.
+ ///
+ /// If the given state is not a match state, then this panics.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] {
+ assert!(self.is_match_state(id));
+ self.ms.pattern_id_slice(self.match_state_index(id))
+ }
+
+ /// Return the total number of pattern IDs for the given match state.
+ ///
+ /// If the given state is not a match state, then this panics.
+ pub(crate) fn match_pattern_len(&self, id: StateID) -> usize {
+ assert!(self.is_match_state(id));
+ self.ms.pattern_len(self.match_state_index(id))
+ }
+
+ /// Returns the total number of patterns matched by this DFA.
+ pub(crate) fn pattern_count(&self) -> usize {
+ self.ms.patterns
+ }
+
+ /// Returns a map from match state ID to a list of pattern IDs that match
+ /// in that state.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> {
+ self.ms.to_map(self)
+ }
+
+ /// Returns the ID of the quit state for this DFA.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn quit_id(&self) -> StateID {
+ self.from_index(1)
+ }
+
+ /// Convert the given state identifier to the state's index. The state's
+ /// index corresponds to the position in which it appears in the
+ /// transition table. Since state identifiers are premultiplied, a
+ /// state's identifier is equal to `index * stride`. This routine
+ /// reverses that premultiplication.
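+ ///
+ /// For example, with a stride of 64 (`stride2 == 6`), the state at
+ /// index 3 has the identifier 192, and this routine maps 192 back to 3
+ /// with a right shift by `stride2`.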
+ pub(crate) fn to_index(&self, id: StateID) -> usize {
+ self.tt.to_index(id)
+ }
+
+ /// Convert the index of a state (in the range 0..self.state_count()) to an
+ /// actual state identifier.
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ #[cfg(feature = "alloc")]
+ pub(crate) fn from_index(&self, index: usize) -> StateID {
+ self.tt.from_index(index)
+ }
+
+ /// Return an iterator over the state IDs in this DFA's start state table.
+ pub(crate) fn starts(&self) -> StartStateIter<'_> {
+ self.st.iter()
+ }
+
+ /// Returns the index of the match state for the given ID. If the
+ /// given ID does not correspond to a match state, then this may
+ /// panic or produce an incorrect result.
+ fn match_state_index(&self, id: StateID) -> usize {
+ debug_assert!(self.is_match_state(id));
+ // This is one of the places where we rely on the fact that match
+ // states are contiguous in the transition table. Namely, that the
+ // first match state ID always corresponds to dfa.special.min_match.
+ // From there, since we know the stride, we can compute the overall
+ // index of any match state given the match state's ID.
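+ //
+ // For example, with stride 64 (stride2 == 6) and min_match == 128,
+ // the match state with ID 192 maps to to_index(192 - 128) ==
+ // to_index(64) == 1, i.e., the second match state.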
+ let min = self.special().min_match.as_usize();
+ // CORRECTNESS: We're allowed to produce an incorrect result or panic,
+ // so both the subtraction and the unchecked StateID construction are
+ // OK.
+ self.to_index(StateID::new_unchecked(id.as_usize() - min))
+ }
+
+ /// Returns the index of the accelerator state for the given ID. If the
+ /// given ID does not correspond to an accelerator state, then this may
+ /// panic or produce an incorrect result.
+ fn accelerator_index(&self, id: StateID) -> usize {
+ let min = self.special().min_accel.as_usize();
+ // CORRECTNESS: We're allowed to produce an incorrect result or panic,
+ // so both the subtraction and the unchecked StateID construction are
+ // OK.
+ self.to_index(StateID::new_unchecked(id.as_usize() - min))
+ }
+
+ /// Return the accelerators for this DFA.
+ fn accels(&self) -> Accels<&[u32]> {
+ self.accels.as_ref()
+ }
+
+ /// Return this DFA's transition table as a slice.
+ fn trans(&self) -> &[StateID] {
+ self.tt.table()
+ }
+}
+
+impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "dense::DFA(")?;
+ for state in self.states() {
+ fmt_state_indicator(f, self, state.id())?;
+ let id = if f.alternate() {
+ state.id().as_usize()
+ } else {
+ self.to_index(state.id())
+ };
+ write!(f, "{:06?}: ", id)?;
+ state.fmt(f)?;
+ write!(f, "\n")?;
+ }
+ writeln!(f, "")?;
+ for (i, (start_id, sty, pid)) in self.starts().enumerate() {
+ let id = if f.alternate() {
+ start_id.as_usize()
+ } else {
+ self.to_index(start_id)
+ };
+ if i % self.st.stride == 0 {
+ match pid {
+ None => writeln!(f, "START-GROUP(ALL)")?,
+ Some(pid) => {
+ writeln!(f, "START_GROUP(pattern: {:?})", pid)?
+ }
+ }
+ }
+ writeln!(f, " {:?} => {:06?}", sty, id)?;
+ }
+ if self.pattern_count() > 1 {
+ writeln!(f, "")?;
+ for i in 0..self.ms.count() {
+ let id = self.ms.match_state_id(self, i);
+ let id = if f.alternate() {
+ id.as_usize()
+ } else {
+ self.to_index(id)
+ };
+ write!(f, "MATCH({:06?}): ", id)?;
+ for (i, &pid) in self.ms.pattern_id_slice(i).iter().enumerate()
+ {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?}", pid)?;
+ }
+ writeln!(f, "")?;
+ }
+ }
+ writeln!(f, "state count: {:?}", self.state_count())?;
+ writeln!(f, "pattern count: {:?}", self.pattern_count())?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
+ #[inline]
+ fn is_special_state(&self, id: StateID) -> bool {
+ self.special.is_special_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: StateID) -> bool {
+ self.special.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_quit_state(&self, id: StateID) -> bool {
+ self.special.is_quit_state(id)
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: StateID) -> bool {
+ self.special.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_start_state(&self, id: StateID) -> bool {
+ self.special.is_start_state(id)
+ }
+
+ #[inline]
+ fn is_accel_state(&self, id: StateID) -> bool {
+ self.special.is_accel_state(id)
+ }
+
+ #[inline]
+ fn next_state(&self, current: StateID, input: u8) -> StateID {
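+ // State IDs are premultiplied, so adding the byte's equivalence
+ // class to `current` indexes directly into this state's row of the
+ // transition table.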
+ let input = self.byte_classes().get(input);
+ let o = current.as_usize() + usize::from(input);
+ self.trans()[o]
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(
+ &self,
+ current: StateID,
+ input: u8,
+ ) -> StateID {
+ let input = self.byte_classes().get_unchecked(input);
+ let o = current.as_usize() + usize::from(input);
+ *self.trans().get_unchecked(o)
+ }
+
+ #[inline]
+ fn next_eoi_state(&self, current: StateID) -> StateID {
+ let eoi = self.byte_classes().eoi().as_usize();
+ let o = current.as_usize() + eoi;
+ self.trans()[o]
+ }
+
+ #[inline]
+ fn pattern_count(&self) -> usize {
+ self.ms.patterns
+ }
+
+ #[inline]
+ fn match_count(&self, id: StateID) -> usize {
+ self.match_pattern_len(id)
+ }
+
+ #[inline]
+ fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
+ // This is an optimization for the very common case of a DFA with a
+ // single pattern. This conditional avoids a somewhat more costly path
+ // that finds the pattern ID from the state machine, which requires
+ // a bit of slicing/pointer-chasing. This optimization tends to only
+ // matter when matches are frequent.
+ if self.ms.patterns == 1 {
+ return PatternID::ZERO;
+ }
+ let state_index = self.match_state_index(id);
+ self.ms.pattern_id(state_index, match_index)
+ }
+
+ #[inline]
+ fn start_state_forward(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> StateID {
+ let index = Start::from_position_fwd(bytes, start, end);
+ self.st.start(index, pattern_id)
+ }
+
+ #[inline]
+ fn start_state_reverse(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> StateID {
+ let index = Start::from_position_rev(bytes, start, end);
+ self.st.start(index, pattern_id)
+ }
+
+ #[inline(always)]
+ fn accelerator(&self, id: StateID) -> &[u8] {
+ if !self.is_accel_state(id) {
+ return &[];
+ }
+ self.accels.needles(self.accelerator_index(id))
+ }
+}
+
+/// The transition table portion of a dense DFA.
+///
+/// The transition table is the core part of the DFA in that it describes how
+/// to move from one state to another based on the input sequence observed.
+#[derive(Clone)]
+pub(crate) struct TransitionTable<T> {
+ /// A contiguous region of memory representing the transition table in
+ /// row-major order. The representation is dense. That is, every state
+ /// has precisely the same number of transitions. The maximum number of
+ /// transitions per state is 257 (256 for each possible byte value, plus 1
+ /// for the special EOI transition). If a DFA has been instructed to use
+ /// byte classes (the default), then the number of transitions is usually
+ /// substantially fewer.
+ ///
+ /// In practice, T is either `Vec<u32>` or `&[u32]`.
+ table: T,
+ /// A set of equivalence classes, where a single equivalence class
+ /// represents a set of bytes that never discriminate between a match
+ /// and a non-match in the DFA. Each equivalence class corresponds to a
+ /// single character in this DFA's alphabet, where the maximum number of
+ /// characters is 257 (each possible value of a byte plus the special
+ /// EOI transition). Consequently, the number of equivalence classes
+ /// corresponds to the number of transitions for each DFA state. Note
+ /// though that the *space* used by each DFA state in the transition table
+ /// may be larger. The total space used by each DFA state is known as the
+ /// stride.
+ ///
+ /// The only time the number of equivalence classes is fewer than 257 is if
+ /// the DFA's kind uses byte classes (which is the default). Equivalence
+ /// classes should generally only be disabled when debugging, so that
+ /// the transitions themselves aren't obscured. Disabling them has no
+ /// other benefit, since the equivalence class map is always used while
+ /// searching. In the vast majority of cases, the number of equivalence
+ /// classes is substantially smaller than 257, particularly when large
+ /// Unicode classes aren't used.
+ classes: ByteClasses,
+ /// The stride of each DFA state, expressed as a power-of-two exponent.
+ ///
+ /// The stride of a DFA corresponds to the total amount of space used by
+ /// each DFA state in the transition table. This may be bigger than the
+ /// size of a DFA's alphabet, since the stride is always the smallest
+ /// power of two greater than or equal to the alphabet size.
+ ///
+ /// While this wastes space, this avoids the need for integer division
+ /// to convert between premultiplied state IDs and their corresponding
+ /// indices. Instead, we can use simple bit-shifts.
+ ///
+ /// See the docs for the `stride2` method for more details.
+ ///
+ /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
+ /// while the maximum `stride2` value is `9` (corresponding to a stride of
+ /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
+ /// when accounting for the special EOI transition. However, an alphabet
+ /// length of that size is exceptionally rare since the alphabet is shrunk
+ /// into equivalence classes.
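+ ///
+ /// For example, an alphabet of 33 equivalence classes gets a stride of
+ /// 64 (`stride2 == 6`): the transitions of the state at index `i` are
+ /// stored at `table[i << 6 .. (i << 6) + 33]`, and the remaining 31
+ /// slots of that state's row are unused.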
+ stride2: usize,
+}
+
+impl<'a> TransitionTable<&'a [u32]> {
+ /// Deserialize a transition table starting at the beginning of `slice`.
+ /// Upon success, return the total number of bytes read along with the
+ /// transition table.
+ ///
+ /// If there was a problem deserializing any part of the transition table,
+ /// then this returns an error. Notably, if the given slice does not have
+ /// the same alignment as `StateID`, then this will return an error (among
+ /// other possible errors).
+ ///
+ /// This is guaranteed to execute in constant time.
+ ///
+ /// # Safety
+ ///
+ /// This routine is not safe because it does not check the validity of the
+ /// transition table itself. In particular, the transition table can be
+ /// quite large, so checking its validity can be somewhat expensive. An
+ /// invalid transition table is not safe because other code may rely on the
+ /// transition table being correct (such as explicit bounds check elision).
+ /// Therefore, an invalid transition table can lead to undefined behavior.
+ ///
+ /// Callers that use this function must either pass on the safety invariant
+ /// or guarantee that the bytes given contain a valid transition table.
+ /// This guarantee is upheld by the bytes written by `write_to`.
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ let (count, nr) = bytes::try_read_u32_as_usize(slice, "state count")?;
+ slice = &slice[nr..];
+
+ let (stride2, nr) = bytes::try_read_u32_as_usize(slice, "stride2")?;
+ slice = &slice[nr..];
+
+ let (classes, nr) = ByteClasses::from_bytes(slice)?;
+ slice = &slice[nr..];
+
+ // The alphabet length (determined by the byte class map) cannot be
+ // bigger than the stride (total space used by each DFA state).
+ if stride2 > 9 {
+ return Err(DeserializeError::generic(
+ "dense DFA has invalid stride2 (too big)",
+ ));
+ }
+ // It also cannot be zero, since every DFA, even one that never
+ // matches anything, has at least two equivalence classes: one for
+ // all 256 byte values and another for the EOI sentinel. So the
+ // stride is always at least 2.
+ if stride2 < 1 {
+ return Err(DeserializeError::generic(
+ "dense DFA has invalid stride2 (too small)",
+ ));
+ }
+ // This is OK since 1 <= stride2 <= 9.
+ let stride =
+ 1usize.checked_shl(u32::try_from(stride2).unwrap()).unwrap();
+ if classes.alphabet_len() > stride {
+ return Err(DeserializeError::generic(
+ "alphabet size cannot be bigger than transition table stride",
+ ));
+ }
+
+ let trans_count =
+ bytes::shl(count, stride2, "dense table transition count")?;
+ let table_bytes_len = bytes::mul(
+ trans_count,
+ StateID::SIZE,
+ "dense table state byte count",
+ )?;
+ bytes::check_slice_len(slice, table_bytes_len, "transition table")?;
+ bytes::check_alignment::<StateID>(slice)?;
+ let table_bytes = &slice[..table_bytes_len];
+ slice = &slice[table_bytes_len..];
+ // SAFETY: Since StateID is always representable as a u32, all we need
+ // to do is ensure that we have the proper length and alignment. We've
+ // checked both above, so the cast below is safe.
+ //
+ // N.B. This is the only not-safe code in this function, so we mark
+ // it explicitly to call it out, even though it is technically
+ // superfluous.
+ #[allow(unused_unsafe)]
+ let table = unsafe {
+ core::slice::from_raw_parts(
+ table_bytes.as_ptr() as *const u32,
+ trans_count,
+ )
+ };
+ let tt = TransitionTable { table, classes, stride2 };
+ Ok((tt, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl TransitionTable<Vec<u32>> {
+ /// Create a minimal transition table with just two states: a dead state
+ /// and a quit state. The alphabet length and stride of the transition
+ /// table is determined by the given set of equivalence classes.
+ fn minimal(classes: ByteClasses) -> TransitionTable<Vec<u32>> {
+ let mut tt = TransitionTable {
+ table: vec![],
+ classes,
+ stride2: classes.stride2(),
+ };
+ // Two states, regardless of alphabet size, can always fit into u32.
+ tt.add_empty_state().unwrap(); // dead state
+ tt.add_empty_state().unwrap(); // quit state
+ tt
+ }
+
+ /// Set a transition in this table. Both the `from` and `to` states must
+ /// already exist, otherwise this panics. `unit` identifies which
+ /// transition out of `from` is set to point to `to`.
+ fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) {
+ assert!(self.is_valid(from), "invalid 'from' state");
+ assert!(self.is_valid(to), "invalid 'to' state");
+ self.table[from.as_usize() + self.classes.get_by_unit(unit)] =
+ to.as_u32();
+ }
+
+ /// Add an empty state (a state where all transitions lead to a dead state)
+ /// and return its identifier. The identifier returned is guaranteed to
+ /// not point to any other existing state.
+ ///
+ /// If adding a state would exhaust the state identifier space, then this
+ /// returns an error.
+ fn add_empty_state(&mut self) -> Result<StateID, Error> {
+ // Normally, to get a fresh state identifier, we would just
+ // take the index of the next state added to the transition
+ // table. However, we actually perform an optimization here
+ // that premultiplies state IDs by the stride, such that they
+ // point immediately at the beginning of their transitions in
+ // the transition table. This avoids an extra multiplication
+ // instruction for state lookup at search time.
+ //
+ // Premultiplied identifiers means that instead of your matching
+ // loop looking something like this:
+ //
+ // state = dfa.start
+ // for byte in haystack:
+ // next = dfa.transitions[state * stride + byte]
+ // if dfa.is_match(next):
+ // return true
+ // return false
+ //
+ // it can instead look like this:
+ //
+ // state = dfa.start
+ // for byte in haystack:
+ // next = dfa.transitions[state + byte]
+ // if dfa.is_match(next):
+ // return true
+ // return false
+ //
+ // In other words, we save a multiplication instruction in the
+ // critical path. This turns out to be a decent performance win.
+ // The cost of using premultiplied state ids is that they can
+ // require a bigger state id representation. (And they also make
+ // the code a bit more complex, especially during minimization and
+ // when reshuffling states, as one needs to convert back and forth
+ // between state IDs and state indices.)
+ //
+ // To do this, we simply take the index of the state into the
+ // entire transition table, rather than the index of the state
+ // itself. E.g., if the stride is 64, then the ID of the 3rd state
+ // is 128, not 2.
+ let next = self.table.len();
+ let id = StateID::new(next).map_err(|_| Error::too_many_states())?;
+ self.table.extend(iter::repeat(0).take(self.stride()));
+ Ok(id)
+ }
+
+ /// Swap the two states given in this transition table.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// swap. Callers must ensure that other states pointing to id1 and id2 are
+ /// updated appropriately.
+ ///
+ /// Both id1 and id2 must point to valid states, otherwise this panics.
+ fn swap(&mut self, id1: StateID, id2: StateID) {
+ assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1);
+ assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2);
+ // We only need to swap the parts of the state that are used. So if the
+ // stride is 64, but the alphabet length is only 33, then we save a lot
+ // of work.
+ for b in 0..self.classes.alphabet_len() {
+ self.table.swap(id1.as_usize() + b, id2.as_usize() + b);
+ }
+ }
+
+ /// Truncate the states in this transition table to the given count.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// truncation. Callers must ensure that other states pointing to truncated
+ /// states are updated appropriately.
+ fn truncate(&mut self, count: usize) {
+ self.table.truncate(count << self.stride2);
+ }
+
+ /// Return a mutable representation of the state corresponding to the given
+ /// id. This is useful for implementing routines that manipulate DFA states
+ /// (e.g., swapping states).
+ fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
+ let alphabet_len = self.alphabet_len();
+ let i = id.as_usize();
+ StateMut {
+ id,
+ stride2: self.stride2,
+ transitions: &mut self.table_mut()[i..i + alphabet_len],
+ }
+ }
+}
+
+impl<T: AsRef<[u32]>> TransitionTable<T> {
+ /// Writes a serialized form of this transition table to the buffer given.
+ /// If the buffer is too small, then an error is returned. To determine
+ /// how big the buffer must be, use `write_to_len`.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("transition table"));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write state count
+ // Unwrap is OK since number of states is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.count()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write state stride (as power of 2)
+ // Unwrap is OK since stride2 is guaranteed to be <= 9.
+ E::write_u32(u32::try_from(self.stride2).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write byte class map
+ let n = self.classes.write_to(dst)?;
+ dst = &mut dst[n..];
+
+ // write actual transitions
+ for &sid in self.table() {
+ let n = bytes::write_state_id::<E>(sid, &mut dst);
+ dst = &mut dst[n..];
+ }
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of this transition
+ /// table will use.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // state count
+ + size_of::<u32>() // stride2
+ + self.classes.write_to_len()
+ + (self.table().len() * StateID::SIZE)
+ }
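+
+ // For reference, a sketch of the serialized layout produced above,
+ // in written order (state ID widths assume StateID::SIZE == 4):
+ //
+ // [u32: state count]
+ // [u32: stride2]
+ // [byte class map]
+ // [count * stride state IDs: transitions]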
+
+ /// Validates that every state ID in this transition table is valid.
+ ///
+ /// That is, every state ID can be used to correctly index a state in this
+ /// table.
+ fn validate(&self) -> Result<(), DeserializeError> {
+ for state in self.states() {
+ for (_, to) in state.transitions() {
+ if !self.is_valid(to) {
+ return Err(DeserializeError::generic(
+ "found invalid state ID in transition table",
+ ));
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Converts this transition table to a borrowed value.
+ fn as_ref(&self) -> TransitionTable<&'_ [u32]> {
+ TransitionTable {
+ table: self.table.as_ref(),
+ classes: self.classes.clone(),
+ stride2: self.stride2,
+ }
+ }
+
+ /// Converts this transition table to an owned value.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> TransitionTable<Vec<u32>> {
+ TransitionTable {
+ table: self.table.as_ref().to_vec(),
+ classes: self.classes.clone(),
+ stride2: self.stride2,
+ }
+ }
+
+ /// Return the state for the given ID. If the given ID is not valid, then
+ /// this panics.
+ fn state(&self, id: StateID) -> State<'_> {
+ assert!(self.is_valid(id));
+
+ let i = id.as_usize();
+ State {
+ id,
+ stride2: self.stride2,
+ transitions: &self.table()[i..i + self.alphabet_len()],
+ }
+ }
+
+ /// Returns an iterator over all states in this transition table.
+ ///
+ /// This iterator yields a tuple for each state. The first element of the
+ /// tuple corresponds to a state's identifier, and the second element
+ /// corresponds to the state itself (comprised of its transitions).
+ fn states(&self) -> StateIter<'_, T> {
+ StateIter {
+ tt: self,
+ it: self.table().chunks(self.stride()).enumerate(),
+ }
+ }
+
+ /// Convert a state identifier to an index to a state (in the range
+ /// 0..self.count()).
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ ///
+ /// If the given ID is not valid, then this may panic or produce an
+ /// incorrect index.
+ fn to_index(&self, id: StateID) -> usize {
+ id.as_usize() >> self.stride2
+ }
+
+ /// Convert an index to a state (in the range 0..self.count()) to an actual
+ /// state identifier.
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ ///
+ /// If the given index is not in the specified range, then this may panic
+ /// or produce an incorrect state ID.
+ fn from_index(&self, index: usize) -> StateID {
+ // CORRECTNESS: If the given index is not valid, then it is not
+ // required for this to panic or return a valid state ID.
+ StateID::new_unchecked(index << self.stride2)
+ }
+
+ /// Returns the state ID for the state immediately following the one given.
+ ///
+ /// This does not check whether the state ID returned is invalid. In fact,
+ /// if the state ID given is the last state in this DFA, then the state ID
+ /// returned is guaranteed to be invalid.
+ #[cfg(feature = "alloc")]
+ fn next_state_id(&self, id: StateID) -> StateID {
+ self.from_index(self.to_index(id).checked_add(1).unwrap())
+ }
+
+ /// Returns the state ID for the state immediately preceding the one given.
+ ///
+ /// If the dead ID is given (which is zero), then this panics.
+ #[cfg(feature = "alloc")]
+ fn prev_state_id(&self, id: StateID) -> StateID {
+ self.from_index(self.to_index(id).checked_sub(1).unwrap())
+ }
+
+ /// Returns the table as a slice of state IDs.
+ fn table(&self) -> &[StateID] {
+ let integers = self.table.as_ref();
+ // SAFETY: This is safe because StateID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts(
+ integers.as_ptr() as *const StateID,
+ integers.len(),
+ )
+ }
+ }
+
+ /// Returns the total number of states in this transition table.
+ ///
+ /// Note that a DFA always has at least two states: the dead and quit
+ /// states. In particular, the dead state always has ID 0 and is
+ /// correspondingly always the first state. The dead state is never a match
+ /// state.
+ fn count(&self) -> usize {
+ self.table().len() >> self.stride2
+ }
+
+ /// Returns the total stride for every state in this DFA. This corresponds
+ /// to the total number of transitions used by each state in this DFA's
+ /// transition table.
+ fn stride(&self) -> usize {
+ 1 << self.stride2
+ }
+
+ /// Returns the total number of elements in the alphabet for this
+ /// transition table. This is always less than or equal to `self.stride()`.
+ /// It is only equal when the alphabet length is a power of 2. Otherwise,
+ /// it is always strictly less.
+ fn alphabet_len(&self) -> usize {
+ self.classes.alphabet_len()
+ }
+
+ /// Returns true if and only if the given state ID is valid for this
+ /// transition table. Validity in this context means that the given ID can
+ /// be used as a valid offset with `self.stride()` to index this transition
+ /// table.
+ fn is_valid(&self, id: StateID) -> bool {
+ let id = id.as_usize();
+ id < self.table().len() && id % self.stride() == 0
+ }
+
+ /// Return the memory usage, in bytes, of this transition table.
+ ///
+ /// This does not include the size of a `TransitionTable` value itself.
+ fn memory_usage(&self) -> usize {
+ self.table().len() * StateID::SIZE
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u32]>> TransitionTable<T> {
+ /// Returns the table as a slice of state IDs.
+ fn table_mut(&mut self) -> &mut [StateID] {
+ let integers = self.table.as_mut();
+ // SAFETY: This is safe because StateID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts_mut(
+ integers.as_mut_ptr() as *mut StateID,
+ integers.len(),
+ )
+ }
+ }
+}
+
+/// The set of all possible starting states in a DFA.
+///
+/// The set of starting states corresponds to the possible choices one can make
+/// in terms of starting a DFA. That is, before following the first transition,
+/// you first need to select the state that you start in.
+///
+/// Normally, a DFA converted from an NFA that has a single starting state
+/// would itself just have one starting state. However, our support for look
+/// around generally requires more starting states. The correct starting state
+/// is chosen based on certain properties of the position at which we begin
+/// our search.
+///
+/// Before listing those properties, we first must define two terms:
+///
+/// * `haystack` - The bytes to search. The search always starts at the
+/// beginning of `haystack` and ends before or at the end of `haystack`.
+/// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack`
+/// must be contained within `context` such that `context` is at least as big
+/// as `haystack`.
+///
+/// This split is crucial for dealing with look-around. For example, consider
+/// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This
+/// regex should _not_ match the haystack, since `bar` does not appear at the
+/// beginning of `context`. Similarly, the regex `\Bbar\B` should match the
+/// haystack, because within `context`, `bar` is not surrounded by word
+/// boundaries. But a search that does not take context into account would
+/// not permit `\B` to match, since the beginning of a string followed by a
+/// word character is treated as a word boundary. Similarly, a search that
+/// does not take context into account when searching for `^bar$` in the
+/// haystack `bar` would produce a match when it shouldn't.
+///
+/// Thus, it follows that the starting state is chosen based on the following
+/// criteria, derived from the position at which the search starts in the
+/// `context` (corresponding to the start of `haystack`):
+///
+/// 1. If the search starts at the beginning of `context`, then the `Text`
+/// start state is used. (Since `^` corresponds to
+/// `hir::Anchor::StartText`.)
+/// 2. If the search starts at a position immediately following a line
+/// terminator, then the `Line` start state is used. (Since `(?m:^)`
+/// corresponds to `hir::Anchor::StartLine`.)
+/// 3. If the search starts at a position immediately following a byte
+/// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte`
+/// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.)
+/// 4. Otherwise, if the search starts at a position immediately following
+/// a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`),
+/// then the `NonWordByte` start state is used. (Since `(?-u:\B)`
+/// corresponds to a not-word-boundary.)
+///
+/// (N.B. Unicode word boundaries are not supported by the DFA because they
+/// require multi-byte look-around and this is difficult to support in a DFA.)
+///
+/// To further complicate things, we also support constructing individual
+/// anchored start states for each pattern in the DFA. (Which is required to
+/// implement overlapping regexes correctly, but is also generally useful.)
+/// Thus, when individual start states for each pattern are enabled, then the
+/// total number of start states represented is `4 + (4 * #patterns)`, where
+/// the 4 comes from each of the 4 possibilities above. The first 4 represents
+/// the starting states for the entire DFA, which support searching for
+/// multiple patterns simultaneously (possibly unanchored).
+///
+/// If individual start states are disabled, then this will only store 4
+/// start states. Typically, individual start states are only enabled when
+/// constructing the reverse DFA for regex matching. But they are also useful
+/// for building DFAs that can search for a specific pattern or even to support
+/// both anchored and unanchored searches with the same DFA.
+///
+/// Note though that while the start table always has either `4` or
+/// `4 + (4 * #patterns)` starting state *ids*, the total number of states
+/// might be considerably smaller. That is, many of the IDs may be duplicative.
+/// (For example, if a regex doesn't have a `\b` sub-pattern, then there's no
+/// reason to generate a unique starting state for handling word boundaries.
+/// Similarly for start/end anchors.)
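+///
+/// As a hypothetical illustration of the layout: with 2 patterns and
+/// individual start states enabled, the table stores `4 + (4 * 2) = 12`
+/// state IDs. The first group of 4 holds the start states for the entire
+/// DFA, and each subsequent group of 4 holds the anchored start states
+/// for patterns 0 and 1, respectively.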
+#[derive(Clone)]
+pub(crate) struct StartTable<T> {
+ /// The initial start state IDs.
+ ///
+ /// In practice, T is either `Vec<u32>` or `&[u32]`.
+ ///
+ /// The first `stride` (currently always 4) entries always correspond to
+ /// the start states for the entire DFA. After that, there are
+ /// `stride * patterns` state IDs, where `patterns` may be zero in the
+ /// case of a DFA with no patterns or in the case where the DFA was built
+ /// without enabling starting states for each pattern.
+ table: T,
+ /// The number of starting state IDs per pattern.
+ stride: usize,
+ /// The total number of patterns for which starting states are encoded.
+ /// This may be zero for non-empty DFAs when the DFA was built without
+ /// start states for each pattern. Thus, one cannot use this field to
+ /// say how many patterns are in the DFA in all cases. It is specific to
+ /// how many patterns are represented in this start table.
+ patterns: usize,
+}
+
+#[cfg(feature = "alloc")]
+impl StartTable<Vec<u32>> {
+ /// Create a valid set of start states all pointing to the dead state.
+ ///
+ /// When the corresponding DFA is constructed with start states for each
+ /// pattern, then `patterns` should be the number of patterns. Otherwise,
+ /// it should be zero.
+ ///
+ /// If the total table size could exceed the allocatable limit, then this
+ /// returns an error. In practice, this is unlikely to occur, since
+ /// allocation would almost certainly have failed long before reaching
+ /// this point.
+ fn dead(patterns: usize) -> Result<StartTable<Vec<u32>>, Error> {
+ assert!(patterns <= PatternID::LIMIT);
+ let stride = Start::count();
+ let pattern_starts_len = match stride.checked_mul(patterns) {
+ Some(x) => x,
+ None => return Err(Error::too_many_start_states()),
+ };
+ let table_len = match stride.checked_add(pattern_starts_len) {
+ Some(x) => x,
+ None => return Err(Error::too_many_start_states()),
+ };
+ if table_len > core::isize::MAX as usize {
+ return Err(Error::too_many_start_states());
+ }
+ let table = vec![DEAD.as_u32(); table_len];
+ Ok(StartTable { table, stride, patterns })
+ }
+}
+
+impl<'a> StartTable<&'a [u32]> {
+ /// Deserialize a table of start state IDs starting at the beginning of
+ /// `slice`. Upon success, return the total number of bytes read along with
+ /// the table of starting state IDs.
+ ///
+ /// If there was a problem deserializing any part of the starting IDs,
+ /// then this returns an error. Notably, if the given slice does not have
+ /// the same alignment as `StateID`, then this will return an error (among
+ /// other possible errors).
+ ///
+ /// This is guaranteed to execute in constant time.
+ ///
+ /// # Safety
+ ///
+ /// This routine is not safe because it does not check the validity of the
+ /// starting state IDs themselves. In particular, the number of starting
+ /// IDs can be of variable length, so it's possible that checking their
+ /// validity cannot be done in constant time. An invalid starting state
+ /// ID is not safe because other code may rely on the starting IDs being
+ /// correct (such as explicit bounds check elision). Therefore, an invalid
+ /// start ID can lead to undefined behavior.
+ ///
+ /// Callers that use this function must either pass on the safety invariant
+ /// or guarantee that the bytes given contain valid starting state IDs.
+ /// This guarantee is upheld by the bytes written by `write_to`.
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ let (stride, nr) =
+ bytes::try_read_u32_as_usize(slice, "start table stride")?;
+ slice = &slice[nr..];
+
+ let (patterns, nr) =
+ bytes::try_read_u32_as_usize(slice, "start table patterns")?;
+ slice = &slice[nr..];
+
+ if stride != Start::count() {
+ return Err(DeserializeError::generic(
+ "invalid starting table stride",
+ ));
+ }
+ if patterns > PatternID::LIMIT {
+ return Err(DeserializeError::generic(
+ "invalid number of patterns",
+ ));
+ }
+ let pattern_table_size =
+ bytes::mul(stride, patterns, "invalid pattern count")?;
+ // Our start states always start with a single stride of start states
+ // for the entire automaton which permit it to match any pattern. What
+ // follows it are an optional set of start states for each pattern.
+ let start_state_count = bytes::add(
+ stride,
+ pattern_table_size,
+ "invalid 'any' pattern starts size",
+ )?;
+ let table_bytes_len = bytes::mul(
+ start_state_count,
+ StateID::SIZE,
+ "pattern table bytes length",
+ )?;
+ bytes::check_slice_len(slice, table_bytes_len, "start ID table")?;
+ bytes::check_alignment::<StateID>(slice)?;
+ let table_bytes = &slice[..table_bytes_len];
+ slice = &slice[table_bytes_len..];
+ // SAFETY: Since StateID is always representable as a u32, all we need
+ // to do is ensure that we have the proper length and alignment. We've
+ // checked both above, so the cast below is safe.
+ //
+ // N.B. This is the only not-safe code in this function, so we mark
+ // it explicitly to call it out, even though it is technically
+ // superfluous.
+ #[allow(unused_unsafe)]
+ let table = unsafe {
+ core::slice::from_raw_parts(
+ table_bytes.as_ptr() as *const u32,
+ start_state_count,
+ )
+ };
+ let st = StartTable { table, stride, patterns };
+ Ok((st, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+impl<T: AsRef<[u32]>> StartTable<T> {
+ /// Writes a serialized form of this start table to the buffer given. If
+ /// the buffer is too small, then an error is returned. To determine how
+ /// big the buffer must be, use `write_to_len`.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small(
+ "starting table ids",
+ ));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write stride
+ // Unwrap is OK since the stride is always 4 (currently).
+ E::write_u32(u32::try_from(self.stride).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+ // write pattern count
+ // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+ // write start IDs
+ for &sid in self.table() {
+ let n = bytes::write_state_id::<E>(sid, &mut dst);
+ dst = &mut dst[n..];
+ }
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of this start ID table
+ /// will use.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // stride
+ + size_of::<u32>() // # patterns
+ + (self.table().len() * StateID::SIZE)
+ }
+
+ /// Validates that every state ID in this start table is valid by checking
+ /// it against the given transition table (which must be for the same DFA).
+ ///
+ /// That is, every state ID can be used to correctly index a state.
+ fn validate(
+ &self,
+ tt: &TransitionTable<T>,
+ ) -> Result<(), DeserializeError> {
+ for &id in self.table() {
+ if !tt.is_valid(id) {
+ return Err(DeserializeError::generic(
+ "found invalid starting state ID",
+ ));
+ }
+ }
+ Ok(())
+ }
+
+ /// Converts this start table to a borrowed value.
+ fn as_ref(&self) -> StartTable<&'_ [u32]> {
+ StartTable {
+ table: self.table.as_ref(),
+ stride: self.stride,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Converts this start table to an owned value.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> StartTable<Vec<u32>> {
+ StartTable {
+ table: self.table.as_ref().to_vec(),
+ stride: self.stride,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Return the start state for the given start index and pattern ID. If the
+ /// pattern ID is None, then the corresponding start state for the entire
+ /// DFA is returned. If the pattern ID is not None, then the corresponding
+ /// starting state for the given pattern is returned. If this start table
+ /// does not have individual starting states for each pattern, then this
+ /// panics.
+ fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
+ let start_index = index.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => {
+ let pid = pid.as_usize();
+ assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+ self.stride + (self.stride * pid) + start_index
+ }
+ };
+ self.table()[index]
+ }
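+
+ // A hypothetical worked example of the indexing above: with a stride
+ // of 4, the start state with index 2 for pattern 1 lives at table
+ // position 4 + (4 * 1) + 2 = 10.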
+
+ /// Returns an iterator over all start state IDs in this table.
+ ///
+ /// Each item is a triple of: start state ID, the start state type and the
+ /// pattern ID (if any).
+ fn iter(&self) -> StartStateIter<'_> {
+ StartStateIter { st: self.as_ref(), i: 0 }
+ }
+
+ /// Returns the table as a slice of state IDs.
+ fn table(&self) -> &[StateID] {
+ let integers = self.table.as_ref();
+ // SAFETY: This is safe because StateID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts(
+ integers.as_ptr() as *const StateID,
+ integers.len(),
+ )
+ }
+ }
+
+ /// Return the memory usage, in bytes, of this start table.
+ ///
+ /// This does not include the size of a `StartTable` value itself.
+ fn memory_usage(&self) -> usize {
+ self.table().len() * StateID::SIZE
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u32]>> StartTable<T> {
+ /// Set the start state for the given index and pattern.
+ ///
+ /// If the pattern ID or state ID are not valid, then this will panic.
+ fn set_start(
+ &mut self,
+ index: Start,
+ pattern_id: Option<PatternID>,
+ id: StateID,
+ ) {
+ let start_index = index.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => self
+ .stride
+ .checked_mul(pid.as_usize())
+ .unwrap()
+ .checked_add(self.stride)
+ .unwrap()
+ .checked_add(start_index)
+ .unwrap(),
+ };
+ self.table_mut()[index] = id;
+ }
+
+ /// Returns the table as a mutable slice of state IDs.
+ fn table_mut(&mut self) -> &mut [StateID] {
+ let integers = self.table.as_mut();
+ // SAFETY: This is safe because StateID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts_mut(
+ integers.as_mut_ptr() as *mut StateID,
+ integers.len(),
+ )
+ }
+ }
+}
+
+/// An iterator over start state IDs.
+///
+/// This iterator yields a triple of start state ID, the start state type
+/// and the pattern ID (if any). The pattern ID is None for start states
+/// corresponding to the entire DFA and non-None for start states corresponding
+/// to a specific pattern. The latter only occurs when the DFA is compiled with
+/// start states for each pattern.
+pub(crate) struct StartStateIter<'a> {
+ st: StartTable<&'a [u32]>,
+ i: usize,
+}
+
+impl<'a> Iterator for StartStateIter<'a> {
+ type Item = (StateID, Start, Option<PatternID>);
+
+ fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
+ let i = self.i;
+ let table = self.st.table();
+ if i >= table.len() {
+ return None;
+ }
+ self.i += 1;
+
+ // This unwrap is okay since the stride of the starting state table
+ // must always match the number of start state types.
+ let start_type = Start::from_usize(i % self.st.stride).unwrap();
+ let pid = if i < self.st.stride {
+ None
+ } else {
+ Some(
+ PatternID::new((i - self.st.stride) / self.st.stride).unwrap(),
+ )
+ };
+ Some((table[i], start_type, pid))
+ }
+}
+
+/// This type represents the patterns that should be reported whenever a DFA
+/// enters a match state. This structure exists to support DFAs that search
+/// for matches for multiple regexes.
+///
+/// This structure relies on the fact that all match states in a DFA occur
+/// contiguously in the DFA's transition table. (See dfa/special.rs for a more
+/// detailed breakdown of the representation.) Namely, when a match occurs, we
+/// know its state ID. Since we know the start and end of the contiguous region
+/// of match states, we can use that to compute the position at which the match
+/// state occurs. That in turn is used as an offset into this structure.
+#[derive(Clone, Debug)]
+struct MatchStates<T> {
+ /// `slices` is a flattened sequence of pairs, where each pair points to a
+ /// sub-slice of pattern_ids. The first element of the pair is an offset
+ /// into pattern_ids and the second element of the pair is the number
+ /// of 32-bit pattern IDs starting at that position. That is, each pair
+ /// corresponds to a single DFA match state and its corresponding match
+ /// IDs. The number of pairs always corresponds to the number of distinct
+ /// DFA match states.
+ ///
+ /// In practice, T is either Vec<u32> or &[u32].
+ slices: T,
+ /// A flattened sequence of pattern IDs for each DFA match state. The only
+ /// way to correctly read this sequence is indirectly via `slices`.
+ ///
+ /// In practice, T is either Vec<u32> or &[u32].
+ pattern_ids: T,
+ /// The total number of unique patterns represented by these match states.
+ patterns: usize,
+}
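+
+// A hypothetical instance of the representation above: two match states,
+// where the first reports patterns 0 and 2 and the second reports only
+// pattern 1, would be encoded as:
+//
+// slices: [0, 2, 2, 1] // (offset, length) pairs into pattern_ids
+// pattern_ids: [0, 2, 1]
+// patterns: 3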
+
+impl<'a> MatchStates<&'a [u32]> {
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ // Read the total number of match states.
+ let (count, nr) =
+ bytes::try_read_u32_as_usize(slice, "match state count")?;
+ slice = &slice[nr..];
+
+ // Read the slice start/length pairs.
+ let pair_count = bytes::mul(2, count, "match state offset pairs")?;
+ let slices_bytes_len = bytes::mul(
+ pair_count,
+ PatternID::SIZE,
+ "match state slice offset byte length",
+ )?;
+ bytes::check_slice_len(slice, slices_bytes_len, "match state slices")?;
+ bytes::check_alignment::<PatternID>(slice)?;
+ let slices_bytes = &slice[..slices_bytes_len];
+ slice = &slice[slices_bytes_len..];
+ // SAFETY: Since PatternID is always representable as a u32, all we
+ // need to do is ensure that we have the proper length and alignment.
+ // We've checked both above, so the cast below is safe.
+ //
+ // N.B. This is one of the few not-safe snippets in this function, so
+ // we mark it explicitly to call it out, even though it is technically
+ // superfluous.
+ #[allow(unused_unsafe)]
+ let slices = unsafe {
+ core::slice::from_raw_parts(
+ slices_bytes.as_ptr() as *const u32,
+ pair_count,
+ )
+ };
+
+ // Read the total number of unique pattern IDs (which is always 1 more
+ // than the maximum pattern ID in this automaton, since pattern IDs are
+ // handed out contiguously starting at 0).
+ let (patterns, nr) =
+ bytes::try_read_u32_as_usize(slice, "pattern count")?;
+ slice = &slice[nr..];
+
+ // Now read the pattern ID count. We don't need to store this
+ // explicitly, but we need it to know how many pattern IDs to read.
+ let (idcount, nr) =
+ bytes::try_read_u32_as_usize(slice, "pattern ID count")?;
+ slice = &slice[nr..];
+
+ // Read the actual pattern IDs.
+ let pattern_ids_len =
+ bytes::mul(idcount, PatternID::SIZE, "pattern ID byte length")?;
+ bytes::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?;
+ bytes::check_alignment::<PatternID>(slice)?;
+ let pattern_ids_bytes = &slice[..pattern_ids_len];
+ slice = &slice[pattern_ids_len..];
+ // SAFETY: Since PatternID is always representable as a u32, all we
+ // need to do is ensure that we have the proper length and alignment.
+ // We've checked both above, so the cast below is safe.
+ //
+ // N.B. This is one of the few not-safe snippets in this function, so
+ // we mark it explicitly to call it out, even though it is technically
+ // superfluous.
+ #[allow(unused_unsafe)]
+ let pattern_ids = unsafe {
+ core::slice::from_raw_parts(
+ pattern_ids_bytes.as_ptr() as *const u32,
+ idcount,
+ )
+ };
+
+ let ms = MatchStates { slices, pattern_ids, patterns };
+ Ok((ms, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl MatchStates<Vec<u32>> {
+ fn empty(pattern_count: usize) -> MatchStates<Vec<u32>> {
+ assert!(pattern_count <= PatternID::LIMIT);
+ MatchStates {
+ slices: vec![],
+ pattern_ids: vec![],
+ patterns: pattern_count,
+ }
+ }
+
+ fn new(
+ matches: &BTreeMap<StateID, Vec<PatternID>>,
+ pattern_count: usize,
+ ) -> Result<MatchStates<Vec<u32>>, Error> {
+ let mut m = MatchStates::empty(pattern_count);
+ for (_, pids) in matches.iter() {
+ let start = PatternID::new(m.pattern_ids.len())
+ .map_err(|_| Error::too_many_match_pattern_ids())?;
+ m.slices.push(start.as_u32());
+ // This is always correct since the number of patterns in a single
+ // match state can never exceed maximum number of allowable
+ // patterns. Why? Because a pattern can only appear once in a
+ // particular match state, by construction. (And since our pattern
+ // ID limit is one less than u32::MAX, we're guaranteed that the
+ // length fits in a u32.)
+ m.slices.push(u32::try_from(pids.len()).unwrap());
+ for &pid in pids {
+ m.pattern_ids.push(pid.as_u32());
+ }
+ }
+ m.patterns = pattern_count;
+ Ok(m)
+ }
+
+ fn new_with_map(
+ &self,
+ matches: &BTreeMap<StateID, Vec<PatternID>>,
+ ) -> Result<MatchStates<Vec<u32>>, Error> {
+ MatchStates::new(matches, self.patterns)
+ }
+}
+
+impl<T: AsRef<[u32]>> MatchStates<T> {
+ /// Writes a serialized form of these match states to the buffer given. If
+ /// the buffer is too small, then an error is returned. To determine how
+ /// big the buffer must be, use `write_to_len`.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("match states"));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write state ID count
+ // Unwrap is OK since number of states is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.count()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write slice offset pairs
+ for &pid in self.slices() {
+ let n = bytes::write_pattern_id::<E>(pid, &mut dst);
+ dst = &mut dst[n..];
+ }
+
+ // write unique pattern ID count
+ // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write pattern ID count
+ // Unwrap is OK since we check at construction (and deserialization)
+ // that the number of patterns is representable as a u32.
+ E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write pattern IDs
+ for &pid in self.pattern_ids() {
+ let n = bytes::write_pattern_id::<E>(pid, &mut dst);
+ dst = &mut dst[n..];
+ }
+
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of these match states
+ /// will use.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // match state count
+ + (self.slices().len() * PatternID::SIZE)
+ + size_of::<u32>() // unique pattern ID count
+ + size_of::<u32>() // pattern ID count
+ + (self.pattern_ids().len() * PatternID::SIZE)
+ }
+
+ /// Validates that the match state info is itself internally consistent and
+ /// consistent with the recorded match state region in the given DFA.
+ fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
+ if self.count() != dfa.special.match_len(dfa.stride()) {
+ return Err(DeserializeError::generic(
+ "match state count mismatch",
+ ));
+ }
+ for si in 0..self.count() {
+ let start = self.slices()[si * 2].as_usize();
+ let len = self.slices()[si * 2 + 1].as_usize();
+ if start >= self.pattern_ids().len() {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID start offset",
+ ));
+ }
+ if start + len > self.pattern_ids().len() {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID length",
+ ));
+ }
+ for mi in 0..len {
+ let pid = self.pattern_id(si, mi);
+ if pid.as_usize() >= self.patterns {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID",
+ ));
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Converts these match states back into their map form. This is useful
+ /// when shuffling states, as the normal MatchStates representation is not
+ /// amenable to easy state swapping. But with this map, to swap id1 and
+ /// id2, all you need to do is:
+ ///
+ /// if let Some(pids) = map.remove(&id1) {
+ /// map.insert(id2, pids);
+ /// }
+ ///
+ /// Once shuffling is done, use MatchStates::new to convert back.
+ #[cfg(feature = "alloc")]
+ fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> {
+ let mut map = BTreeMap::new();
+ for i in 0..self.count() {
+ let mut pids = vec![];
+ for j in 0..self.pattern_len(i) {
+ pids.push(self.pattern_id(i, j));
+ }
+ map.insert(self.match_state_id(dfa, i), pids);
+ }
+ map
+ }
+
+ /// Converts these match states to a borrowed value.
+ fn as_ref(&self) -> MatchStates<&'_ [u32]> {
+ MatchStates {
+ slices: self.slices.as_ref(),
+ pattern_ids: self.pattern_ids.as_ref(),
+ patterns: self.patterns,
+ }
+ }
+
+ /// Converts these match states to an owned value.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> MatchStates<Vec<u32>> {
+ MatchStates {
+ slices: self.slices.as_ref().to_vec(),
+ pattern_ids: self.pattern_ids.as_ref().to_vec(),
+ patterns: self.patterns,
+ }
+ }
+
+ /// Returns the match state ID given the match state index. (Where the
+ /// first match state corresponds to index 0.)
+ ///
+ /// This panics if there is no match state at the given index.
+ fn match_state_id(&self, dfa: &DFA<T>, index: usize) -> StateID {
+ assert!(dfa.special.matches(), "no match states to index");
+ // This is one of the places where we rely on the fact that match
+ // states are contiguous in the transition table. Namely, that the
+ // first match state ID always corresponds to dfa.special.min_match.
+ // From there, since we know the stride, we can compute the ID of any
+ // match state given its index.
+ let stride2 = u32::try_from(dfa.stride2()).unwrap();
+ let offset = index.checked_shl(stride2).unwrap();
+ let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap();
+ let sid = StateID::new(id).unwrap();
+ assert!(dfa.is_match_state(sid));
+ sid
+ }
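+
+ // For example (with hypothetical numbers): if stride2 is 6 and
+ // min_match is the premultiplied ID 320, then the match state at
+ // index 1 has ID 320 + (1 << 6) = 384.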
+
+ /// Returns the pattern ID at the given match index for the given match
+ /// state.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA.
+ ///
+ /// The match index is the index of the pattern ID for the given state.
+ /// The index must be less than `self.pattern_len(state_index)`.
+ fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID {
+ self.pattern_id_slice(state_index)[match_index]
+ }
+
+ /// Returns the number of patterns in the given match state.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA.
+ fn pattern_len(&self, state_index: usize) -> usize {
+ self.slices()[state_index * 2 + 1].as_usize()
+ }
+
+ /// Returns all of the pattern IDs for the given match state index.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA.
+ fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] {
+ let start = self.slices()[state_index * 2].as_usize();
+ let len = self.pattern_len(state_index);
+ &self.pattern_ids()[start..start + len]
+ }
+
+ /// Returns the pattern ID offset slice of u32 as a slice of PatternID.
+ fn slices(&self) -> &[PatternID] {
+ let integers = self.slices.as_ref();
+ // SAFETY: This is safe because PatternID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts(
+ integers.as_ptr() as *const PatternID,
+ integers.len(),
+ )
+ }
+ }
+
+ /// Returns the total number of match states.
+ fn count(&self) -> usize {
+ assert_eq!(0, self.slices().len() % 2);
+ self.slices().len() / 2
+ }
+
+ /// Returns the pattern ID slice of u32 as a slice of PatternID.
+ fn pattern_ids(&self) -> &[PatternID] {
+ let integers = self.pattern_ids.as_ref();
+ // SAFETY: This is safe because PatternID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts(
+ integers.as_ptr() as *const PatternID,
+ integers.len(),
+ )
+ }
+ }
+
+ /// Return the memory usage, in bytes, of these match pairs.
+ fn memory_usage(&self) -> usize {
+ (self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE
+ }
+}
+
+/// An iterator over all states in a DFA.
+///
+/// This iterator yields a tuple for each state. The first element of the
+/// tuple corresponds to a state's identifier, and the second element
+/// corresponds to the state itself (comprised of its transitions).
+///
+/// `'a` corresponds to the lifetime of the original DFA and `T` corresponds
+/// to the type of the transition table itself.
+pub(crate) struct StateIter<'a, T> {
+ tt: &'a TransitionTable<T>,
+ it: iter::Enumerate<slice::Chunks<'a, StateID>>,
+}
+
+impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> {
+ type Item = State<'a>;
+
+ fn next(&mut self) -> Option<State<'a>> {
+ self.it.next().map(|(index, _)| {
+ let id = self.tt.from_index(index);
+ self.tt.state(id)
+ })
+ }
+}
+
+/// An immutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table.
+pub(crate) struct State<'a> {
+ id: StateID,
+ stride2: usize,
+ transitions: &'a [StateID],
+}
+
+impl<'a> State<'a> {
+ /// Return an iterator over all transitions in this state. This yields
+ /// a number of transitions equivalent to the alphabet length of the
+ /// corresponding DFA.
+ ///
+ /// Each transition is represented by a tuple. The first element is
+ /// the input byte for that transition and the second element is the
+ /// transition itself.
+ pub(crate) fn transitions(&self) -> StateTransitionIter<'_> {
+ StateTransitionIter {
+ len: self.transitions.len(),
+ it: self.transitions.iter().enumerate(),
+ }
+ }
+
+ /// Return an iterator over a sparse representation of the transitions in
+ /// this state. Only non-dead transitions are returned.
+ ///
+ /// The "sparse" representation in this case corresponds to a sequence of
+ /// triples. The first two elements of the triple comprise an inclusive
+ /// byte range while the last element corresponds to the transition taken
+ /// for all bytes in the range.
+ ///
+ /// This is somewhat more condensed than the classical sparse
+ /// representation (where you have an element for every non-dead
+ /// transition), but in practice, checking if a byte is in a range is very
+ /// cheap and using ranges tends to conserve quite a bit more space.
+ pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> {
+ StateSparseTransitionIter { dense: self.transitions(), cur: None }
+ }
+
+ /// Returns the identifier for this state.
+ pub(crate) fn id(&self) -> StateID {
+ self.id
+ }
+
+ /// Analyzes this state to determine whether it can be accelerated. If so,
+ /// it returns an accelerator that contains at least one byte.
+ #[cfg(feature = "alloc")]
+ fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> {
+ // We just try to add bytes to our accelerator. Once adding fails
+ // (because we've added too many bytes), then give up.
+ let mut accel = Accel::new();
+ for (class, id) in self.transitions() {
+ if id == self.id() {
+ continue;
+ }
+ for unit in classes.elements(class) {
+ if let Some(byte) = unit.as_u8() {
+ if !accel.add(byte) {
+ return None;
+ }
+ }
+ }
+ }
+ if accel.is_empty() {
+ None
+ } else {
+ Some(accel)
+ }
+ }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ for (i, (start, end, id)) in self.sparse_transitions().enumerate() {
+ let index = if f.alternate() {
+ id.as_usize()
+ } else {
+ id.as_usize() >> self.stride2
+ };
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end {
+ write!(f, "{:?} => {:?}", start, index)?;
+ } else {
+ write!(f, "{:?}-{:?} => {:?}", start, end, index)?;
+ }
+ }
+ Ok(())
+ }
+}
+
+/// A mutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table.
+#[cfg(feature = "alloc")]
+pub(crate) struct StateMut<'a> {
+ id: StateID,
+ stride2: usize,
+ transitions: &'a mut [StateID],
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> StateMut<'a> {
+ /// Return an iterator over all transitions in this state. This yields
+ /// a number of transitions equivalent to the alphabet length of the
+ /// corresponding DFA.
+ ///
+ /// Each transition is represented by a tuple. The first element is the
+ /// input byte for that transition and the second element is a mutable
+ /// reference to the transition itself.
+ pub(crate) fn iter_mut(&mut self) -> StateTransitionIterMut<'_> {
+ StateTransitionIterMut {
+ len: self.transitions.len(),
+ it: self.transitions.iter_mut().enumerate(),
+ }
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> fmt::Debug for StateMut<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ fmt::Debug::fmt(
+ &State {
+ id: self.id,
+ stride2: self.stride2,
+ transitions: self.transitions,
+ },
+ f,
+ )
+ }
+}
+
+/// An iterator over all transitions in a single DFA state. This yields
+/// a number of transitions equivalent to the alphabet length of the
+/// corresponding DFA.
+///
+/// Each transition is represented by a tuple. The first element is the input
+/// byte for that transition and the second element is the transition itself.
+#[derive(Debug)]
+pub(crate) struct StateTransitionIter<'a> {
+ len: usize,
+ it: iter::Enumerate<slice::Iter<'a, StateID>>,
+}
+
+impl<'a> Iterator for StateTransitionIter<'a> {
+ type Item = (alphabet::Unit, StateID);
+
+ fn next(&mut self) -> Option<(alphabet::Unit, StateID)> {
+ self.it.next().map(|(i, &id)| {
+ let unit = if i + 1 == self.len {
+ alphabet::Unit::eoi(i)
+ } else {
+ let b = u8::try_from(i)
+ .expect("raw byte alphabet is never exceeded");
+ alphabet::Unit::u8(b)
+ };
+ (unit, id)
+ })
+ }
+}
+
+/// A mutable iterator over all transitions in a DFA state.
+///
+/// Each transition is represented by a tuple. The first element is the
+/// input byte for that transition and the second element is a mutable
+/// reference to the transition itself.
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+pub(crate) struct StateTransitionIterMut<'a> {
+ len: usize,
+ it: iter::Enumerate<slice::IterMut<'a, StateID>>,
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for StateTransitionIterMut<'a> {
+ type Item = (alphabet::Unit, &'a mut StateID);
+
+ fn next(&mut self) -> Option<(alphabet::Unit, &'a mut StateID)> {
+ self.it.next().map(|(i, id)| {
+ let unit = if i + 1 == self.len {
+ alphabet::Unit::eoi(i)
+ } else {
+ let b = u8::try_from(i)
+ .expect("raw byte alphabet is never exceeded");
+ alphabet::Unit::u8(b)
+ };
+ (unit, id)
+ })
+ }
+}
+
+/// An iterator over all non-DEAD transitions in a single DFA state using a
+/// sparse representation.
+///
+/// Each transition is represented by a triple. The first two elements of the
+/// triple comprise an inclusive byte range while the last element corresponds
+/// to the transition taken for all bytes in the range.
+///
+/// As a convenience, this always returns `alphabet::Unit` values of the same
+/// type. That is, you'll never get a (byte, EOI) or an (EOI, byte). Only (byte,
+/// byte) and (EOI, EOI) values are yielded.
+#[derive(Debug)]
+pub(crate) struct StateSparseTransitionIter<'a> {
+ dense: StateTransitionIter<'a>,
+ cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>,
+}
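+
+// A hypothetical example of the condensed representation: a state that
+// maps every byte in b'a'..=b'z' to some state S, and everything else to
+// DEAD, yields the single triple (b'a', b'z', S) rather than 26 dense
+// entries.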
+
+impl<'a> Iterator for StateSparseTransitionIter<'a> {
+ type Item = (alphabet::Unit, alphabet::Unit, StateID);
+
+ fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> {
+ while let Some((unit, next)) = self.dense.next() {
+ let (prev_start, prev_end, prev_next) = match self.cur {
+ Some(t) => t,
+ None => {
+ self.cur = Some((unit, unit, next));
+ continue;
+ }
+ };
+ if prev_next == next && !unit.is_eoi() {
+ self.cur = Some((prev_start, unit, prev_next));
+ } else {
+ self.cur = Some((unit, unit, next));
+ if prev_next != DEAD {
+ return Some((prev_start, prev_end, prev_next));
+ }
+ }
+ }
+ if let Some((start, end, next)) = self.cur.take() {
+ if next != DEAD {
+ return Some((start, end, next));
+ }
+ }
+ None
+ }
+}
+
+/// An iterator over pattern IDs for a single match state.
+#[derive(Debug)]
+pub(crate) struct PatternIDIter<'a>(slice::Iter<'a, PatternID>);
+
+impl<'a> Iterator for PatternIDIter<'a> {
+ type Item = PatternID;
+
+ fn next(&mut self) -> Option<PatternID> {
+ self.0.next().copied()
+ }
+}
+
+/// Remapper is an abstraction that manages the remapping of state IDs in a
+/// dense DFA. This is useful when one wants to shuffle states into different
+/// positions in the DFA.
+///
+/// One of the key complexities this manages is the ability to correctly move
+/// one state multiple times.
+///
+/// Once shuffling is complete, `remap` should be called, which will rewrite
+/// all pertinent transitions to updated state IDs.
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+struct Remapper {
+ /// A map from the index of a state to its pre-multiplied identifier.
+ ///
+ /// When a state is swapped with another, then their corresponding
+ /// locations in this map are also swapped. Thus, its new position will
+ /// still point to its old pre-multiplied StateID.
+ ///
+ /// While there is a bit more to it, this then allows us to rewrite the
+ /// state IDs in a DFA's transition table in a single pass. This is done
+ /// by iterating over every ID in this map, then iterating over each
+ /// transition for the state at that ID and re-mapping the transition from
+ /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
+ /// in this map where `old_id` *started*, and set it to where it ended up
+ /// after all swaps have been completed.
+ map: Vec<StateID>,
+}
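+
+// A hypothetical worked example of the map above: with states A, B and C
+// at indices 0, 1 and 2, the map starts as [A, B, C]. After swap(A, C),
+// it is [C, B, A]. Remapping a transition that still points at A looks up
+// map[to_index(A)] = map[0] = C, which is where A's old contents now
+// live. (Chains of swaps are resolved by the fixup loop in `remap`.)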
+
+#[cfg(feature = "alloc")]
+impl Remapper {
+ fn from_dfa(dfa: &OwnedDFA) -> Remapper {
+ Remapper {
+ map: (0..dfa.state_count()).map(|i| dfa.from_index(i)).collect(),
+ }
+ }
+
+ fn swap(&mut self, dfa: &mut OwnedDFA, id1: StateID, id2: StateID) {
+ dfa.swap_states(id1, id2);
+ self.map.swap(dfa.to_index(id1), dfa.to_index(id2));
+ }
+
+ fn remap(mut self, dfa: &mut OwnedDFA) {
+ // Update the map to account for states that have been swapped
+ // multiple times. For example, if (A, C) and (C, G) are swapped, then
+ // transitions previously pointing to A should now point to G. But if
+ // we don't update our map, they will erroneously be set to C. All we
+ // do is follow the swaps in our map until we see our original state
+ // ID.
+ let oldmap = self.map.clone();
+ for i in 0..dfa.state_count() {
+ let cur_id = dfa.from_index(i);
+ let mut new = oldmap[i];
+ if cur_id == new {
+ continue;
+ }
+ loop {
+ let id = oldmap[dfa.to_index(new)];
+ if cur_id == id {
+ self.map[i] = new;
+ break;
+ }
+ new = id;
+ }
+ }
+
+ // A local closure for converting state IDs to indices, used to work
+ // around the borrow checker: we cannot borrow self while mutably
+ // iterating over a state's transitions. Otherwise, we'd just use
+ // dfa.to_index(..).
+ let stride2 = dfa.stride2();
+ let to_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
+
+ // Now that we've finished shuffling, we need to remap all of our
+ // transitions. We don't need to handle re-mapping accelerated states
+ // since `accels` is only populated after shuffling.
+ for &id in self.map.iter() {
+ for (_, next_id) in dfa.state_mut(id).iter_mut() {
+ *next_id = self.map[to_index(*next_id)];
+ }
+ }
+ for start_id in dfa.st.table_mut().iter_mut() {
+ *start_id = self.map[to_index(*start_id)];
+ }
+ }
+}
+
+#[cfg(all(test, feature = "alloc"))]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn errors_with_unicode_word_boundary() {
+ let pattern = r"\b";
+ assert!(Builder::new().build(pattern).is_err());
+ }
+
+ #[test]
+ fn roundtrip_never_match() {
+ let dfa = DFA::never_match().unwrap();
+ let (buf, _) = dfa.to_bytes_native_endian();
+ let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
+
+ assert_eq!(None, dfa.find_leftmost_fwd(b"foo12345").unwrap());
+ }
+
+ #[test]
+ fn roundtrip_always_match() {
+ use crate::HalfMatch;
+
+ let dfa = DFA::always_match().unwrap();
+ let (buf, _) = dfa.to_bytes_native_endian();
+ let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
+
+ assert_eq!(
+ Some(HalfMatch::must(0, 0)),
+ dfa.find_leftmost_fwd(b"foo12345").unwrap()
+ );
+ }
+}
diff --git a/vendor/regex-automata/src/dfa/determinize.rs b/vendor/regex-automata/src/dfa/determinize.rs
new file mode 100644
index 000000000..61603481b
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/determinize.rs
@@ -0,0 +1,547 @@
+use alloc::{
+ collections::BTreeMap,
+ vec::{self, Vec},
+};
+
+use crate::{
+ dfa::{dense, Error, DEAD},
+ nfa::thompson,
+ util::{
+ self,
+ alphabet::{self, ByteSet},
+ determinize::{State, StateBuilderEmpty, StateBuilderNFA},
+ id::{PatternID, StateID},
+ matchtypes::MatchKind,
+ sparse_set::{SparseSet, SparseSets},
+ start::Start,
+ },
+};
+
+/// A builder for configuring and running a DFA determinizer.
+#[derive(Clone, Debug)]
+pub(crate) struct Config {
+ anchored: bool,
+ match_kind: MatchKind,
+ quit: ByteSet,
+ dfa_size_limit: Option<usize>,
+ determinize_size_limit: Option<usize>,
+}
+
+impl Config {
+ /// Create a new default config for a determinizer. The determinizer may be
+ /// configured before calling `run`.
+ pub fn new() -> Config {
+ Config {
+ anchored: false,
+ match_kind: MatchKind::LeftmostFirst,
+ quit: ByteSet::empty(),
+ dfa_size_limit: None,
+ determinize_size_limit: None,
+ }
+ }
+
+ /// Run determinization on the given NFA and write the resulting DFA into
+ /// the one given. The DFA given should be initialized but otherwise empty.
+ /// "Initialized" means that it is setup to handle the NFA's byte classes,
+ /// number of patterns and whether to build start states for each pattern.
+ pub fn run(
+ &self,
+ nfa: &thompson::NFA,
+ dfa: &mut dense::OwnedDFA,
+ ) -> Result<(), Error> {
+ let dead = State::dead();
+ let quit = State::dead();
+ let mut cache = StateMap::default();
+ // We only insert the dead state here since its representation is
+ // identical to the quit state. And we never want anything pointing
+ // to the quit state other than specific transitions derived from the
+ // determinizer's configured "quit" bytes.
+ //
+ // We do put the quit state into 'builder_states' below. This ensures
+ // that a proper DFA state ID is allocated for it, and that no other
+ // DFA state uses the "location after the DEAD state." That is, it
+ // is assumed that the quit state is always the state immediately
+ // following the DEAD state.
+ cache.insert(dead.clone(), DEAD);
+
+ let runner = Runner {
+ config: self.clone(),
+ nfa,
+ dfa,
+ builder_states: alloc::vec![dead, quit],
+ cache,
+ memory_usage_state: 0,
+ sparses: SparseSets::new(nfa.len()),
+ stack: alloc::vec![],
+ scratch_state_builder: StateBuilderEmpty::new(),
+ };
+ runner.run()
+ }
+
+ /// Whether to build an anchored DFA or not. When disabled (the default),
+ /// the unanchored prefix from the NFA is used to start the DFA. Otherwise,
+ /// the anchored start state of the NFA is used to start the DFA.
+ pub fn anchored(&mut self, yes: bool) -> &mut Config {
+ self.anchored = yes;
+ self
+ }
+
+ /// The match semantics to use for determinization.
+ ///
+ /// MatchKind::All corresponds to the standard textbook construction.
+ /// All possible match states are represented in the DFA.
+ /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
+ /// simulate the match semantics of backtracking regex engines. Namely,
+ /// only a subset of match states are built, and dead states are used to
+ /// stop searches with an unanchored prefix.
+ ///
+ /// The default is MatchKind::LeftmostFirst.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
+ self.match_kind = kind;
+ self
+ }
+
+ /// The set of bytes to use that will cause the DFA to enter a quit state,
+ /// stop searching and return an error. By default, this is empty.
+ pub fn quit(&mut self, set: ByteSet) -> &mut Config {
+ self.quit = set;
+ self
+ }
+
+ /// The limit, in bytes, on the heap memory that the DFA is permitted to
+ /// use. This does not include the auxiliary heap storage used by
+ /// determinization.
+ pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
+ self.dfa_size_limit = bytes;
+ self
+ }
+
+ /// The limit, in bytes, on the heap memory that determinization itself is
+ /// allowed to use. This does not include the size of the DFA being built.
+ pub fn determinize_size_limit(
+ &mut self,
+ bytes: Option<usize>,
+ ) -> &mut Config {
+ self.determinize_size_limit = bytes;
+ self
+ }
+}
+
+/// The actual implementation of determinization that converts an NFA to a DFA
+/// through powerset construction.
+///
+/// This determinizer roughly follows the typical powerset construction, where
+/// each DFA state is comprised of one or more NFA states. In the worst case,
+/// there is one DFA state for every possible combination of NFA states. In
+/// practice, this only happens in certain conditions, typically when there are
+/// bounded repetitions.
+///
+/// The main differences between this implementation and typical determinization
+/// are that this implementation delays matches by one state and hackily makes
+/// look-around work. Comments below attempt to explain this.
+///
+/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
+/// whichever is shorter.
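+///
+/// As a hypothetical sketch of one construction step: if a DFA state
+/// corresponds to the NFA state set {2, 5}, then its transition on a byte
+/// `b` leads to the DFA state for the set of all NFA states reachable from
+/// 2 or 5 by reading `b`, closed under epsilon transitions.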
+#[derive(Debug)]
+struct Runner<'a> {
+ /// The configuration used to initialize determinization.
+ config: Config,
+ /// The NFA we're converting into a DFA.
+ nfa: &'a thompson::NFA,
+ /// The DFA we're building.
+ dfa: &'a mut dense::OwnedDFA,
+ /// Each DFA state being built is defined as an *ordered* set of NFA
+ /// states, along with some meta facts about the ordered set of NFA states.
+ ///
+ /// This is never empty. The first state is always a dummy state such that
+ /// a state id == 0 corresponds to a dead state. The second state is always
+ /// the quit state.
+ ///
+ /// Why do we have states in both a `Vec` and in a cache map below?
+ /// Well, they serve two different roles based on access patterns.
+ /// `builder_states` is the canonical home of each state, and provides
+ /// constant random access by a DFA state's ID. The cache map below, on
+ /// the other hand, provides a quick way of searching for identical DFA
+ /// states by using the DFA state as a key in the map. Of course, we use
+ /// reference counting to avoid actually duplicating the state's data
+ /// itself. (Although this has never been benchmarked.) Note that the cache
+ /// map does not give us full minimization; it just lets us avoid some very
+ /// obvious redundant states.
+ ///
+ /// Note that the index into this Vec isn't quite the DFA's state ID.
+ /// Rather, it's just an index. To get the state ID, you have to multiply
+ /// it by the DFA's stride. That's done by self.dfa.from_index. And the
+ /// inverse is self.dfa.to_index.
+ ///
+ /// Moreover, DFA states don't usually retain the IDs assigned to them
+ /// by their position in this Vec. After determinization completes,
+ /// states are shuffled around to support other optimizations. See the
+ /// sibling 'special' module for more details on that. (The reason for
+ /// mentioning this is that if you print out the DFA for debugging during
+ /// determinization, and then print out the final DFA after it is fully
+ /// built, then the state IDs likely won't match up.)
+ builder_states: Vec<State>,
+ /// A cache of DFA states that already exist and can be easily looked up
+ /// via ordered sets of NFA states.
+ ///
+ /// See `builder_states` docs for why we store states in two different
+ /// ways.
+ cache: StateMap,
+ /// The memory usage, in bytes, used by builder_states and cache. We track
+ /// this as new states are added since states use a variable amount of
+ /// heap. Tracking this as we add states makes it possible to compute the
+ /// total amount of memory used by the determinizer in constant time.
+ memory_usage_state: usize,
+ /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
+ /// These are reused throughout determinization. A bounded sparse set
+ /// gives us constant time insertion, membership testing and clearing.
+ sparses: SparseSets,
+ /// Scratch space for a stack of NFA states to visit, for depth first
+ /// visiting without recursion.
+ stack: Vec<StateID>,
+ /// Scratch space for storing an ordered sequence of NFA states, for
+ /// amortizing allocation. This is principally useful for when we avoid
+ /// adding a new DFA state since it already exists. In order to detect this
+ /// case though, we still need an ordered set of NFA state IDs. So we use
+ /// this space to stage that ordered set before we know whether we need to
+ /// create a new DFA state or not.
+ scratch_state_builder: StateBuilderEmpty,
+}
+
+/// A map from states to state identifiers. When using std, we use a standard
+/// hashmap, since it's a bit faster for this use case. (Other maps, like
+/// ones based on FNV, have not yet been benchmarked.)
+///
+/// The main purpose of this map is to reuse states where possible. This won't
+/// fully minimize the DFA, but it works well in a lot of cases.
+#[cfg(feature = "std")]
+type StateMap = std::collections::HashMap<State, StateID>;
+#[cfg(not(feature = "std"))]
+type StateMap = BTreeMap<State, StateID>;
+
+impl<'a> Runner<'a> {
+ /// Build the DFA. If there was a problem constructing the DFA (e.g., if
+ /// the chosen state identifier representation is too small), then an error
+ /// is returned.
+ fn run(mut self) -> Result<(), Error> {
+ if self.nfa.has_word_boundary_unicode()
+ && !self.config.quit.contains_range(0x80, 0xFF)
+ {
+ return Err(Error::unsupported_dfa_word_boundary_unicode());
+ }
+
+ // A sequence of "representative" bytes drawn from each equivalence
+ // class. These representative bytes are fed to the NFA to compute
+ // state transitions. This allows us to avoid re-computing state
+ // transitions for bytes that are guaranteed to produce identical
+ // results.
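+ // For example, in the regex '[a-z]+', no transition distinguishes
+ // between 'a' and 'z', so a single representative byte covers the
+ // entire class.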
+ let representatives: Vec<alphabet::Unit> =
+ self.dfa.byte_classes().representatives().collect();
+ // The set of all DFA state IDs that still need to have their
+ // transitions set. We start by seeding this with all starting states.
+ let mut uncompiled = alloc::vec![];
+ self.add_all_starts(&mut uncompiled)?;
+ while let Some(dfa_id) = uncompiled.pop() {
+ for &unit in &representatives {
+ if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
+ {
+ continue;
+ }
+ // In many cases, the state we transition to has already been
+ // computed. 'cached_state' will do the minimal amount of work
+ // to check this, and if it exists, immediately return an
+ // already existing state ID.
+ let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
+ self.dfa.set_transition(dfa_id, unit, next_dfa_id);
+ // If the state ID we got back is newly created, then we need
+ // to compile it, so add it to our uncompiled frontier.
+ if is_new {
+ uncompiled.push(next_dfa_id);
+ }
+ }
+ }
+ trace!(
+ "determinization complete, memory usage: {}, dense DFA size: {}",
+ self.memory_usage(),
+ self.dfa.memory_usage(),
+ );
+
+ // A map from DFA state ID to one or more NFA match IDs. Each NFA match
+ // ID corresponds to a distinct regex pattern that matches in the state
+ // corresponding to the key.
+ let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
+ self.cache.clear();
+ #[allow(unused_variables)]
+ let mut total_pat_count = 0;
+ for (i, state) in self.builder_states.into_iter().enumerate() {
+ if let Some(pat_ids) = state.match_pattern_ids() {
+ let id = self.dfa.from_index(i);
+ total_pat_count += pat_ids.len();
+ matches.insert(id, pat_ids);
+ }
+ }
+ log! {
+ use core::mem::size_of;
+ let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
+ let pats = total_pat_count * size_of::<PatternID>();
+ let mem = (matches.len() * per_elem) + pats;
+ log::trace!("matches map built, memory usage: {}", mem);
+ }
+ // At this point, we shuffle the "special" states in the final DFA.
+ // This permits a DFA's match loop to detect a match condition (among
+ // other things) by merely inspecting the current state's identifier,
+ // and avoids the need for any additional auxiliary storage.
+ self.dfa.shuffle(matches)?;
+ Ok(())
+ }
+
+ /// Return the identifier for the next DFA state given an existing DFA
+ /// state and an input byte. If the next DFA state already exists, then
+ /// return its identifier from the cache. Otherwise, build the state, cache
+ /// it and return its identifier.
+ ///
+ /// This routine returns a boolean indicating whether a new state was
+ /// built. If a new state is built, then the caller needs to add it to its
+ /// frontier of uncompiled DFA states to compute transitions for.
+ fn cached_state(
+ &mut self,
+ dfa_id: StateID,
+ unit: alphabet::Unit,
+ ) -> Result<(StateID, bool), Error> {
+ // Compute the set of all reachable NFA states, including epsilons.
+ let empty_builder = self.get_state_builder();
+ let builder = util::determinize::next(
+ self.nfa,
+ self.config.match_kind,
+ &mut self.sparses,
+ &mut self.stack,
+ &self.builder_states[self.dfa.to_index(dfa_id)],
+ unit,
+ empty_builder,
+ );
+ self.maybe_add_state(builder)
+ }
+
+ /// Compute the set of DFA start states and add their identifiers to
+ /// 'dfa_state_ids' (no duplicates are added).
+ fn add_all_starts(
+ &mut self,
+ dfa_state_ids: &mut Vec<StateID>,
+ ) -> Result<(), Error> {
+ // Always add the (possibly unanchored) start states for matching any
+ // of the patterns in this DFA.
+ self.add_start_group(None, dfa_state_ids)?;
+ // We only need to compute anchored start states for each pattern if
+ // that was requested.
+ if self.dfa.has_starts_for_each_pattern() {
+ for pid in PatternID::iter(self.dfa.pattern_count()) {
+ self.add_start_group(Some(pid), dfa_state_ids)?;
+ }
+ }
+ Ok(())
+ }
+
+ /// Add a group of start states for the given match pattern ID. Any new
+ /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
+ /// pushed.)
+ ///
+ /// When pattern_id is None, then this will compile a group of unanchored
+ /// start states (if the DFA is unanchored). When the pattern_id is
+ /// present, then this will compile a group of anchored start states that
+ /// only match the given pattern.
+ fn add_start_group(
+ &mut self,
+ pattern_id: Option<PatternID>,
+ dfa_state_ids: &mut Vec<StateID>,
+ ) -> Result<(), Error> {
+ let nfa_start = match pattern_id {
+ Some(pid) => self.nfa.start_pattern(pid),
+ None if self.config.anchored => self.nfa.start_anchored(),
+ None => self.nfa.start_unanchored(),
+ };
+
+ // When compiling start states, we're careful not to build additional
+ // states that aren't necessary. For example, if the NFA has no word
+ // boundary assertion, then there's no reason to have distinct start
+ // states for 'NonWordByte' and 'WordByte' starting configurations.
+ // Instead, the 'WordByte' starting configuration can just point
+ // directly to the start state for the 'NonWordByte' config.
+
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::NonWordByte)?;
+ self.dfa.set_start_state(Start::NonWordByte, pattern_id, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ if !self.nfa.has_word_boundary() {
+ self.dfa.set_start_state(Start::WordByte, pattern_id, id);
+ } else {
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::WordByte)?;
+ self.dfa.set_start_state(Start::WordByte, pattern_id, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+ }
+ if !self.nfa.has_any_anchor() {
+ self.dfa.set_start_state(Start::Text, pattern_id, id);
+ self.dfa.set_start_state(Start::Line, pattern_id, id);
+ } else {
+ let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
+ self.dfa.set_start_state(Start::Text, pattern_id, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?;
+ self.dfa.set_start_state(Start::Line, pattern_id, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Add a new DFA start state corresponding to the given starting NFA
+ /// state, and the starting search configuration. (The starting search
+ /// configuration essentially tells us which look-behind assertions are
+ /// true for this particular state.)
+ ///
+ /// The boolean returned indicates whether the state ID returned is a newly
+ /// created state, or a previously cached state.
+ fn add_one_start(
+ &mut self,
+ nfa_start: StateID,
+ start: Start,
+ ) -> Result<(StateID, bool), Error> {
+ // Compute the look-behind assertions that are true in this starting
+ // configuration, and then determine the epsilon closure. While
+ // computing the epsilon closure, we only follow conditional epsilon
+ // transitions that satisfy the look-behind assertions in 'look_have'.
+ let mut builder_matches = self.get_state_builder().into_matches();
+ util::determinize::set_lookbehind_from_start(
+ &start,
+ &mut builder_matches,
+ );
+ self.sparses.set1.clear();
+ util::determinize::epsilon_closure(
+ self.nfa,
+ nfa_start,
+ *builder_matches.look_have(),
+ &mut self.stack,
+ &mut self.sparses.set1,
+ );
+ let mut builder = builder_matches.into_nfa();
+ util::determinize::add_nfa_states(
+ &self.nfa,
+ &self.sparses.set1,
+ &mut builder,
+ );
+ self.maybe_add_state(builder)
+ }
+
+ /// Adds the given state to the DFA being built depending on whether it
+ /// already exists in this determinizer's cache.
+ ///
+ /// If it does exist, then the memory used by 'state' is put back into the
+ /// determinizer and the previously created state's ID is returned. (Along
+ /// with 'false', indicating that no new state was added.)
+ ///
+ /// If it does not exist, then the state is added to the DFA being built
+ /// and a fresh ID is allocated (if ID allocation fails, then an error is
+ /// returned) and returned. (Along with 'true', indicating that a new state
+ /// was added.)
+ fn maybe_add_state(
+ &mut self,
+ builder: StateBuilderNFA,
+ ) -> Result<(StateID, bool), Error> {
+ if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
+ // Since we have a cached state, put the constructed state's
+ // memory back into our scratch space, so that it can be reused.
+ self.put_state_builder(builder);
+ return Ok((cached_id, false));
+ }
+ self.add_state(builder).map(|sid| (sid, true))
+ }
+
+ /// Add the given state to the DFA and make it available in the cache.
+ ///
+ /// The state initially has no transitions. That is, it transitions to the
+ /// dead state for all possible inputs, and transitions to the quit state
+ /// for all quit bytes.
+ ///
+ /// If adding the state would exceed the maximum value for StateID, then an
+ /// error is returned.
+ fn add_state(
+ &mut self,
+ builder: StateBuilderNFA,
+ ) -> Result<StateID, Error> {
+ let id = self.dfa.add_empty_state()?;
+ if !self.config.quit.is_empty() {
+ for b in self.config.quit.iter() {
+ self.dfa.set_transition(
+ id,
+ alphabet::Unit::u8(b),
+ self.dfa.quit_id(),
+ );
+ }
+ }
+ let state = builder.to_state();
+ // States use reference counting internally, so we only need to count
+ // their memory usage once.
+ self.memory_usage_state += state.memory_usage();
+ self.builder_states.push(state.clone());
+ self.cache.insert(state, id);
+ self.put_state_builder(builder);
+ if let Some(limit) = self.config.dfa_size_limit {
+ if self.dfa.memory_usage() > limit {
+ return Err(Error::dfa_exceeded_size_limit(limit));
+ }
+ }
+ if let Some(limit) = self.config.determinize_size_limit {
+ if self.memory_usage() > limit {
+ return Err(Error::determinize_exceeded_size_limit(limit));
+ }
+ }
+ Ok(id)
+ }
+
+ /// Returns a state builder from this determinizer that might have existing
+ /// capacity. This helps avoid allocs in cases where a state is built that
+ /// turns out to already be cached.
+ ///
+ /// Callers must put the state builder back with 'put_state_builder',
+ /// otherwise the allocation reuse won't work.
+ fn get_state_builder(&mut self) -> StateBuilderEmpty {
+ core::mem::replace(
+ &mut self.scratch_state_builder,
+ StateBuilderEmpty::new(),
+ )
+ }
+
+ /// Puts the given state builder back into this determinizer for reuse.
+ ///
+ /// Note that building a 'State' from a builder always creates a new
+ /// alloc, so callers should always put the builder back.
+ fn put_state_builder(&mut self, builder: StateBuilderNFA) {
+ let _ = core::mem::replace(
+ &mut self.scratch_state_builder,
+ builder.clear(),
+ );
+ }
+
+ /// Return the memory usage, in bytes, of this determinizer at the current
+ /// point in time. This does not include memory used by the NFA or the
+ /// dense DFA itself.
+ fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ self.builder_states.len() * size_of::<State>()
+ // Maps likely use more memory than this, but it's probably close.
+ + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
+ + self.memory_usage_state
+ + self.stack.capacity() * size_of::<StateID>()
+ + self.scratch_state_builder.capacity()
+ }
+}
diff --git a/vendor/regex-automata/src/dfa/error.rs b/vendor/regex-automata/src/dfa/error.rs
new file mode 100644
index 000000000..6497a4cff
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/error.rs
@@ -0,0 +1,162 @@
+use crate::{
+ nfa,
+ util::{
+ id::{PatternID, StateID},
+ start::Start,
+ },
+};
+
+/// An error that occurred during the construction of a DFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`nfa::thompson::Error`] type from its `source`
+/// method via the `std::error::Error` trait. This error only occurs when using
+/// convenience routines for building a DFA directly from a pattern string.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct Error {
+ kind: ErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a DFA.
+///
+/// Note that this error is non-exhaustive. Adding new variants is not
+/// considered a breaking change.
+#[derive(Clone, Debug)]
+enum ErrorKind {
+ /// An error that occurred while constructing an NFA as a precursor step
+ /// before a DFA is compiled.
+ NFA(nfa::thompson::Error),
+ /// An error that occurred because an unsupported regex feature was used.
+ /// The message string describes which unsupported feature was used.
+ ///
+ /// The primary regex feature that is unsupported by DFAs is the Unicode
+ /// word boundary look-around assertion (`\b`). This can be worked around
+ /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling the
+ /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary)
+ /// option when building a DFA.
+ Unsupported(&'static str),
+ /// An error that occurs if too many states are produced while building a
+ /// DFA.
+ TooManyStates,
+ /// An error that occurs if too many start states are needed while building
+ /// a DFA.
+ ///
+ /// This is a kind of oddball error that occurs when building a DFA with
+ /// start states enabled for each pattern and enough patterns to cause
+ /// the table of start states to overflow `usize`.
+ TooManyStartStates,
+ /// This is another oddball error that can occur if there are too many
+ /// patterns spread out across too many match states.
+ TooManyMatchPatternIDs,
+ /// An error that occurs if the DFA got too big during determinization.
+ DFAExceededSizeLimit { limit: usize },
+ /// An error that occurs if auxiliary storage (not the DFA) used during
+ /// determinization got too big.
+ DeterminizeExceededSizeLimit { limit: usize },
+}
+
+impl Error {
+ /// Return the kind of this error.
+ fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn nfa(err: nfa::thompson::Error) -> Error {
+ Error { kind: ErrorKind::NFA(err) }
+ }
+
+ pub(crate) fn unsupported_dfa_word_boundary_unicode() -> Error {
+ let msg = "cannot build DFAs for regexes with Unicode word \
+ boundaries; switch to ASCII word boundaries, or \
+ heuristically enable Unicode word boundaries or use a \
+ different regex engine";
+ Error { kind: ErrorKind::Unsupported(msg) }
+ }
+
+ pub(crate) fn too_many_states() -> Error {
+ Error { kind: ErrorKind::TooManyStates }
+ }
+
+ pub(crate) fn too_many_start_states() -> Error {
+ Error { kind: ErrorKind::TooManyStartStates }
+ }
+
+ pub(crate) fn too_many_match_pattern_ids() -> Error {
+ Error { kind: ErrorKind::TooManyMatchPatternIDs }
+ }
+
+ pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> Error {
+ Error { kind: ErrorKind::DFAExceededSizeLimit { limit } }
+ }
+
+ pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> Error {
+ Error { kind: ErrorKind::DeterminizeExceededSizeLimit { limit } }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match self.kind() {
+ ErrorKind::NFA(ref err) => Some(err),
+ ErrorKind::Unsupported(_) => None,
+ ErrorKind::TooManyStates => None,
+ ErrorKind::TooManyStartStates => None,
+ ErrorKind::TooManyMatchPatternIDs => None,
+ ErrorKind::DFAExceededSizeLimit { .. } => None,
+ ErrorKind::DeterminizeExceededSizeLimit { .. } => None,
+ }
+ }
+}
+
+impl core::fmt::Display for Error {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind() {
+ ErrorKind::NFA(_) => write!(f, "error building NFA"),
+ ErrorKind::Unsupported(ref msg) => {
+ write!(f, "unsupported regex feature for DFAs: {}", msg)
+ }
+ ErrorKind::TooManyStates => write!(
+ f,
+ "number of DFA states exceeds limit of {}",
+ StateID::LIMIT,
+ ),
+ ErrorKind::TooManyStartStates => {
+ let stride = Start::count();
+ // The start table has `stride` entries for starting states for
+ // the entire DFA, and then `stride` entries for each pattern
+ // if start states for each pattern are enabled (which is the
+ // only way this error can occur). Thus, the total number of
+ // patterns that can fit in the table is `stride` less than
+ // what we can allocate.
+ let limit = ((core::isize::MAX as usize) - stride) / stride;
+ write!(
+ f,
+ "compiling DFA with start states exceeds pattern \
+ pattern limit of {}",
+ limit,
+ )
+ }
+ ErrorKind::TooManyMatchPatternIDs => write!(
+ f,
+ "compiling DFA with total patterns in all match states \
+ exceeds limit of {}",
+ PatternID::LIMIT,
+ ),
+ ErrorKind::DFAExceededSizeLimit { limit } => write!(
+ f,
+ "DFA exceeded size limit of {:?} during determinization",
+ limit,
+ ),
+ ErrorKind::DeterminizeExceededSizeLimit { limit } => {
+ write!(f, "determinization exceeded size limit of {:?}", limit)
+ }
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/minimize.rs b/vendor/regex-automata/src/dfa/minimize.rs
index ededa5f66..80e2f4e73 100644
--- a/vendor/regex-automata/src/minimize.rs
+++ b/vendor/regex-automata/src/dfa/minimize.rs
@@ -1,12 +1,14 @@
-use std::cell::RefCell;
-use std::fmt;
-use std::mem;
-use std::rc::Rc;
+use core::{cell::RefCell, fmt, mem};
-use dense;
-use state_id::{dead_id, StateID};
+use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
-type DFARepr<S> = dense::Repr<Vec<S>, S>;
+use crate::{
+ dfa::{automaton::Automaton, dense, DEAD},
+ util::{
+ alphabet,
+ id::{PatternID, StateID},
+ },
+};
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
@@ -39,15 +41,15 @@ type DFARepr<S> = dense::Repr<Vec<S>, S>;
/// point during NFA compilation via the algorithm described in the
/// "Incremental Construction of MinimalAcyclic Finite-State Automata"
/// paper.)
-pub(crate) struct Minimizer<'a, S: 'a> {
- dfa: &'a mut DFARepr<S>,
- in_transitions: Vec<Vec<Vec<S>>>,
- partitions: Vec<StateSet<S>>,
- waiting: Vec<StateSet<S>>,
+pub(crate) struct Minimizer<'a> {
+ dfa: &'a mut dense::OwnedDFA,
+ in_transitions: Vec<Vec<Vec<StateID>>>,
+ partitions: Vec<StateSet>,
+ waiting: Vec<StateSet>,
}
-impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+impl<'a> fmt::Debug for Minimizer<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Minimizer")
.field("dfa", &self.dfa)
.field("in_transitions", &self.in_transitions)
@@ -70,26 +72,47 @@ impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
-struct StateSet<S>(Rc<RefCell<Vec<S>>>);
+struct StateSet {
+ ids: Rc<RefCell<Vec<StateID>>>,
+}
-impl<'a, S: StateID> Minimizer<'a, S> {
- pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
+impl<'a> Minimizer<'a> {
+ pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
let in_transitions = Minimizer::incoming_transitions(dfa);
let partitions = Minimizer::initial_partitions(dfa);
- let waiting = vec![partitions[0].clone()];
-
+ let waiting = partitions.clone();
Minimizer { dfa, in_transitions, partitions, waiting }
}
pub fn run(mut self) {
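+ // Dense DFA state IDs are premultiplied by the DFA's stride, so
+ // converting between an ID and its index is just a shift by 'stride2'
+ // in one direction or the other.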
+ let stride2 = self.dfa.stride2();
+ let as_state_id = |index: usize| -> StateID {
+ StateID::new(index << stride2).unwrap()
+ };
+ let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
+
let mut incoming = StateSet::empty();
let mut scratch1 = StateSet::empty();
let mut scratch2 = StateSet::empty();
let mut newparts = vec![];
+ // This loop is basically Hopcroft's algorithm. Everything else is just
+ // shuffling data around to fit our representation.
while let Some(set) = self.waiting.pop() {
- for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
+ for b in self.dfa.byte_classes().iter() {
self.find_incoming_to(b, &set, &mut incoming);
+ // If incoming is empty, then the intersection with any other
+ // set must also be empty. So 'newparts' just ends up being
+ // 'self.partitions'. So there's no need to go through the loop
+ // below.
+ //
+ // This actually turns out to be a rather large optimization. On
+ // the order of making minimization 4-5x faster. It's likely
+ // that the vast majority of all states have very few incoming
+ // transitions.
+ if incoming.is_empty() {
+ continue;
+ }
for p in 0..self.partitions.len() {
self.partitions[p].intersection(&incoming, &mut scratch1);
@@ -135,38 +158,42 @@ impl<'a, S: StateID> Minimizer<'a, S> {
// Create a map from DFA state ID to the representative ID of the
// equivalence class to which it belongs. The representative ID of an
// equivalence class of states is the minimum ID in that class.
- let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
+ let mut state_to_part = vec![DEAD; self.dfa.state_count()];
for p in &self.partitions {
- p.iter(|id| state_to_part[id.to_usize()] = p.min());
+ p.iter(|id| state_to_part[as_index(id)] = p.min());
}
// Generate a new contiguous sequence of IDs for minimal states, and
// create a map from equivalence IDs to the new IDs. Thus, the new
// minimal ID of *any* state in the unminimized DFA can be obtained
// with minimals_ids[state_to_part[old_id]].
- let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
- let mut new_id = S::from_usize(0);
- for (id, _) in self.dfa.states() {
- if state_to_part[id.to_usize()] == id {
- minimal_ids[id.to_usize()] = new_id;
- new_id = S::from_usize(new_id.to_usize() + 1);
+ let mut minimal_ids = vec![DEAD; self.dfa.state_count()];
+ let mut new_index = 0;
+ for state in self.dfa.states() {
+ if state_to_part[as_index(state.id())] == state.id() {
+ minimal_ids[as_index(state.id())] = as_state_id(new_index);
+ new_index += 1;
}
}
// The total number of states in the minimal DFA.
- let minimal_count = new_id.to_usize();
+ let minimal_count = new_index;
+ // Convenience function for remapping state IDs. This takes an old ID,
+ // looks up its Hopcroft partition and then maps that to the new ID
+ // range.
+ let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];
// Re-map this DFA in place such that the only states remaining
// correspond to the representative states of every equivalence class.
- for id in (0..self.dfa.state_count()).map(S::from_usize) {
+ for id in (0..self.dfa.state_count()).map(as_state_id) {
// If this state isn't a representative for an equivalence class,
// then we skip it since it won't appear in the minimal DFA.
- if state_to_part[id.to_usize()] != id {
+ if state_to_part[as_index(id)] != id {
continue;
}
- for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
- *next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
+ for (_, next) in self.dfa.state_mut(id).iter_mut() {
+ *next = remap(*next);
}
- self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
+ self.dfa.swap_states(id, minimal_ids[as_index(id)]);
}
// Trim off all unused states from the pre-minimized DFA. This
// represents all states that were merged into a non-singleton
@@ -175,12 +202,30 @@ impl<'a, S: StateID> Minimizer<'a, S> {
// equivalence class is its representative ID.)
self.dfa.truncate_states(minimal_count);
- // Update the new start state, which is now just the minimal ID of
- // whatever state the old start state was collapsed into.
- let old_start = self.dfa.start_state();
- self.dfa.set_start_state(
- minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
- );
+ // Update the new start states, which is now just the minimal ID of
+ // whatever state the old start state was collapsed into. Also, we
+ // collect everything before-hand to work around the borrow checker.
+ // We're already allocating so much that this is probably fine. If this
+ // turns out to be costly, then I guess add a `starts_mut` iterator.
+ let starts: Vec<_> = self.dfa.starts().collect();
+ for (old_start_id, start_type, pid) in starts {
+ self.dfa.set_start_state(start_type, pid, remap(old_start_id));
+ }
+
+ // Update the match state pattern ID list for multi-regexes. All we
+ // need to do is remap the match state IDs. The pattern ID lists are
+ // always the same as they were since match states with distinct
+ // pattern ID lists are always considered distinct states.
+ let mut pmap = BTreeMap::new();
+ for (match_id, pattern_ids) in self.dfa.pattern_map() {
+ let new_id = remap(match_id);
+ pmap.insert(new_id, pattern_ids);
+ }
+ // This unwrap is OK because minimization never increases the number of
+ // match states or patterns in those match states. Since minimization
+ // runs after the pattern map has already been set at least once, we
+ // know that our match states cannot error.
+ self.dfa.set_pattern_map(&pmap).unwrap();
// In order to update the ID of the maximum match state, we need to
// find the maximum ID among all of the match states in the minimized
@@ -189,117 +234,160 @@ impl<'a, S: StateID> Minimizer<'a, S> {
// earlier match state. Therefore, to find the new max match state,
// we iterate over all previous match states, find their corresponding
// new minimal ID, and take the maximum of those.
- let old_max = self.dfa.max_match_state();
- self.dfa.set_max_match_state(dead_id());
- for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
- let part = state_to_part[id.to_usize()];
- let new_id = minimal_ids[part.to_usize()];
- if new_id > self.dfa.max_match_state() {
- self.dfa.set_max_match_state(new_id);
+ let old = self.dfa.special().clone();
+ let new = self.dfa.special_mut();
+ // ... but only remap if we had match states.
+ if old.matches() {
+ new.min_match = StateID::MAX;
+ new.max_match = StateID::ZERO;
+ for i in as_index(old.min_match)..=as_index(old.max_match) {
+ let new_id = remap(as_state_id(i));
+ if new_id < new.min_match {
+ new.min_match = new_id;
+ }
+ if new_id > new.max_match {
+ new.max_match = new_id;
+ }
}
}
+ // ... same, but for start states.
+ if old.starts() {
+ new.min_start = StateID::MAX;
+ new.max_start = StateID::ZERO;
+ for i in as_index(old.min_start)..=as_index(old.max_start) {
+ let new_id = remap(as_state_id(i));
+ if new_id == DEAD {
+ continue;
+ }
+ if new_id < new.min_start {
+ new.min_start = new_id;
+ }
+ if new_id > new.max_start {
+ new.max_start = new_id;
+ }
+ }
+ if new.max_start == DEAD {
+ new.min_start = DEAD;
+ }
+ }
+ new.quit_id = remap(new.quit_id);
+ new.set_max();
}
- fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
+ fn find_waiting(&self, set: &StateSet) -> Option<usize> {
self.waiting.iter().position(|s| s == set)
}
fn find_incoming_to(
&self,
- b: u8,
- set: &StateSet<S>,
- incoming: &mut StateSet<S>,
+ b: alphabet::Unit,
+ set: &StateSet,
+ incoming: &mut StateSet,
) {
incoming.clear();
set.iter(|id| {
- for &inid in &self.in_transitions[id.to_usize()][b as usize] {
+ for &inid in
+ &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
+ {
incoming.add(inid);
}
});
incoming.canonicalize();
}
- fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
- let mut is_match = StateSet::empty();
+ fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
+ // For match states, we know that two match states with different
+ // pattern ID lists will *always* be distinct, so we can partition them
+ // initially based on that.
+ let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
+ let mut is_quit = StateSet::empty();
let mut no_match = StateSet::empty();
- for (id, _) in dfa.states() {
- if dfa.is_match_state(id) {
- is_match.add(id);
+ for state in dfa.states() {
+ if dfa.is_match_state(state.id()) {
+ let mut pids = vec![];
+ for i in 0..dfa.match_count(state.id()) {
+ pids.push(dfa.match_pattern(state.id(), i));
+ }
+ matching
+ .entry(pids)
+ .or_insert(StateSet::empty())
+ .add(state.id());
+ } else if dfa.is_quit_state(state.id()) {
+ is_quit.add(state.id());
} else {
- no_match.add(id);
+ no_match.add(state.id());
}
}
- let mut sets = vec![is_match];
- if !no_match.is_empty() {
- sets.push(no_match);
- }
- sets.sort_by_key(|s| s.len());
+ let mut sets: Vec<StateSet> =
+ matching.into_iter().map(|(_, set)| set).collect();
+ sets.push(no_match);
+ sets.push(is_quit);
sets
}
- fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
+ fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
let mut incoming = vec![];
for _ in dfa.states() {
incoming.push(vec![vec![]; dfa.alphabet_len()]);
}
- for (id, state) in dfa.states() {
+ for state in dfa.states() {
for (b, next) in state.transitions() {
- incoming[next.to_usize()][b as usize].push(id);
+ incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
}
}
incoming
}
}
-impl<S: StateID> StateSet<S> {
- fn empty() -> StateSet<S> {
- StateSet(Rc::new(RefCell::new(vec![])))
+impl StateSet {
+ fn empty() -> StateSet {
+ StateSet { ids: Rc::new(RefCell::new(vec![])) }
}
- fn add(&mut self, id: S) {
- self.0.borrow_mut().push(id);
+ fn add(&mut self, id: StateID) {
+ self.ids.borrow_mut().push(id);
}
- fn min(&self) -> S {
- self.0.borrow()[0]
+ fn min(&self) -> StateID {
+ self.ids.borrow()[0]
}
fn canonicalize(&mut self) {
- self.0.borrow_mut().sort();
- self.0.borrow_mut().dedup();
+ self.ids.borrow_mut().sort();
+ self.ids.borrow_mut().dedup();
}
fn clear(&mut self) {
- self.0.borrow_mut().clear();
+ self.ids.borrow_mut().clear();
}
fn len(&self) -> usize {
- self.0.borrow().len()
+ self.ids.borrow().len()
}
fn is_empty(&self) -> bool {
self.len() == 0
}
- fn deep_clone(&self) -> StateSet<S> {
- let ids = self.0.borrow().iter().cloned().collect();
- StateSet(Rc::new(RefCell::new(ids)))
+ fn deep_clone(&self) -> StateSet {
+ let ids = self.ids.borrow().iter().cloned().collect();
+ StateSet { ids: Rc::new(RefCell::new(ids)) }
}
- fn iter<F: FnMut(S)>(&self, mut f: F) {
- for &id in self.0.borrow().iter() {
+ fn iter<F: FnMut(StateID)>(&self, mut f: F) {
+ for &id in self.ids.borrow().iter() {
f(id);
}
}
- fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
+ fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
dest.clear();
if self.is_empty() || other.is_empty() {
return;
}
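+ // Each set is kept sorted and deduplicated (see 'canonicalize'), so
+ // the intersection reduces to a linear merge of the two ID sequences.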
- let (seta, setb) = (self.0.borrow(), other.0.borrow());
+ let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
loop {
@@ -327,14 +415,14 @@ impl<S: StateID> StateSet<S> {
}
}
- fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
+ fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
dest.clear();
if self.is_empty() || other.is_empty() {
self.iter(|s| dest.add(s));
return;
}
- let (seta, setb) = (self.0.borrow(), other.0.borrow());
+ let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
loop {
diff --git a/vendor/regex-automata/src/dfa/mod.rs b/vendor/regex-automata/src/dfa/mod.rs
new file mode 100644
index 000000000..6f9fe605e
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/mod.rs
@@ -0,0 +1,363 @@
+/*!
+A module for building and searching with deterministic finite automata (DFAs).
+
+Like other modules in this crate, DFAs support a rich regex syntax with Unicode
+features. DFAs also have extensive options for configuring the best space vs
+time trade off for your use case and provide support for cheap deserialization
+of automata for use in `no_std` environments.
+
+If you're looking for lazy DFAs that build themselves incrementally during
+search, then please see the top-level [`hybrid` module](crate::hybrid).
+
+# Overview
+
+This section gives a brief overview of the primary types in this module:
+
+* A [`regex::Regex`] provides a way to search for matches of a regular
+expression using DFAs. This includes iterating over matches with both the start
+and end positions of each match.
+* A [`dense::DFA`] provides low level access to a DFA that uses a dense
+representation (uses lots of space, but fast searching).
+* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse
+representation (uses less space, but slower searching).
+* An [`Automaton`] trait that defines an interface that both dense and sparse
+DFAs implement. (A `regex::Regex` is generic over this trait; see the sketch
+below.)
+* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g.,
+[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
+[`dense::DFA::from_bytes`]).
+
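+Because both dense and sparse DFAs implement the `Automaton` trait, search
+routines can be written once and reused with either representation. As a
+minimal sketch (the `leftmost_end` helper here is purely illustrative and not
+part of this crate's API):
+
+```
+use regex_automata::{dfa::{dense, Automaton}, HalfMatch};
+
+// A hypothetical helper that works with any kind of DFA. For brevity,
+// a search error is treated as "no match."
+fn leftmost_end<A: Automaton>(dfa: &A, haystack: &[u8]) -> Option<HalfMatch> {
+    dfa.find_leftmost_fwd(haystack).unwrap_or(None)
+}
+
+let dfa = dense::DFA::new("foo[0-9]+")?;
+assert_eq!(Some(HalfMatch::must(0, 8)), leftmost_end(&dfa, b"foo12345"));
+
+let sparse = dfa.to_sparse()?;
+assert_eq!(Some(HalfMatch::must(0, 8)), leftmost_end(&sparse, b"foo12345"));
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+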
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(0, 0, 10),
+ MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: searching with regex sets
+
+The DFAs in this module all fully support searching with multiple regexes
+simultaneously. You can use this support with standard leftmost-first style
+searching to find non-overlapping matches:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+let re = Regex::new_many(&[r"\w+", r"\S+"])?;
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(1, 0, 4),
+ MultiMatch::must(0, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Or use overlapping style searches to find all possible occurrences:
+
+```
+use regex_automata::{MatchKind, MultiMatch, dfa::{dense, regex::Regex}};
+
+// N.B. For overlapping searches, we need the underlying DFA to report all
+// possible matches.
+let re = Regex::builder()
+ .dense(dense::Config::new().match_kind(MatchKind::All))
+ .build_many(&[r"\w{3}", r"\S{3}"])?;
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> = re.find_overlapping_iter(text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(1, 0, 3),
+ MultiMatch::must(0, 1, 4),
+ MultiMatch::must(1, 1, 4),
+ MultiMatch::must(0, 5, 8),
+ MultiMatch::must(1, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: use sparse DFAs
+
+By default, compiling a regex will use dense DFAs internally. This uses more
+memory, but executes searches more quickly. If you can abide slower searches
+(somewhere around 3-5x), then sparse DFAs might make more sense since they can
+use significantly less space.
+
+Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
+`Regex::new`:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(0, 0, 10),
+ MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+If you already have dense DFAs for some reason, they can be converted to sparse
+DFAs and used to build a new `Regex`. For example:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let sparse_re = Regex::builder().build_from_dfas(
+ dense_re.forward().to_sparse()?,
+ dense_re.reverse().to_sparse()?,
+);
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = sparse_re.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(0, 0, 10),
+ MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: deserialize a DFA
+
+This shows how to first serialize a DFA into raw bytes, and then deserialize
+those raw bytes back into a DFA. While this particular example is a
+bit contrived, this same technique can be used in your program to
+deserialize a DFA at startup or by memory mapping a file.
+
+```
+use regex_automata::{MultiMatch, dfa::{dense, regex::Regex}};
+
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both the forward and reverse DFAs, see note below
+let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian();
+let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian();
+// now deserialize both---we need to specify the correct type!
+let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0;
+let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0;
+// finally, reconstruct our regex
+let re2 = Regex::builder().build_from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(0, 0, 10),
+ MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+There are a few points worth noting here:
+
+* We need to extract the raw DFAs used by the regex and serialize those. You
+can build the DFAs manually yourself using [`dense::Builder`], but using
+the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In
+particular, a `Regex` constructs a reverse DFA for finding the starting
+location of matches.)
+* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method.
+In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`]
+or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're
+deserializing your DFA from. If you intend to deserialize on either platform,
+then you'll need to serialize both and deserialize the right one depending on
+your target's endianness.
+* Safely deserializing a DFA requires verifying the raw bytes, particularly if
+they are untrusted, since an invalid DFA could cause logical errors, panics
+or even undefined behavior. This verification step requires visiting all of
+the transitions in the DFA, which can be costly. If cheaper verification is
+desired, then [`dense::DFA::from_bytes_unchecked`] is available; it only
+performs verification that can be done in constant time. However, one can only
+use this routine if the caller can guarantee that the bytes provided encode a
+valid DFA, as sketched below.
+
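+As a minimal sketch of that last point: one way to satisfy the safety contract
+of the unchecked variant is to deserialize bytes that were serialized by the
+same program.
+
+```
+use regex_automata::{dfa::{dense, regex::Regex, Automaton}, HalfMatch};
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let (bytes, pad) = re.forward().to_bytes_native_endian();
+// SAFETY: we serialized these bytes ourselves just above, so they are
+// known to encode a valid DFA.
+let fwd: dense::DFA<&[u32]> =
+    unsafe { dense::DFA::from_bytes_unchecked(&bytes[pad..])?.0 };
+// The forward DFA alone reports only the end of the leftmost match.
+assert_eq!(
+    Some(HalfMatch::must(0, 10)),
+    fwd.find_leftmost_fwd(b"2018-12-24")?,
+);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+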
+The same process can be achieved with sparse DFAs as well:
+
+```
+use regex_automata::{MultiMatch, dfa::{sparse, regex::Regex}};
+
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both
+let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian();
+let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian();
+// now deserialize both---we need to specify the correct type!
+let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0;
+let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0;
+// finally, reconstruct our regex
+let re2 = Regex::builder().build_from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(0, 0, 10),
+ MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
+Conversely, dense DFAs must be aligned to the same alignment as a
+[`StateID`](crate::util::id::StateID).
+
+# Support for `no_std` and `alloc`-only
+
+This crate comes with `alloc` and `std` features that are enabled by default.
+When the `alloc` or `std` features are enabled, the API of this module will
+include the facilities necessary for compiling, serializing, deserializing
+and searching with DFAs. When only the `alloc` feature is enabled, then
+implementations of the `std::error::Error` trait are dropped, but everything
+else generally remains the same. When both the `alloc` and `std` features are
+disabled, the API of this module will shrink such that it only includes the
+facilities necessary for deserializing and searching with DFAs.
+
+The intended workflow for `no_std` environments is thus as follows:
+
+* Write a program with the `alloc` or `std` features that compiles and
+serializes a regular expression. You may need to serialize both little and big
+endian versions of each DFA. (So that's 4 DFAs in total for each regex.)
+* In your `no_std` environment, follow the examples above for deserializing
+your previously serialized DFAs into regexes. You can then search with them as
+you would any regex.
+
+Deserialization can happen anywhere. For example, with bytes embedded into a
+binary or with a file memory mapped at runtime.
+
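+As a minimal sketch of the deserialization side (a sparse DFA is used here
+since it has no alignment requirements; in practice the bytes would come from
+`include_bytes!` or a memory map):
+
+```
+use regex_automata::dfa::sparse;
+
+fn deserialize(
+    bytes: &[u8],
+) -> Result<sparse::DFA<&[u8]>, Box<dyn std::error::Error>> {
+    // The DFA borrows directly from the given bytes after verifying
+    // them; no copying or allocation is required.
+    let (dfa, _) = sparse::DFA::from_bytes(bytes)?;
+    Ok(dfa)
+}
+
+let re = regex_automata::dfa::regex::Regex::new_sparse("foo")?;
+let bytes = re.forward().to_bytes_native_endian();
+assert!(deserialize(&bytes).is_ok());
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+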
+TODO: Include link to `regex-cli` here pointing out how to generate Rust code
+for deserializing DFAs.
+
+# Syntax
+
+This module supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
+
+There are two things that are not supported by the DFAs in this module:
+
+* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
+of them) can only find the offsets of an entire match, but cannot resolve
+the offsets of each capturing group. This is because DFAs do not have the
+expressive power necessary.
+* Unicode word boundaries. These present particularly difficult challenges for
+DFA construction and would result in an explosion in the number of states.
+One can enable [`dense::Config::unicode_word_boundary`] though, which provides
+heuristic support for Unicode word boundaries that only works on ASCII text.
+Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
+on any input, as sketched below.
+
+There are no plans to lift either of these limitations.
+
+Note that these restrictions are identical to the restrictions on lazy DFAs.
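+
+As a brief sketch of the ASCII word boundary workaround mentioned above:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+// '(?-u:\b)' gives us an ASCII word boundary, which DFAs do support.
+let re = Regex::new(r"(?-u:\b)\w+(?-u:\b)")?;
+assert_eq!(
+    Some(MultiMatch::must(0, 2, 5)),
+    re.find_leftmost(b"$$abc$$"),
+);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```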
+
+# Differences with general purpose regexes
+
+The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
+general purpose regular expression engine. It aims to automatically balance low
+compile times, fast search times and low memory usage, while also providing
+a convenient API for users. In contrast, this module provides a lower level
+regular expression interface based exclusively on DFAs that is a bit less
+convenient while providing more explicit control over memory usage and search
+times.
+
+Here are some specific negative differences:
+
+* **Compilation can take an exponential amount of time and space** in the size
+of the regex pattern. While most patterns do not exhibit worst case exponential
+time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA
+with approximately `2^(N+2)` states. For this reason, untrusted patterns should
+not be compiled with this module. (In the future, the API may expose an option
+to return an error if the DFA gets too big.)
+* This module does not support sub-match extraction via capturing groups, which
+can be achieved with the regex crate's "captures" API.
+* While the regex crate doesn't necessarily sport fast compilation times,
+the regexes in this module are almost universally slow to compile, especially
+when they contain large Unicode character classes. For example, on my system,
+compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
+a sparse regex takes about the same time but only uses about 1.2MB of
+memory.) Conversely, compiling the same regex without Unicode support, e.g.,
+`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
+reason, you should only use Unicode character classes if you absolutely need
+them! (They are enabled by default though.)
+* This module does not support Unicode word boundaries. ASCII word boundaries
+may be used though by disabling Unicode or selectively doing so in the syntax,
+e.g., `(?-u:\b)`. There is also an option to
+[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary),
+where the corresponding DFA will give up if any non-ASCII byte is seen.
+* As a lower level API, this module does not do literal optimizations
+automatically, although it does provide hooks in its API to make use of the
+[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal
+optimizations means that searches may run much slower than what you're
+accustomed to, although searches do have more predictable and consistent
+performance.
+* There is no `&str` API like in the regex crate. In this module, all APIs
+operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8
+boundaries, unless any of [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8),
+[`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) or
+[`regex::Config::utf8`] are disabled.
+
+With some of the downsides out of the way, here are some positive differences:
+
+* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
+deserialized. Deserialization can be done in constant time with the unchecked
+APIs, since searching can be performed directly on the raw serialized bytes of
+a DFA.
+* This module was specifically designed so that the searching phase of a
+DFA has minimal runtime requirements, and can therefore be used in `no_std`
+environments. While `no_std` environments cannot compile regexes, they can
+deserialize pre-compiled regexes.
+* Since this module builds DFAs ahead of time, it will generally out-perform
+the `regex` crate on equivalent tasks. The performance difference is likely
+not large. However, because of a complex set of optimizations in the regex
+crate (like literal optimizations), an accurate performance comparison may be
+difficult to do.
+* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
+performance a small amount, but uses much less storage space. Potentially even
+less than what the regex crate uses.
+* This module exposes DFAs directly, such as [`dense::DFA`] and
+[`sparse::DFA`], which enables one to do less work in some cases. For example,
+if you only need the end of a match and not the start of a match, then you can
+use a DFA directly without building a `Regex`, which always requires a second
+DFA to find the start of a match.
+* This module provides more control over memory usage. Aside from choosing
+between dense and sparse DFAs, one can also choose a smaller state identifier
+representation to use less space. Also, one can enable DFA minimization
+via [`dense::Config::minimize`], but it can increase compilation times
+dramatically.
+*/
+
+pub use crate::dfa::automaton::{Automaton, OverlappingState};
+#[cfg(feature = "alloc")]
+pub use crate::dfa::error::Error;
+
+/// This is an alias for a state ID of zero. It has special significance
+/// because it always corresponds to the first state in a DFA, and the first
+/// state in a DFA is always "dead." That is, the dead state always has all
+/// of its transitions set to itself. Moreover, the dead state is used as a
+/// sentinel for various things. e.g., In search, reaching a dead state means
+/// that the search must stop.
+const DEAD: crate::util::id::StateID = crate::util::id::StateID::ZERO;
+
+mod accel;
+mod automaton;
+pub mod dense;
+#[cfg(feature = "alloc")]
+mod determinize;
+#[cfg(feature = "alloc")]
+pub(crate) mod error;
+#[cfg(feature = "alloc")]
+mod minimize;
+pub mod regex;
+mod search;
+pub mod sparse;
+mod special;
+#[cfg(feature = "transducer")]
+mod transducer;
diff --git a/vendor/regex-automata/src/dfa/regex.rs b/vendor/regex-automata/src/dfa/regex.rs
new file mode 100644
index 000000000..d0917e17d
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/regex.rs
@@ -0,0 +1,2146 @@
+/*!
+A DFA-backed `Regex`.
+
+This module provides [`Regex`], which is defined generically over the
+[`Automaton`] trait. A `Regex` implements convenience routines you might have
+come to expect, such as finding the start/end of a match and iterating over
+all non-overlapping matches. This `Regex` type is limited in its capabilities
+to what a DFA can provide. Therefore, APIs involving capturing groups, for
+example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, whereas the other is a "reverse" DFA that
+finds the start offset of a match.
+
+See the [parent module](crate::dfa) for examples.
+*/
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+use crate::{
+ dfa::automaton::{Automaton, OverlappingState},
+ util::prefilter::{self, Prefilter},
+ MatchError, MultiMatch,
+};
+#[cfg(feature = "alloc")]
+use crate::{
+ dfa::{dense, error::Error, sparse},
+ nfa::thompson,
+ util::matchtypes::MatchKind,
+};
+
+// When the alloc feature is enabled, the regex type sets its A type parameter
+// to default to an owned dense DFA. But without alloc, we set no default. This
+// makes things a lot more convenient in the common case, since writing out the
+// DFA types is pretty annoying.
+//
+// Since we have two different definitions but only want to write one doc
+// string, we use a macro to capture the doc and other attributes once and then
+// repeat them for each definition.
+macro_rules! define_regex_type {
+ ($(#[$doc:meta])*) => {
+ #[cfg(feature = "alloc")]
+ $(#[$doc])*
+ pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> {
+ prefilter: Option<P>,
+ forward: A,
+ reverse: A,
+ utf8: bool,
+ }
+
+ #[cfg(not(feature = "alloc"))]
+ $(#[$doc])*
+ pub struct Regex<A, P = prefilter::None> {
+ prefilter: Option<P>,
+ forward: A,
+ reverse: A,
+ utf8: bool,
+ }
+ };
+}
+
+define_regex_type!(
+ /// A regular expression that uses deterministic finite automata for fast
+ /// searching.
+ ///
+ /// A regular expression is comprised of two DFAs, a "forward" DFA and a
+ /// "reverse" DFA. The forward DFA is responsible for detecting the end of
+ /// a match while the reverse DFA is responsible for detecting the start
+ /// of a match. Thus, in order to find the bounds of any given match, a
+ /// forward search must first be run followed by a reverse search. A match
+ /// found by the forward DFA guarantees that the reverse DFA will also find
+ /// a match.
+ ///
+ /// The type of the DFA used by a `Regex` corresponds to the `A` type
+ /// parameter, which must satisfy the [`Automaton`] trait. Typically,
+ /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
+ /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
+ /// memory but search faster, while sparse DFAs use less memory but search
+ /// more slowly.
+ ///
+ /// By default, a regex's automaton type parameter is set to
+ /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
+ /// in-memory workloads, this is the most convenient type that gives the
+ /// best search performance. When the `alloc` feature is disabled, no
+ /// default type is used.
+ ///
+ /// A `Regex` also has a `P` type parameter, which is used to select the
+ /// prefilter used during search. By default, no prefilter is enabled by
+ /// setting the type to default to [`prefilter::None`]. A prefilter can be
+ /// enabled by using the [`Regex::prefilter`] method.
+ ///
+ /// # When should I use this?
+ ///
+ /// Generally speaking, if you can afford the overhead of building a full
+ /// DFA for your regex, and you don't need things like capturing groups,
+ /// then this is a good choice if you're looking to optimize for matching
+ /// speed. Note however that its speed may be worse than a general purpose
+ /// regex engine if you don't select a good [prefilter].
+ ///
+ /// # Earliest vs Leftmost vs Overlapping
+ ///
+ /// The search routines exposed on a `Regex` reflect three different ways
+ /// of searching:
+ ///
+ /// * "earliest" means to stop as soon as a match has been detected.
+ /// * "leftmost" means to continue matching until the underlying
+ /// automaton cannot advance. This reflects "standard" searching you
+ /// might be used to in other regex engines. e.g., This permits
+ /// non-greedy and greedy searching to work as you would expect.
+ /// * "overlapping" means to find all possible matches, even if they
+ /// overlap.
+ ///
+ /// Generally speaking, when doing an overlapping search, you'll want to
+ /// build your regex DFAs with [`MatchKind::All`] semantics. Using
+ /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
+ /// likely to lead to odd behavior since `LeftmostFirst` specifically omits
+ /// some matches that can never be reported due to its semantics.
+ ///
+ /// The following example shows the differences between how these different
+ /// types of searches impact looking for matches of `[a-z]+` in the
+ /// haystack `abc`.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch};
+ ///
+ /// let pattern = r"[a-z]+";
+ /// let haystack = "abc".as_bytes();
+ ///
+ /// // With leftmost-first semantics, we test "earliest" and "leftmost".
+ /// let re = dfa::regex::Builder::new()
+ /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst))
+ /// .build(pattern)?;
+ ///
+ /// // "earliest" searching isn't impacted by greediness
+ /// let mut it = re.find_earliest_iter(haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// // "leftmost" searching supports greediness (and non-greediness)
+ /// let mut it = re.find_leftmost_iter(haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// // For overlapping, we want "all" match kind semantics.
+ /// let re = dfa::regex::Builder::new()
+ /// .dense(dense::Config::new().match_kind(MatchKind::All))
+ /// .build(pattern)?;
+ ///
+ /// // In the overlapping search, we find all three possible matches
+ /// // starting at the beginning of the haystack.
+ /// let mut it = re.find_overlapping_iter(haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Sparse DFAs
+ ///
+ /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
+ /// used with any kind of DFA. While this crate constructs dense DFAs by
+ /// default, it is easy enough to build corresponding sparse DFAs, and then
+ /// build a regex from them:
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// // First, build a regex that uses dense DFAs.
+ /// let dense_re = Regex::new("foo[0-9]+")?;
+ ///
+ /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
+ /// let fwd = dense_re.forward().to_sparse()?;
+ /// let rev = dense_re.reverse().to_sparse()?;
+ ///
+ /// // Third, build a new regex from the constituent sparse DFAs.
+ /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
+ ///
+ /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+ /// assert_eq!(true, sparse_re.is_match(b"foo123"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
+ /// more succinctly. (Note though that dense DFAs are still constructed
+ /// first internally, and then converted to sparse DFAs, as in the example
+ /// above.)
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
+ /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+ /// assert!(sparse_re.is_match(b"foo123"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Fallibility
+ ///
+ /// In non-default configurations, the DFAs generated in this module may
+ /// return an error during a search. (Currently, the only way this happens
+ /// is if quit bytes are added or Unicode word boundaries are heuristically
+ /// enabled, both of which are turned off by default.) For convenience, the
+ /// main search routines, like [`find_leftmost`](Regex::find_leftmost),
+ /// will panic if an error occurs. However, if you need to use DFAs
+ /// which may produce an error at search time, then there are fallible
+ /// equivalents of all search routines. For example, for `find_leftmost`,
+ /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost).
+ /// The routines prefixed with `try_` return `Result<Option<MultiMatch>,
+    /// MatchError>`, whereas the infallible routines simply return
+ /// `Option<MultiMatch>`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to cause a search to terminate if it sees a
+ /// `\n` byte, and handle the error returned. This could be useful if, for
+ /// example, you wanted to prevent a user supplied pattern from matching
+ /// across a line boundary.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{self, regex::Regex}, MatchError};
+ ///
+ /// let re = Regex::builder()
+ /// .dense(dfa::dense::Config::new().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ ///
+ /// let haystack = "foo\nbar".as_bytes();
+ /// // Normally this would produce a match, since \p{any} contains '\n'.
+ /// // But since we instructed the automaton to enter a quit state if a
+ /// // '\n' is observed, this produces a match error instead.
+ /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+ /// let got = re.try_find_leftmost(haystack).unwrap_err();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[derive(Clone, Debug)]
+);
+
+#[cfg(feature = "alloc")]
+impl Regex {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 3, 14)),
+ /// re.find_leftmost(b"zzzfoo12345barzzz"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<Regex, Error> {
+ Builder::new().build(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "regex set."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
+ ///
+ /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> {
+ Builder::new().build_many(patterns)
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl Regex<sparse::DFA<Vec<u8>>> {
+ /// Parse the given regular expression using the default configuration,
+ /// except using sparse DFAs, and return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_sparse("foo[0-9]+bar")?;
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 3, 14)),
+ /// re.find_leftmost(b"zzzfoo12345barzzz"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_sparse(
+ pattern: &str,
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+ Builder::new().build_sparse(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "regex set"
+ /// using sparse DFAs. This otherwise similarly uses the default regex
+ /// configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
+ ///
+ /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many_sparse<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+ Builder::new().build_many_sparse(patterns)
+ }
+}
+
+/// Convenience routines for regex construction.
+#[cfg(feature = "alloc")]
+impl Regex {
+ /// Return a default configuration for a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a regex.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to disable UTF-8 mode for `Regex` iteration.
+ /// When UTF-8 mode is disabled, the position immediately following an
+ /// empty match is where the next search begins, instead of the next
+ /// position of a UTF-8 encoded codepoint.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8(false))
+ /// .build(r"")?;
+ /// let haystack = "a☃z".as_bytes();
+ /// let mut it = re.find_leftmost_iter(haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::regex::Regex,
+ /// nfa::thompson,
+ /// MultiMatch, SyntaxConfig,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8(false))
+ /// .syntax(SyntaxConfig::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(MultiMatch::must(0, 1, 9));
+ /// let got = re.find_leftmost(haystack);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+/// Standard search routines for finding and iterating over matches.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_is_match`](Regex::try_is_match).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(true, re.is_match(b"foo12345bar"));
+ /// assert_eq!(false, re.is_match(b"foobar"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn is_match(&self, haystack: &[u8]) -> bool {
+ self.is_match_at(haystack, 0, haystack.len())
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_earliest`](Regex::try_find_earliest).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// // Normally, the leftmost first match would greedily consume as many
+ /// // decimal digits as it could. But a match is detected as soon as one
+ /// // digit is seen.
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 4)),
+ /// re.find_earliest(b"foo12345"),
+ /// );
+ ///
+ /// // Normally, the end of the leftmost first match here would be 3,
+ /// // but the "earliest" match semantics detect a match earlier.
+ /// let re = Regex::new("abc|a")?;
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> {
+ self.find_earliest_at(haystack, 0, haystack.len())
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_leftmost`](Regex::try_find_leftmost).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// // Greediness is applied appropriately when compared to find_earliest.
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 3, 11)),
+ /// re.find_leftmost(b"zzzfoo12345zzz"),
+ /// );
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the default leftmost-first match semantics demand that we find the
+    /// // match that prefers earlier parts of the pattern over later
+    /// // parts.
+ /// let re = Regex::new("abc|a")?;
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> {
+ self.find_leftmost_at(haystack, 0, haystack.len())
+ }
+
+ /// Search for the first overlapping match in `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_overlapping`](Regex::try_find_overlapping).
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run an overlapping search with multiple
+ /// regexes.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = dfa::OverlappingState::start();
+ ///
+ /// let expected = Some(MultiMatch::must(1, 0, 4));
+ /// let got = re.find_overlapping(haystack, &mut state);
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, so it is the earlier
+    /// // match and is thus reported first.
+ /// let expected = Some(MultiMatch::must(0, 1, 4));
+ /// let got = re.find_overlapping(haystack, &mut state);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_overlapping(
+ &self,
+ haystack: &[u8],
+ state: &mut OverlappingState,
+ ) -> Option<MultiMatch> {
+ self.find_overlapping_at(haystack, 0, haystack.len(), state)
+ }
+
+ /// Returns an iterator over all non-overlapping "earliest" matches.
+ ///
+ /// Match positions are reported as soon as a match is known to occur, even
+ /// if the standard leftmost match would be longer.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error during iteration, then iteration
+ /// panics. This only occurs in non-default configurations where quit bytes
+ /// are used or Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run an "earliest" iterator.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::new("[0-9]+")?;
+ /// let haystack = "123".as_bytes();
+ ///
+ /// // Normally, a standard leftmost iterator would return a single
+ /// // match, but since "earliest" detects matches earlier, we get
+ /// // three matches.
+ /// let mut it = re.find_earliest_iter(haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_earliest_iter<'r, 't>(
+ &'r self,
+ haystack: &'t [u8],
+ ) -> FindEarliestMatches<'r, 't, A, P> {
+ FindEarliestMatches::new(self, haystack)
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
+ ///
+ /// This corresponds to the "standard" regex search iterator.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error during iteration, then iteration
+ /// panics. This only occurs in non-default configurations where quit bytes
+ /// are used or Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let text = b"foo1 foo12 foo123";
+ /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+ /// assert_eq!(matches, vec![
+ /// MultiMatch::must(0, 0, 4),
+ /// MultiMatch::must(0, 5, 10),
+ /// MultiMatch::must(0, 11, 17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_leftmost_iter<'r, 't>(
+ &'r self,
+ haystack: &'t [u8],
+ ) -> FindLeftmostMatches<'r, 't, A, P> {
+ FindLeftmostMatches::new(self, haystack)
+ }
+
+ /// Returns an iterator over all overlapping matches in the given haystack.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// The iterator takes care of handling the overlapping state that must be
+ /// threaded through every search.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error during iteration, then iteration
+ /// panics. This only occurs in non-default configurations where quit bytes
+ /// are used or Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run an overlapping search with multiple
+ /// regexes.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let haystack = "@foo".as_bytes();
+ ///
+ /// let mut it = re.find_overlapping_iter(haystack);
+ /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_overlapping_iter<'r, 't>(
+ &'r self,
+ haystack: &'t [u8],
+ ) -> FindOverlappingMatches<'r, 't, A, P> {
+ FindOverlappingMatches::new(self, haystack)
+ }
+}
+
+/// Lower level infallible search routines that permit controlling where
+/// the search starts and ends in a particular sequence. This is useful for
+/// executing searches that need to take surrounding context into account,
+/// which is required for correctly implementing iteration in the presence of
+/// look-around operators (`^`, `$`, `\b`).
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_is_match_at`](Regex::try_is_match_at).
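+    ///
+    /// # Example
+    ///
+    /// A brief sketch of how restricting the search range changes the
+    /// result:
+    ///
+    /// ```
+    /// use regex_automata::dfa::regex::Regex;
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// let haystack = b"zzzfoo123barzzz";
+    /// // Searching the full haystack finds a match...
+    /// assert!(re.is_match_at(haystack, 0, haystack.len()));
+    /// // ...but a range that excludes the digits and suffix does not.
+    /// assert!(!re.is_match_at(haystack, 0, 6));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```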
+ pub fn is_match_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> bool {
+ self.try_is_match_at(haystack, start, end).unwrap()
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
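+    ///
+    /// # Example
+    ///
+    /// A short sketch of why the surrounding context matters, using an
+    /// ASCII word boundary:
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new(r"(?-u:\b)cat(?-u:\b)")?;
+    /// let haystack = b"bobcat";
+    /// // Searching at offset 3 sees the preceding 'b', so there is no
+    /// // word boundary there and thus no match.
+    /// assert_eq!(None, re.find_earliest_at(haystack, 3, 6));
+    /// // Searching a subslice loses that context and reports a match.
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re.find_earliest(&haystack[3..6]),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```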
+ pub fn find_earliest_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Option<MultiMatch> {
+ self.try_find_earliest_at(haystack, start, end).unwrap()
+ }
+
+ /// Returns the same as `find_leftmost`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, if the DFA is anchored, then
+ /// a match can only occur when `start == 0`.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches within the
+ /// same haystack, which cannot be done correctly by simply providing a
+ /// subslice of `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
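+    ///
+    /// # Example
+    ///
+    /// A sketch of the anchored case mentioned above: with `^` in the
+    /// pattern, a match can only begin when `start == 0`.
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new("^foo")?;
+    /// let haystack = b"barfoo";
+    /// // '^' cannot match at offset 3, since 3 > 0.
+    /// assert_eq!(None, re.find_leftmost_at(haystack, 3, 6));
+    /// // Searching the subslice discards that context, so '^' matches.
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re.find_leftmost(&haystack[3..]),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```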
+ pub fn find_leftmost_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Option<MultiMatch> {
+ self.try_find_leftmost_at(haystack, start, end).unwrap()
+ }
+
+ /// Search for the first overlapping match within a given range of
+ /// `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
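+    ///
+    /// # Example
+    ///
+    /// A sketch showing that searching the full range is equivalent to
+    /// [`find_overlapping`](Regex::find_overlapping):
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let haystack = b"@foo";
+    /// let mut state = dfa::OverlappingState::start();
+    ///
+    /// let got = re.find_overlapping_at(haystack, 0, haystack.len(), &mut state);
+    /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), got);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```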
+ pub fn find_overlapping_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Option<MultiMatch> {
+ self.try_find_overlapping_at(haystack, start, end, state).unwrap()
+ }
+}
+
+/// Fallible search routines. These may return an error when the underlying
+/// DFAs have been configured in a way that permits them to fail during a
+/// search.
+///
+/// Errors during search only occur when the DFA has been explicitly
+/// configured to do so, usually by specifying one or more "quit" bytes or by
+/// heuristically enabling Unicode word boundaries.
+///
+/// Errors will never be returned under the default configuration, so these
+/// fallible routines are only needed for particular configurations.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`is_match`](Regex::is_match).
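+    ///
+    /// # Example
+    ///
+    /// A sketch of handling a search error instead of panicking, using the
+    /// quit byte configuration from the type-level docs:
+    ///
+    /// ```
+    /// use regex_automata::dfa::{self, regex::Regex};
+    ///
+    /// let re = Regex::builder()
+    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
+    ///     .build(r"foo\p{any}+bar")?;
+    /// // The quit byte '\n' is seen, so the search fails.
+    /// assert!(re.try_is_match(b"foo\nbar").is_err());
+    /// // Without a quit byte in the haystack, the search succeeds.
+    /// assert_eq!(Ok(true), re.try_is_match(b"fooXbar"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```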
+ pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> {
+ self.try_is_match_at(haystack, 0, haystack.len())
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_earliest`](Regex::find_earliest).
+ pub fn try_find_earliest(
+ &self,
+ haystack: &[u8],
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_earliest_at(haystack, 0, haystack.len())
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_leftmost`](Regex::find_leftmost).
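+    ///
+    /// # Example
+    ///
+    /// A sketch of the `Ok` and `Err` cases with a quit byte configured:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{self, regex::Regex}, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
+    ///     .build(r"foo\p{any}+bar")?;
+    /// // No quit byte here, so the search runs to completion.
+    /// assert_eq!(
+    ///     Ok(Some(MultiMatch::must(0, 0, 9))),
+    ///     re.try_find_leftmost(b"fooXYZbar"),
+    /// );
+    /// // A '\n' in the haystack causes the search to fail.
+    /// assert!(re.try_find_leftmost(b"foo\nbar").is_err());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```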
+ pub fn try_find_leftmost(
+ &self,
+ haystack: &[u8],
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_leftmost_at(haystack, 0, haystack.len())
+ }
+
+ /// Search for the first overlapping match in `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_overlapping`](Regex::find_overlapping).
+ pub fn try_find_overlapping(
+ &self,
+ haystack: &[u8],
+ state: &mut OverlappingState,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_overlapping_at(haystack, 0, haystack.len(), state)
+ }
+
+ /// Returns an iterator over all non-overlapping "earliest" matches.
+ ///
+ /// Match positions are reported as soon as a match is known to occur, even
+ /// if the standard leftmost match would be longer.
+ ///
+ /// # Errors
+ ///
+ /// This iterator only yields errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_earliest_iter`](Regex::find_earliest_iter).
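+    ///
+    /// # Example
+    ///
+    /// A sketch of iterating over `Result` values with the default
+    /// configuration, where every item is guaranteed to be `Ok`:
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new("[0-9]+")?;
+    /// let mut it = re.try_find_earliest_iter(b"a1b22");
+    /// assert_eq!(Some(Ok(MultiMatch::must(0, 1, 2))), it.next());
+    /// assert_eq!(Some(Ok(MultiMatch::must(0, 3, 4))), it.next());
+    /// assert_eq!(Some(Ok(MultiMatch::must(0, 4, 5))), it.next());
+    /// assert_eq!(None, it.next());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```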
+ pub fn try_find_earliest_iter<'r, 't>(
+ &'r self,
+ haystack: &'t [u8],
+ ) -> TryFindEarliestMatches<'r, 't, A, P> {
+ TryFindEarliestMatches::new(self, haystack)
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
+ ///
+ /// This corresponds to the "standard" regex search iterator.
+ ///
+ /// # Errors
+ ///
+ /// This iterator only yields errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
+ pub fn try_find_leftmost_iter<'r, 't>(
+ &'r self,
+ haystack: &'t [u8],
+ ) -> TryFindLeftmostMatches<'r, 't, A, P> {
+ TryFindLeftmostMatches::new(self, haystack)
+ }
+
+ /// Returns an iterator over all overlapping matches in the given haystack.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// The iterator takes care of handling the overlapping state that must be
+ /// threaded through every search.
+ ///
+ /// # Errors
+ ///
+ /// This iterator only yields errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
+ pub fn try_find_overlapping_iter<'r, 't>(
+ &'r self,
+ haystack: &'t [u8],
+ ) -> TryFindOverlappingMatches<'r, 't, A, P> {
+ TryFindOverlappingMatches::new(self, haystack)
+ }
+}
+
+/// Lower level fallible search routines that permit controlling where the
+/// search starts and ends in a particular sequence.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`is_match_at`](Regex::is_match_at).
+ pub fn try_is_match_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<bool, MatchError> {
+ self.forward()
+ .find_earliest_fwd_at(
+ self.scanner().as_mut(),
+ None,
+ haystack,
+ start,
+ end,
+ )
+ .map(|x| x.is_some())
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_earliest_at`](Regex::find_earliest_at).
+ pub fn try_find_earliest_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_earliest_at_imp(
+ self.scanner().as_mut(),
+ haystack,
+ start,
+ end,
+ )
+ }
+
+ /// The implementation of "earliest" searching, where a prefilter scanner
+ /// may be given.
+ fn try_find_earliest_at_imp(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ // N.B. We use `&&A` here to call `Automaton` methods, which ensures
+ // that we always use the `impl Automaton for &A` for calling methods.
+ // Since this is the usual way that automata are used, this helps
+ // reduce the number of monomorphized copies of the search code.
+ let (fwd, rev) = (self.forward(), self.reverse());
+ let end = match (&fwd)
+ .find_earliest_fwd_at(pre, None, haystack, start, end)?
+ {
+ None => return Ok(None),
+ Some(end) => end,
+ };
+ // N.B. The only time we need to tell the reverse searcher the pattern
+ // to match is in the overlapping case, since it's ambiguous. In the
+ // leftmost case, I have tentatively convinced myself that it isn't
+ // necessary and the reverse search will always find the same pattern
+ // to match as the forward search. But I lack a rigorous proof.
+ let start = (&rev)
+ .find_earliest_rev_at(None, haystack, start, end.offset())?
+ .expect("reverse search must match if forward search does");
+ assert_eq!(
+ start.pattern(),
+ end.pattern(),
+ "forward and reverse search must match same pattern"
+ );
+ assert!(start.offset() <= end.offset());
+ Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_leftmost_at`](Regex::find_leftmost_at).
+ pub fn try_find_leftmost_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_leftmost_at_imp(
+ self.scanner().as_mut(),
+ haystack,
+ start,
+ end,
+ )
+ }
+
+ /// The implementation of leftmost searching, where a prefilter scanner
+ /// may be given.
+ fn try_find_leftmost_at_imp(
+ &self,
+ scanner: Option<&mut prefilter::Scanner>,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ // N.B. We use `&&A` here to call `Automaton` methods, which ensures
+ // that we always use the `impl Automaton for &A` for calling methods.
+ // Since this is the usual way that automata are used, this helps
+ // reduce the number of monomorphized copies of the search code.
+ let (fwd, rev) = (self.forward(), self.reverse());
+ let end = match (&fwd)
+ .find_leftmost_fwd_at(scanner, None, haystack, start, end)?
+ {
+ None => return Ok(None),
+ Some(end) => end,
+ };
+ // N.B. The only time we need to tell the reverse searcher the pattern
+ // to match is in the overlapping case, since it's ambiguous. In the
+ // leftmost case, I have tentatively convinced myself that it isn't
+ // necessary and the reverse search will always find the same pattern
+ // to match as the forward search. But I lack a rigorous proof. Why not
+ // just provide the pattern anyway? Well, if it is needed, then leaving
+ // it out gives us a chance to find a witness.
+ let start = (&rev)
+ .find_leftmost_rev_at(None, haystack, start, end.offset())?
+ .expect("reverse search must match if forward search does");
+ assert_eq!(
+ start.pattern(),
+ end.pattern(),
+ "forward and reverse search must match same pattern",
+ );
+ assert!(start.offset() <= end.offset());
+ Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ }
+
+ /// Search for the first overlapping match within a given range of
+ /// `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_overlapping_at`](Regex::find_overlapping_at).
+ pub fn try_find_overlapping_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_overlapping_at_imp(
+ self.scanner().as_mut(),
+ haystack,
+ start,
+ end,
+ state,
+ )
+ }
+
+ /// The implementation of overlapping search at a given range in
+ /// `haystack`, where `scanner` is a prefilter (if active) and `state` is
+ /// the current state of the search.
+ fn try_find_overlapping_at_imp(
+ &self,
+ scanner: Option<&mut prefilter::Scanner>,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ // N.B. We use `&&A` here to call `Automaton` methods, which ensures
+ // that we always use the `impl Automaton for &A` for calling methods.
+ // Since this is the usual way that automata are used, this helps
+ // reduce the number of monomorphized copies of the search code.
+ let (fwd, rev) = (self.forward(), self.reverse());
+ // TODO: Decide whether it's worth making this assert work. It doesn't
+ // work currently because 'has_starts_for_each_pattern' isn't on the
+ // Automaton trait. Without this assert, we still get a panic, but it's
+ // a bit more inscrutable.
+ // assert!(
+ // rev.has_starts_for_each_pattern(),
+ // "overlapping searches require that the reverse DFA is \
+ // compiled with the 'starts_for_each_pattern' option",
+ // );
+ let end = match (&fwd).find_overlapping_fwd_at(
+ scanner, None, haystack, start, end, state,
+ )? {
+ None => return Ok(None),
+ Some(end) => end,
+ };
+ // Unlike the leftmost cases, the reverse overlapping search may match
+ // a different pattern than the forward search. See test failures when
+ // using `None` instead of `Some(end.pattern())` below. Thus, we must
+ // run our reverse search using the pattern that matched in the forward
+ // direction.
+ let start = (&rev)
+ .find_leftmost_rev_at(
+ Some(end.pattern()),
+ haystack,
+ 0,
+ end.offset(),
+ )?
+ .expect("reverse search must match if forward search does");
+ assert!(start.offset() <= end.offset());
+ assert_eq!(start.pattern(), end.pattern());
+ Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ }
+}
+
+/// Non-search APIs for querying information about the regex and setting a
+/// prefilter.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+ /// Attach the given prefilter to this regex.
+ pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> {
+ Regex {
+ prefilter: Some(prefilter),
+ forward: self.forward,
+ reverse: self.reverse,
+ utf8: self.utf8,
+ }
+ }
+
+ /// Remove any prefilter from this regex.
+ pub fn without_prefilter(self) -> Regex<A> {
+ Regex {
+ prefilter: None,
+ forward: self.forward,
+ reverse: self.reverse,
+ utf8: self.utf8,
+ }
+ }
+
+ /// Return the underlying DFA responsible for forward matching.
+ ///
+ /// This is useful for accessing the underlying DFA and converting it to
+ /// some other format or size. See the [`Builder::build_from_dfas`] docs
+ /// for an example of where this might be useful.
+ pub fn forward(&self) -> &A {
+ &self.forward
+ }
+
+ /// Return the underlying DFA responsible for reverse matching.
+ ///
+ /// This is useful for accessing the underlying DFA and converting it to
+ /// some other format or size. See the [`Builder::build_from_dfas`] docs
+ /// for an example of where this might be useful.
+ pub fn reverse(&self) -> &A {
+ &self.reverse
+ }
+
+ /// Returns the total number of patterns matched by this regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
+ /// assert_eq!(3, re.pattern_count());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_count(&self) -> usize {
+ assert_eq!(
+ self.forward().pattern_count(),
+ self.reverse().pattern_count()
+ );
+ self.forward().pattern_count()
+ }
+
+ /// Convenience function for returning this regex's prefilter as a trait
+ /// object.
+ ///
+ /// If this regex doesn't have a prefilter, then `None` is returned.
+ pub fn prefilter(&self) -> Option<&dyn Prefilter> {
+ match self.prefilter {
+ None => None,
+ Some(ref x) => Some(&*x),
+ }
+ }
+
+ /// Convenience function for returning a prefilter scanner.
+ fn scanner(&self) -> Option<prefilter::Scanner> {
+ self.prefilter().map(prefilter::Scanner::new)
+ }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct FindEarliestMatches<'r, 't, A, P>(
+ TryFindEarliestMatches<'r, 't, A, P>,
+);
+
+impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> {
+ fn new(
+ re: &'r Regex<A, P>,
+ text: &'t [u8],
+ ) -> FindEarliestMatches<'r, 't, A, P> {
+ FindEarliestMatches(TryFindEarliestMatches::new(re, text))
+ }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+ for FindEarliestMatches<'r, 't, A, P>
+{
+ type Item = MultiMatch;
+
+ fn next(&mut self) -> Option<MultiMatch> {
+ next_unwrap(self.0.next())
+ }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct FindLeftmostMatches<'r, 't, A, P>(
+ TryFindLeftmostMatches<'r, 't, A, P>,
+);
+
+impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> {
+ fn new(
+ re: &'r Regex<A, P>,
+ text: &'t [u8],
+ ) -> FindLeftmostMatches<'r, 't, A, P> {
+ FindLeftmostMatches(TryFindLeftmostMatches::new(re, text))
+ }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+ for FindLeftmostMatches<'r, 't, A, P>
+{
+ type Item = MultiMatch;
+
+ fn next(&mut self) -> Option<MultiMatch> {
+ next_unwrap(self.0.next())
+ }
+}
+
+/// An iterator over all overlapping matches for a particular infallible
+/// search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>(
+ TryFindOverlappingMatches<'r, 't, A, P>,
+);
+
+impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> {
+ fn new(
+ re: &'r Regex<A, P>,
+ text: &'t [u8],
+ ) -> FindOverlappingMatches<'r, 't, A, P> {
+ FindOverlappingMatches(TryFindOverlappingMatches::new(re, text))
+ }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+ for FindOverlappingMatches<'r, 't, A, P>
+{
+ type Item = MultiMatch;
+
+ fn next(&mut self) -> Option<MultiMatch> {
+ next_unwrap(self.0.next())
+ }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct TryFindEarliestMatches<'r, 't, A, P> {
+ re: &'r Regex<A, P>,
+ scanner: Option<prefilter::Scanner<'r>>,
+ text: &'t [u8],
+ last_end: usize,
+ last_match: Option<usize>,
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> {
+ fn new(
+ re: &'r Regex<A, P>,
+ text: &'t [u8],
+ ) -> TryFindEarliestMatches<'r, 't, A, P> {
+ let scanner = re.scanner();
+ TryFindEarliestMatches {
+ re,
+ scanner,
+ text,
+ last_end: 0,
+ last_match: None,
+ }
+ }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+ for TryFindEarliestMatches<'r, 't, A, P>
+{
+ type Item = Result<MultiMatch, MatchError>;
+
+ fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+ if self.last_end > self.text.len() {
+ return None;
+ }
+ let result = self.re.try_find_earliest_at_imp(
+ self.scanner.as_mut(),
+ self.text,
+ self.last_end,
+ self.text.len(),
+ );
+ let m = match result {
+ Err(err) => return Some(Err(err)),
+ Ok(None) => return None,
+ Ok(Some(m)) => m,
+ };
+ if m.is_empty() {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ self.last_end = if self.re.utf8 {
+ crate::util::next_utf8(self.text, m.end())
+ } else {
+ m.end() + 1
+ };
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match.
+ if Some(m.end()) == self.last_match {
+ return self.next();
+ }
+ } else {
+ self.last_end = m.end();
+ }
+ self.last_match = Some(m.end());
+ Some(Ok(m))
+ }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct TryFindLeftmostMatches<'r, 't, A, P> {
+ re: &'r Regex<A, P>,
+ scanner: Option<prefilter::Scanner<'r>>,
+ text: &'t [u8],
+ last_end: usize,
+ last_match: Option<usize>,
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> {
+ fn new(
+ re: &'r Regex<A, P>,
+ text: &'t [u8],
+ ) -> TryFindLeftmostMatches<'r, 't, A, P> {
+ let scanner = re.scanner();
+ TryFindLeftmostMatches {
+ re,
+ scanner,
+ text,
+ last_end: 0,
+ last_match: None,
+ }
+ }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+ for TryFindLeftmostMatches<'r, 't, A, P>
+{
+ type Item = Result<MultiMatch, MatchError>;
+
+ fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+ if self.last_end > self.text.len() {
+ return None;
+ }
+ let result = self.re.try_find_leftmost_at_imp(
+ self.scanner.as_mut(),
+ self.text,
+ self.last_end,
+ self.text.len(),
+ );
+ let m = match result {
+ Err(err) => return Some(Err(err)),
+ Ok(None) => return None,
+ Ok(Some(m)) => m,
+ };
+ if m.is_empty() {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ self.last_end = if self.re.utf8 {
+ crate::util::next_utf8(self.text, m.end())
+ } else {
+ m.end() + 1
+ };
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match.
+ if Some(m.end()) == self.last_match {
+ return self.next();
+ }
+ } else {
+ self.last_end = m.end();
+ }
+ self.last_match = Some(m.end());
+ Some(Ok(m))
+ }
+}
+
+/// An iterator over all overlapping matches for a particular fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> {
+ re: &'r Regex<A, P>,
+ scanner: Option<prefilter::Scanner<'r>>,
+ text: &'t [u8],
+ last_end: usize,
+ state: OverlappingState,
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter>
+ TryFindOverlappingMatches<'r, 't, A, P>
+{
+ fn new(
+ re: &'r Regex<A, P>,
+ text: &'t [u8],
+ ) -> TryFindOverlappingMatches<'r, 't, A, P> {
+ let scanner = re.scanner();
+ TryFindOverlappingMatches {
+ re,
+ scanner,
+ text,
+ last_end: 0,
+ state: OverlappingState::start(),
+ }
+ }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+ for TryFindOverlappingMatches<'r, 't, A, P>
+{
+ type Item = Result<MultiMatch, MatchError>;
+
+ fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+ if self.last_end > self.text.len() {
+ return None;
+ }
+ let result = self.re.try_find_overlapping_at_imp(
+ self.scanner.as_mut(),
+ self.text,
+ self.last_end,
+ self.text.len(),
+ &mut self.state,
+ );
+ let m = match result {
+ Err(err) => return Some(Err(err)),
+ Ok(None) => return None,
+ Ok(Some(m)) => m,
+ };
+ // Unlike the non-overlapping case, we're OK with empty matches at this
+ // level. In particular, the overlapping search algorithm is itself
+ // responsible for ensuring that progress is always made.
+ self.last_end = m.end();
+ Some(Ok(m))
+ }
+}
+
+/// The configuration used for compiling a DFA-backed regex.
+///
+/// A regex configuration is a simple data object that is typically used with
+/// [`Builder::configure`].
+#[cfg(feature = "alloc")]
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+ utf8: Option<bool>,
+}
+
+#[cfg(feature = "alloc")]
+impl Config {
+ /// Return a new default regex compiler configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Whether to enable UTF-8 mode or not.
+ ///
+ /// When UTF-8 mode is enabled (the default) and an empty match is seen,
+ /// the iterators on [`Regex`] will always start the next search at the
+ /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
+ /// mode is disabled, such searches are begun at the next byte offset.
+ ///
+ /// If this mode is enabled and invalid UTF-8 is given to search, then
+ /// behavior is unspecified.
+ ///
+ /// Generally speaking, one should enable this when
+ /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
+ /// and
+ /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+ /// are enabled, and disable it otherwise.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates the differences between when this option is
+ /// enabled and disabled. The differences only arise when the regex can
+ /// return matches of length zero.
+ ///
+ /// In this first snippet, we show the results when UTF-8 mode is disabled.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8(false))
+ /// .build(r"")?;
+ /// let haystack = "a☃z".as_bytes();
+ /// let mut it = re.find_leftmost_iter(haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And in this snippet, we execute the same search on the same haystack,
+ /// but with UTF-8 mode enabled. Notice that byte offsets that would
+ /// otherwise split the encoding of `☃` are not returned.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8(true))
+ /// .build(r"")?;
+ /// let haystack = "a☃z".as_bytes();
+ /// let mut it = re.find_leftmost_iter(haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn utf8(mut self, yes: bool) -> Config {
+ self.utf8 = Some(yes);
+ self
+ }
+
+ /// Returns true if and only if this configuration has UTF-8 mode enabled.
+ ///
+ /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
+ /// [`Regex`] will always start the next search at the next UTF-8 encoded
+ /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
+ /// next byte offset.
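+    ///
+    /// # Example
+    ///
+    /// A quick sketch of the default and an explicit override:
+    ///
+    /// ```
+    /// use regex_automata::dfa::regex::Regex;
+    ///
+    /// assert!(Regex::config().get_utf8()); // enabled by default
+    /// assert!(!Regex::config().utf8(false).get_utf8());
+    /// ```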
+ pub fn get_utf8(&self) -> bool {
+ self.utf8.unwrap_or(true)
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
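+    ///
+    /// For example, here is a sketch of the merge semantics (illustrative
+    /// only, since this method is crate-internal):
+    ///
+    /// ```ignore
+    /// let base = Config::new().utf8(false);
+    /// // `o` leaves utf8 unset, so the value from `base` wins.
+    /// assert!(!base.overwrite(Config::new()).get_utf8());
+    /// ```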
+ pub(crate) fn overwrite(self, o: Config) -> Config {
+ Config { utf8: o.utf8.or(self.utf8) }
+ }
+}
+
+/// A builder for a regex based on deterministic finite automatons.
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction, the DFA construction and finally the regex searching
+/// itself. This builder is different from a general purpose regex builder in
+/// that it permits fine grained configuration of the construction process.
+/// The trade-off for this is complexity, and the possibility of setting a
+/// configuration that might not make sense. For example, there are three
+/// different UTF-8 modes:
+///
+/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
+/// pattern itself can contain sub-expressions that match invalid UTF-8.
+/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+/// controls whether the implicit unanchored prefix added to the NFA can
+/// match through invalid UTF-8 or not.
+/// * [`Config::utf8`] controls how the regex iterators themselves advance
+/// the starting position of the next search when a match with zero length is
+/// found.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// Internally, building a regex requires building two DFAs, where one is
+/// responsible for finding the end of a match and the other is responsible
+/// for finding the start of a match. If you only need to detect whether
+/// something matched, or only the end of a match, then you should use a
+/// [`dense::Builder`] to construct a single DFA, which is cheaper than
+/// building two DFAs.
+///
+/// # Build methods
+///
+/// This builder has a few "build" methods. In general, it's the result of
+/// combining the following parameters:
+///
+/// * Building one or many regexes.
+/// * Building a regex with dense or sparse DFAs.
+///
+/// The simplest "build" method is [`Builder::build`]. It accepts a single
+/// pattern and builds a regex backed by dense DFAs.
+///
+/// The most general "build" method is [`Builder::build_many`], which permits
+/// building a regex that searches for multiple patterns simultaneously.
+///
+/// The most flexible "build" method, but hardest to use, is
+/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
+/// just a pair of DFAs, and this method allows you to specify those DFAs
+/// exactly.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
+/// the regex itself. This is generally what you want for matching on
+/// arbitrary bytes.
+///
+/// ```
+/// use regex_automata::{
+/// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
+/// };
+///
+/// let re = Regex::builder()
+/// .configure(Regex::config().utf8(false))
+/// .syntax(SyntaxConfig::new().utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo(?-u:[^b])ar.*")?;
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(MultiMatch::must(0, 1, 9));
+/// let got = re.find_leftmost(haystack);
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this. Notice also that the
+/// // search was unanchored and skipped over invalid UTF-8.
+/// // Disabling UTF-8 on the Thompson NFA permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on Config, since that
+/// // only impacts regexes that can produce matches of
+/// // length 0.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ dfa: dense::Builder,
+}
+
+#[cfg(feature = "alloc")]
+impl Builder {
+ /// Create a new regex builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder { config: Config::default(), dfa: dense::Builder::new() }
+ }
+
+ /// Build a regex from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build(&self, pattern: &str) -> Result<Regex, Error> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a regex from the given pattern using sparse DFAs.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build_sparse(
+ &self,
+ pattern: &str,
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+ self.build_many_sparse(&[pattern])
+ }
+
+ /// Build a regex from the given patterns.
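+    ///
+    /// # Example
+    ///
+    /// A small sketch of multi-pattern searching. The offsets below assume
+    /// the default leftmost-first match semantics:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder().build_many(&["[0-9]+", "[a-z]+"])?;
+    /// let mut it = re.find_leftmost_iter(b"abc 123");
+    /// assert_eq!(Some(MultiMatch::must(1, 0, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 7)), it.next());
+    /// assert_eq!(None, it.next());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```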
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<Regex, Error> {
+ let forward = self.dfa.build_many(patterns)?;
+ let reverse = self
+ .dfa
+ .clone()
+ .configure(
+ dense::Config::new()
+ .anchored(true)
+ .match_kind(MatchKind::All)
+ .starts_for_each_pattern(true),
+ )
+ .thompson(thompson::Config::new().reverse(true))
+ .build_many(patterns)?;
+ Ok(self.build_from_dfas(forward, reverse))
+ }
+
+ /// Build a sparse regex from the given patterns.
+ pub fn build_many_sparse<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+ let re = self.build_many(patterns)?;
+ let forward = re.forward().to_sparse()?;
+ let reverse = re.reverse().to_sparse()?;
+ Ok(self.build_from_dfas(forward, reverse))
+ }
+
+ /// Build a regex from its component forward and reverse DFAs.
+ ///
+ /// This is useful when deserializing a regex from some arbitrary
+ /// memory region. This is also useful for building regexes from other
+ /// types of DFAs.
+ ///
+ /// If you're building the DFAs from scratch instead of building new DFAs
+ /// from other DFAs, then you'll need to make sure that the reverse DFA is
+ /// configured correctly to match the intended semantics. Namely:
+ ///
+ /// * It should be anchored.
+ /// * It should use [`MatchKind::All`] semantics.
+ /// * It should match in reverse.
+ /// * It should have anchored start states compiled for each pattern.
+ /// * Otherwise, its configuration should match the forward DFA.
+ ///
+    /// If these conditions aren't satisfied, then the behavior of searches is
+    /// unspecified.
+ ///
+    /// Note that when using this constructor, only the configuration from
+    /// [`Config`] is applied. The other configuration settings on this
+    /// builder apply only when the builder owns the construction of the DFAs
+    /// themselves.
+ ///
+ /// # Example
+ ///
+    /// This example is a bit contrived. The usual use of these methods
+ /// would involve serializing `initial_re` somewhere and then deserializing
+ /// it later to build a regex. But in this case, we do everything in
+ /// memory.
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
+ /// let re = Regex::builder().build_from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This example shows how to build a `Regex` that uses sparse DFAs instead
+ /// of dense DFAs without using one of the convenience `build_sparse`
+ /// routines:
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let fwd = initial_re.forward().to_sparse()?;
+ /// let rev = initial_re.reverse().to_sparse()?;
+ /// let re = Regex::builder().build_from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_dfas<A: Automaton>(
+ &self,
+ forward: A,
+ reverse: A,
+ ) -> Regex<A> {
+ let utf8 = self.config.get_utf8();
+ Regex { prefilter: None, forward, reverse, utf8 }
+ }
+
+ /// Apply the given regex configuration options to this builder.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`SyntaxConfig`](crate::SyntaxConfig).
+ ///
+    /// This permits setting things like case insensitivity, Unicode and
+    /// multi-line mode.
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::SyntaxConfig,
+ ) -> &mut Builder {
+ self.dfa.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](thompson::Config).
+ ///
+ /// This permits setting things like whether additional time should be
+ /// spent shrinking the size of the NFA.
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.dfa.thompson(config);
+ self
+ }
+
+ /// Set the dense DFA compilation configuration for this builder using
+ /// [`dense::Config`](dense::Config).
+ ///
+ /// This permits setting things like whether the underlying DFAs should
+ /// be minimized.
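+    ///
+    /// # Example
+    ///
+    /// A brief sketch that enables DFA minimization (this assumes the
+    /// `minimize` option on [`dense::Config`]):
+    ///
+    /// ```
+    /// use regex_automata::dfa::{dense, regex::Regex};
+    ///
+    /// let re = Regex::builder()
+    ///     .dense(dense::Config::new().minimize(true))
+    ///     .build("foo[0-9]+")?;
+    /// assert!(re.is_match(b"foo123"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```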
+ pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
+ self.dfa.configure(config);
+ self
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
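+/// Unwraps one step of a fallible search iterator into an
+/// `Option<MultiMatch>`, panicking if the underlying search reported an
+/// error. This is how the infallible iterators surface errors from the
+/// fallible searches they wrap.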
+#[inline(always)]
+fn next_unwrap(
+ item: Option<Result<MultiMatch, MatchError>>,
+) -> Option<MultiMatch> {
+ match item {
+ None => None,
+ Some(Ok(m)) => Some(m),
+ Some(Err(err)) => panic!(
+ "unexpected regex search error: {}\n\
+ to handle search errors, use try_ methods",
+ err,
+ ),
+ }
+}
diff --git a/vendor/regex-automata/src/dfa/search.rs b/vendor/regex-automata/src/dfa/search.rs
new file mode 100644
index 000000000..492414981
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/search.rs
@@ -0,0 +1,493 @@
+use crate::{
+ dfa::{
+ accel,
+ automaton::{Automaton, OverlappingState, StateMatch},
+ },
+ util::{
+ id::{PatternID, StateID},
+ matchtypes::HalfMatch,
+ prefilter, MATCH_OFFSET,
+ },
+ MatchError,
+};
+
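+/// Executes a forward search with "earliest" semantics: the search reports a
+/// match as soon as any match state is entered, rather than continuing to
+/// look for a longer (leftmost) match.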
+#[inline(never)]
+pub fn find_earliest_fwd<A: Automaton + ?Sized>(
+ pre: Option<&mut prefilter::Scanner>,
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Searching with a pattern ID is always anchored, so we should only ever
+    // use a prefilter when no pattern ID is given.
+ if pre.is_some() && pattern_id.is_none() {
+ find_fwd(pre, true, dfa, pattern_id, bytes, start, end)
+ } else {
+ find_fwd(None, true, dfa, pattern_id, bytes, start, end)
+ }
+}
+
+#[inline(never)]
+pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
+ pre: Option<&mut prefilter::Scanner>,
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Searching with a pattern ID is always anchored, so we should only ever
+    // use a prefilter when no pattern ID is given.
+ if pre.is_some() && pattern_id.is_none() {
+ find_fwd(pre, false, dfa, pattern_id, bytes, start, end)
+ } else {
+ find_fwd(None, false, dfa, pattern_id, bytes, start, end)
+ }
+}
+
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters
+/// getting inlined eliminate some critical branches. To avoid bloating binary
+/// size, we only call this function in a fixed number of places.
+#[inline(always)]
+fn find_fwd<A: Automaton + ?Sized>(
+ mut pre: Option<&mut prefilter::Scanner>,
+ earliest: bool,
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= haystack.len());
+ assert!(end <= haystack.len());
+
+ // Why do this? This lets 'bytes[at]' work without bounds checks below.
+ // It seems the assert on 'end <= haystack.len()' above is otherwise
+ // not enough. Why not just make 'bytes' scoped this way anyway? Well,
+ // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
+ // for resolving look-ahead.
+ let bytes = &haystack[..end];
+
+ let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?;
+ let mut last_match = None;
+ let mut at = start;
+ if let Some(ref mut pre) = pre {
+ // If a prefilter doesn't report false positives, then we don't need to
+ // touch the DFA at all. However, since all matches include the pattern
+ // ID, and the prefilter infrastructure doesn't report pattern IDs, we
+ // limit this optimization to cases where there is exactly one pattern.
+ // In that case, any match must be the 0th pattern.
+ if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
+ return Ok(pre.next_candidate(bytes, at).into_option().map(
+ |offset| HalfMatch { pattern: PatternID::ZERO, offset },
+ ));
+ } else if pre.is_effective(at) {
+ match pre.next_candidate(bytes, at).into_option() {
+ None => return Ok(None),
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ while at < end {
+ let byte = bytes[at];
+ state = dfa.next_state(state, byte);
+ at += 1;
+ if dfa.is_special_state(state) {
+ if dfa.is_start_state(state) {
+ if let Some(ref mut pre) = pre {
+ if pre.is_effective(at) {
+ match pre.next_candidate(bytes, at).into_option() {
+ None => return Ok(None),
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ } else if dfa.is_accel_state(state) {
+ let needles = dfa.accelerator(state);
+ at = accel::find_fwd(needles, bytes, at)
+ .unwrap_or(bytes.len());
+ }
+ } else if dfa.is_match_state(state) {
+ last_match = Some(HalfMatch {
+ pattern: dfa.match_pattern(state, 0),
+ offset: at - MATCH_OFFSET,
+ });
+ if earliest {
+ return Ok(last_match);
+ }
+ if dfa.is_accel_state(state) {
+ let needles = dfa.accelerator(state);
+ at = accel::find_fwd(needles, bytes, at)
+ .unwrap_or(bytes.len());
+ }
+ } else if dfa.is_accel_state(state) {
+            let needles = dfa.accelerator(state);
+            at = accel::find_fwd(needles, bytes, at).unwrap_or(bytes.len());
+ } else if dfa.is_dead_state(state) {
+ return Ok(last_match);
+ } else {
+ debug_assert!(dfa.is_quit_state(state));
+ if last_match.is_some() {
+ return Ok(last_match);
+ }
+ return Err(MatchError::Quit { byte, offset: at - 1 });
+ }
+ }
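+        // Opportunistically skip ahead: as long as the next byte maps the
+        // current state back to itself, nothing new can happen, so consume
+        // the entire run of self-looping bytes here.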
+ while at < end && dfa.next_state(state, bytes[at]) == state {
+ at += 1;
+ }
+ }
+ Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match))
+}
+
+#[inline(never)]
+pub fn find_earliest_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ find_rev(true, dfa, pattern_id, bytes, start, end)
+}
+
+#[inline(never)]
+pub fn find_leftmost_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ find_rev(false, dfa, pattern_id, bytes, start, end)
+}
+
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
+/// permits eliminating a few crucial branches.
+#[inline(always)]
+fn find_rev<A: Automaton + ?Sized>(
+ earliest: bool,
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= bytes.len());
+ assert!(end <= bytes.len());
+
+ let mut state = init_rev(dfa, pattern_id, bytes, start, end)?;
+ let mut last_match = None;
+ let mut at = end;
+ while at > start {
+ at -= 1;
+ while at > start && dfa.next_state(state, bytes[at]) == state {
+ at -= 1;
+ }
+
+ let byte = bytes[at];
+ state = dfa.next_state(state, byte);
+ if dfa.is_special_state(state) {
+ if dfa.is_start_state(state) {
+ if dfa.is_accel_state(state) {
+ let needles = dfa.accelerator(state);
+ at = accel::find_rev(needles, bytes, at)
+ .map(|i| i + 1)
+ .unwrap_or(0);
+ }
+ } else if dfa.is_match_state(state) {
+ last_match = Some(HalfMatch {
+ pattern: dfa.match_pattern(state, 0),
+ offset: at + MATCH_OFFSET,
+ });
+ if earliest {
+ return Ok(last_match);
+ }
+ if dfa.is_accel_state(state) {
+ let needles = dfa.accelerator(state);
+ at = accel::find_rev(needles, bytes, at)
+ .map(|i| i + 1)
+ .unwrap_or(0);
+ }
+ } else if dfa.is_accel_state(state) {
+ let needles = dfa.accelerator(state);
+ at = accel::find_rev(needles, bytes, at)
+ .map(|i| i + 1)
+ .unwrap_or(0);
+ } else if dfa.is_dead_state(state) {
+ return Ok(last_match);
+ } else {
+ debug_assert!(dfa.is_quit_state(state));
+ if last_match.is_some() {
+ return Ok(last_match);
+ }
+ return Err(MatchError::Quit { byte, offset: at });
+ }
+ }
+ }
+ Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match))
+}
+
+#[inline(never)]
+pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
+ pre: Option<&mut prefilter::Scanner>,
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+ // Searching with a pattern ID is always anchored, so we should only ever
+ // use a prefilter when no pattern ID is given.
+ if pre.is_some() && pattern_id.is_none() {
+ find_overlapping_fwd_imp(
+ pre,
+ dfa,
+ pattern_id,
+ bytes,
+ start,
+ end,
+ caller_state,
+ )
+ } else {
+ find_overlapping_fwd_imp(
+ None,
+ dfa,
+ pattern_id,
+ bytes,
+ start,
+ end,
+ caller_state,
+ )
+ }
+}
+
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined
+/// permits eliminating a few crucial branches and reduces code size when it is
+/// not used.
+#[inline(always)]
+fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
+ mut pre: Option<&mut prefilter::Scanner>,
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ mut start: usize,
+ end: usize,
+ caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= bytes.len());
+ assert!(end <= bytes.len());
+
+ let mut state = match caller_state.id() {
+ None => init_fwd(dfa, pattern_id, bytes, start, end)?,
+ Some(id) => {
+ if let Some(last) = caller_state.last_match() {
+ let match_count = dfa.match_count(id);
+ if last.match_index < match_count {
+ let m = HalfMatch {
+ pattern: dfa.match_pattern(id, last.match_index),
+ offset: last.offset,
+ };
+ last.match_index += 1;
+ return Ok(Some(m));
+ }
+ }
+
+ // This is a subtle but critical detail. If the caller provides a
+ // non-None state ID, then it must be the case that the state ID
+ // corresponds to one set by this function. The state ID therefore
+ // corresponds to a match state, a dead state or some other state.
+ // However, "some other" state _only_ occurs when the input has
+ // been exhausted because the only way to stop before then is to
+ // see a match or a dead/quit state.
+ //
+ // If the input is exhausted or if it's a dead state, then
+ // incrementing the starting position has no relevance on
+ // correctness, since the loop below will either not execute
+ // at all or will immediately stop due to being in a dead state.
+ // (Once in a dead state it is impossible to leave it.)
+ //
+ // Therefore, the only case we need to consider is when
+ // caller_state is a match state. In this case, since our machines
+ // support the ability to delay a match by a certain number of
+ // bytes (to support look-around), it follows that we actually
+ // consumed that many additional bytes on our previous search. When
+ // the caller resumes their search to find subsequent matches, they
+ // will use the ending location from the previous match as the next
+ // starting point, which is `MATCH_OFFSET` bytes PRIOR to where
+ // we scanned to on the previous search. Therefore, we need to
+ // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
+ //
+ // Incidentally, since MATCH_OFFSET is non-zero, this also makes
+ // dealing with empty matches convenient. Namely, callers needn't
+ // special case them when implementing an iterator. Instead, this
+ // ensures that forward progress is always made.
+ start += MATCH_OFFSET;
+ id
+ }
+ };
+
+ let mut at = start;
+ while at < end {
+ let byte = bytes[at];
+ state = dfa.next_state(state, byte);
+ at += 1;
+ if dfa.is_special_state(state) {
+ caller_state.set_id(state);
+ if dfa.is_start_state(state) {
+ if let Some(ref mut pre) = pre {
+ if pre.is_effective(at) {
+ match pre.next_candidate(bytes, at).into_option() {
+ None => return Ok(None),
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ } else if dfa.is_accel_state(state) {
+ let needles = dfa.accelerator(state);
+ at = accel::find_fwd(needles, bytes, at)
+ .unwrap_or(bytes.len());
+ }
+ } else if dfa.is_match_state(state) {
+ let offset = at - MATCH_OFFSET;
+ caller_state
+ .set_last_match(StateMatch { match_index: 1, offset });
+ return Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(state, 0),
+ offset,
+ }));
+ } else if dfa.is_accel_state(state) {
+                let needles = dfa.accelerator(state);
+                at = accel::find_fwd(needles, bytes, at)
+                    .unwrap_or(bytes.len());
+ } else if dfa.is_dead_state(state) {
+ return Ok(None);
+ } else {
+ debug_assert!(dfa.is_quit_state(state));
+ return Err(MatchError::Quit { byte, offset: at - 1 });
+ }
+ }
+ }
+
+ let result = eoi_fwd(dfa, bytes, end, &mut state);
+ caller_state.set_id(state);
+ if let Ok(Some(ref last_match)) = result {
+ caller_state.set_last_match(StateMatch {
+ match_index: 1,
+ offset: last_match.offset(),
+ });
+ }
+ result
+}
+
+fn init_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<StateID, MatchError> {
+ let state = dfa.start_state_forward(pattern_id, bytes, start, end);
+ // Start states can never be match states, since all matches are delayed
+ // by 1 byte.
+ assert!(!dfa.is_match_state(state));
+ Ok(state)
+}
+
+fn init_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<StateID, MatchError> {
+ let state = dfa.start_state_reverse(pattern_id, bytes, start, end);
+ // Start states can never be match states, since all matches are delayed
+ // by 1 byte.
+ assert!(!dfa.is_match_state(state));
+ Ok(state)
+}
+
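+/// Finishes a forward search at `end` by feeding the DFA the byte at `end`
+/// (when one exists) or the special end-of-input transition (when `end` is
+/// the end of the haystack). This is how matches that were delayed for
+/// look-ahead, e.g., for `$`, are resolved.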
+fn eoi_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ bytes: &[u8],
+ end: usize,
+ state: &mut StateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+ match bytes.get(end) {
+ Some(&b) => {
+ *state = dfa.next_state(*state, b);
+ if dfa.is_match_state(*state) {
+ Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(*state, 0),
+ offset: end,
+ }))
+ } else {
+ Ok(None)
+ }
+ }
+ None => {
+ *state = dfa.next_eoi_state(*state);
+ if dfa.is_match_state(*state) {
+ Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(*state, 0),
+ offset: bytes.len(),
+ }))
+ } else {
+ Ok(None)
+ }
+ }
+ }
+}
+
+fn eoi_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ bytes: &[u8],
+ start: usize,
+ state: StateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+ if start > 0 {
+ let state = dfa.next_state(state, bytes[start - 1]);
+ if dfa.is_match_state(state) {
+ Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(state, 0),
+ offset: start,
+ }))
+ } else {
+ Ok(None)
+ }
+ } else {
+ let state = dfa.next_eoi_state(state);
+ if dfa.is_match_state(state) {
+ Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(state, 0),
+ offset: 0,
+ }))
+ } else {
+ Ok(None)
+ }
+ }
+}
+
+// Currently unused, but is useful to keep around. This was originally used
+// when the code above used raw pointers for its main loop.
+// /// Returns the distance between the given pointer and the start of `bytes`.
+// /// This assumes that the given pointer points to somewhere in the `bytes`
+// /// slice given.
+// fn offset(bytes: &[u8], p: *const u8) -> usize {
+// debug_assert!(bytes.as_ptr() <= p);
+// debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
+// ((p as isize) - (bytes.as_ptr() as isize)) as usize
+// }
diff --git a/vendor/regex-automata/src/dfa/search_unsafe.rs b/vendor/regex-automata/src/dfa/search_unsafe.rs
new file mode 100644
index 000000000..ea1c29ff7
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/search_unsafe.rs
@@ -0,0 +1,321 @@
+use crate::dfa::automaton::{Automaton, State};
+use crate::MatchError;
+
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
+/// permits eliminating a few crucial branches.
+#[inline(always)]
+pub fn find_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ earliest: bool,
+) -> Result<Option<usize>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= bytes.len());
+ assert!(end <= bytes.len());
+
+ let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?;
+ if earliest && last_match.is_some() {
+ return Ok(last_match);
+ }
+
+ let mut at = start;
+ while at < end {
+ let byte = bytes[at];
+ state = dfa.next_state(state, byte);
+ at += 1;
+ if dfa.is_special_state(state) {
+ if dfa.is_dead_state(state) {
+ return Ok(last_match);
+ } else if dfa.is_quit_state(state) {
+ return Err(MatchError::Quit { byte, offset: at - 1 });
+ }
+ last_match = Some(at - dfa.match_offset());
+ if earliest {
+ return Ok(last_match);
+ }
+ }
+ }
+ /*
+ unsafe {
+ let mut p = bytes.as_ptr().add(start);
+ while p < bytes[end..].as_ptr() {
+ let byte = *p;
+ state = dfa.next_state_unchecked(state, byte);
+ p = p.add(1);
+ if dfa.is_special_state(state) {
+ if dfa.is_dead_state(state) {
+ return Ok(last_match);
+ } else if dfa.is_quit_state(state) {
+ return Err(MatchError::Quit {
+ byte,
+ offset: offset(bytes, p) - 1,
+ });
+ }
+ last_match = Some(offset(bytes, p) - dfa.match_offset());
+ if earliest {
+ return Ok(last_match);
+ }
+ }
+ }
+ }
+ */
+ Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match))
+}
+
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
+/// permits eliminating a few crucial branches.
+#[inline(always)]
+pub fn find_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ earliest: bool,
+) -> Result<Option<usize>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= bytes.len());
+ assert!(end <= bytes.len());
+
+ let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?;
+ if earliest && last_match.is_some() {
+ return Ok(last_match);
+ }
+
+ let mut at = end;
+ while at > start {
+ at -= 1;
+ let byte = bytes[at];
+ state = dfa.next_state(state, byte);
+ if dfa.is_special_state(state) {
+ if dfa.is_dead_state(state) {
+ return Ok(last_match);
+ } else if dfa.is_quit_state(state) {
+ return Err(MatchError::Quit { byte, offset: at });
+ }
+ last_match = Some(at + dfa.match_offset());
+ if earliest {
+ return Ok(last_match);
+ }
+ }
+ }
+ /*
+ unsafe {
+ let mut p = bytes.as_ptr().add(end);
+ while p > bytes[start..].as_ptr() {
+ p = p.sub(1);
+ let byte = *p;
+ state = dfa.next_state_unchecked(state, byte);
+ if dfa.is_special_state(state) {
+ if dfa.is_dead_state(state) {
+ return Ok(last_match);
+ } else if dfa.is_quit_state(state) {
+ return Err(MatchError::Quit {
+ byte,
+ offset: offset(bytes, p),
+ });
+ }
+ last_match = Some(offset(bytes, p) + dfa.match_offset());
+ if earliest {
+ return Ok(last_match);
+ }
+ }
+ }
+ }
+ */
+ Ok(eof_rev(dfa, state, bytes, start)?.or(last_match))
+}
+
+pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ bytes: &[u8],
+ mut start: usize,
+ end: usize,
+ caller_state: &mut State<A::ID>,
+) -> Result<Option<usize>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= bytes.len());
+ assert!(end <= bytes.len());
+
+ let (mut state, mut last_match) = match caller_state.as_option() {
+ None => init_fwd(dfa, bytes, start, end)?,
+ Some(id) => {
+ // This is a subtle but critical detail. If the caller provides a
+ // non-None state ID, then it must be the case that the state ID
+ // corresponds to one set by this function. The state ID therefore
+ // corresponds to a match state, a dead state or some other state.
+ // However, "some other" state _only_ occurs when the input has
+ // been exhausted because the only way to stop before then is to
+ // see a match or a dead/quit state.
+ //
+ // If the input is exhausted or if it's a dead state, then
+ // incrementing the starting position has no relevance on
+ // correctness, since the loop below will either not execute
+ // at all or will immediately stop due to being in a dead state.
+ // (Once in a dead state it is impossible to leave it.)
+ //
+ // Therefore, the only case we need to consider is when
+ // caller_state is a match state. In this case, since our machines
+ // support the ability to delay a match by a certain number of
+ // bytes (to support look-around), it follows that we actually
+ // consumed that many additional bytes on our previous search. When
+ // the caller resumes their search to find subsequent matches, they
+ // will use the ending location from the previous match as the next
+ // starting point, which is `match_offset` bytes PRIOR to where
+ // we scanned to on the previous search. Therefore, we need to
+ // compensate by bumping `start` up by `match_offset` bytes.
+ start += dfa.match_offset();
+ // Since match_offset could be any arbitrary value and we use
+ // `start` in pointer arithmetic below, we check that we are still
+ // in bounds. Otherwise, we could materialize a pointer that is
+ // more than one past the end point of `bytes`, which is UB.
+ if start > end {
+ return Ok(None);
+ }
+ (id, None)
+ }
+ };
+ if last_match.is_some() {
+ caller_state.set(state);
+ return Ok(last_match);
+ }
+
+ let mut at = start;
+ while at < end {
+ let byte = bytes[at];
+ state = dfa.next_state(state, byte);
+ at += 1;
+ if dfa.is_special_state(state) {
+ caller_state.set(state);
+ if dfa.is_dead_state(state) {
+ return Ok(None);
+ } else if dfa.is_quit_state(state) {
+ return Err(MatchError::Quit { byte, offset: at - 1 });
+ } else {
+ return Ok(Some(at - dfa.match_offset()));
+ }
+ }
+ }
+ /*
+ // SAFETY: Other than the normal pointer arithmetic happening here, a
+ // unique aspect of safety for this function is the fact that the caller
+ // can provide the state that the search routine will start with. If this
+ // state were invalid, it would be possible to incorrectly index the
+ // transition table. We however prevent this from happening by guaranteeing
+ // that State is valid. Namely, callers cannot mutate a State. All they can
+ // do is create a "start" state or otherwise reuse a previously set state.
+ // Since callers can't mutate a state, it follows that a previously set
+ // state can only be retrieved by crate internal functions. Therefore, our
+ // use of it is safe since this code will only ever set the provided state
+ // to a valid state.
+ unsafe {
+ let mut p = bytes.as_ptr().add(start);
+ while p < bytes[end..].as_ptr() {
+ let byte = *p;
+ state = dfa.next_state_unchecked(state, byte);
+ p = p.add(1);
+ if dfa.is_special_state(state) {
+ caller_state.set(state);
+ return if dfa.is_dead_state(state) {
+ Ok(None)
+ } else if dfa.is_quit_state(state) {
+ Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 })
+ } else {
+ Ok(Some(offset(bytes, p) - dfa.match_offset()))
+ };
+ }
+ }
+ }
+ */
+
+ let result = eof_fwd(dfa, bytes, end, &mut state);
+ caller_state.set(state);
+ result
+}
+
+fn init_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<(A::ID, Option<usize>), MatchError> {
+ let state = dfa.start_state_forward(bytes, start, end);
+ if dfa.is_match_state(state) {
+ Ok((state, Some(start - dfa.match_offset())))
+ } else {
+ Ok((state, None))
+ }
+}
+
+fn init_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<(A::ID, Option<usize>), MatchError> {
+ let state = dfa.start_state_reverse(bytes, start, end);
+ if dfa.is_match_state(state) {
+ Ok((state, Some(end + dfa.match_offset())))
+ } else {
+ Ok((state, None))
+ }
+}
+
+fn eof_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ bytes: &[u8],
+ end: usize,
+ state: &mut A::ID,
+) -> Result<Option<usize>, MatchError> {
+ match bytes.get(end) {
+ Some(&b) => {
+ *state = dfa.next_state(*state, b);
+ if dfa.is_match_state(*state) {
+ Ok(Some(end))
+ } else {
+ Ok(None)
+ }
+ }
+ None => {
+ *state = dfa.next_eof_state(*state);
+ if dfa.is_match_state(*state) {
+ Ok(Some(bytes.len()))
+ } else {
+ Ok(None)
+ }
+ }
+ }
+}
+
+fn eof_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ state: A::ID,
+ bytes: &[u8],
+ start: usize,
+) -> Result<Option<usize>, MatchError> {
+ if start > 0 {
+ if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) {
+ Ok(Some(start))
+ } else {
+ Ok(None)
+ }
+ } else {
+ if dfa.is_match_state(dfa.next_eof_state(state)) {
+ Ok(Some(0))
+ } else {
+ Ok(None)
+ }
+ }
+}
+
+/// Returns the distance between the given pointer and the start of `bytes`.
+/// This assumes that the given pointer points to somewhere in the `bytes`
+/// slice given.
+fn offset(bytes: &[u8], p: *const u8) -> usize {
+ debug_assert!(bytes.as_ptr() <= p);
+ debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
+ ((p as isize) - (bytes.as_ptr() as isize)) as usize
+}
diff --git a/vendor/regex-automata/src/dfa/sparse.rs b/vendor/regex-automata/src/dfa/sparse.rs
new file mode 100644
index 000000000..346606987
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/sparse.rs
@@ -0,0 +1,2283 @@
+/*!
+Types and routines specific to sparse DFAs.
+
+This module is the home of [`sparse::DFA`](DFA).
+
+Unlike the [`dense`](super::dense) module, this module does not contain a
+builder or configuration specific for sparse DFAs. Instead, the intended
+way to build a sparse DFA is either by using a default configuration with
+its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the
+construction of a dense DFA with [`dense::Builder`](super::dense::Builder)
+and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For
+example, this configures a sparse DFA to do an overlapping search:
+
+```
+use regex_automata::{
+ dfa::{Automaton, OverlappingState, dense},
+ HalfMatch, MatchKind,
+};
+
+let dense_re = dense::Builder::new()
+ .configure(dense::Config::new().match_kind(MatchKind::All))
+ .build(r"Samwise|Sam")?;
+let sparse_re = dense_re.to_sparse()?;
+
+// Set up our haystack and initial start state.
+let haystack = b"Samwise";
+let mut state = OverlappingState::start();
+
+// First, 'Sam' will match.
+let end1 = sparse_re.find_overlapping_fwd_at(
+ None, None, haystack, 0, haystack.len(), &mut state,
+)?;
+assert_eq!(end1, Some(HalfMatch::must(0, 3)));
+
+// And now 'Samwise' will match.
+let end2 = sparse_re.find_overlapping_fwd_at(
+ None, None, haystack, 3, haystack.len(), &mut state,
+)?;
+assert_eq!(end2, Some(HalfMatch::must(0, 7)));
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+*/
+
+#[cfg(feature = "alloc")]
+use core::iter;
+use core::{
+ convert::{TryFrom, TryInto},
+ fmt,
+ mem::size_of,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::{collections::BTreeSet, vec, vec::Vec};
+
+#[cfg(feature = "alloc")]
+use crate::dfa::{dense, error::Error};
+use crate::{
+ dfa::{
+ automaton::{fmt_state_indicator, Automaton},
+ special::Special,
+ DEAD,
+ },
+ util::{
+ alphabet::ByteClasses,
+ bytes::{self, DeserializeError, Endian, SerializeError},
+ id::{PatternID, StateID},
+ start::Start,
+ DebugByte,
+ },
+};
+
+const LABEL: &str = "rust-regex-automata-dfa-sparse";
+const VERSION: u32 = 2;
+
+/// A sparse deterministic finite automaton (DFA) with variable sized states.
+///
+/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses
+/// a more space-efficient representation for its transitions. Consequently,
+/// sparse DFAs may use much less memory than dense DFAs, but this comes at a
+/// price. In particular, reading the more space-efficient transitions takes
+/// more work, and so searching using a sparse DFA is typically slower than
+/// with a dense DFA.
+///
+/// A sparse DFA can be built using the default configuration via the
+/// [`DFA::new`] constructor. Otherwise, one can configure various aspects
+/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder),
+/// and then convert a dense DFA to a sparse DFA using
+/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse).
+///
+/// In general, a sparse DFA supports all the same search operations as a dense
+/// DFA.
+///
+/// Making the choice between a dense and sparse DFA depends on your specific
+/// workload. If you can sacrifice a bit of search time performance, then a
+/// sparse DFA might be the best choice. In particular, while sparse DFAs are
+/// probably always slower than dense DFAs, you may find that they are easily
+/// fast enough for your purposes!
+///
+/// # Type parameters
+///
+/// A `DFA` has one type parameter, `T`, which is used to represent the parts
+/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
+///
+/// # The `Automaton` trait
+///
+/// This type implements the [`Automaton`] trait, which means it can be used
+/// for searching. For example:
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, sparse::DFA},
+/// HalfMatch,
+/// };
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let expected = HalfMatch::must(0, 8);
+/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct DFA<T> {
+ // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
+ // representation-wise. In reality, it is perhaps more complicated. Namely,
+ // in a dense DFA, all information needs to be very cheaply accessible
+ // using only state IDs. In a sparse DFA however, each state uses a
+ // variable amount of space because each state encodes more information
+ // than just its transitions. Each state also includes an accelerator if
+ // one exists, along with the matching pattern IDs if the state is a match
+ // state.
+ //
+ // That is, a lot of the complexity is pushed down into how each state
+ // itself is represented.
+ trans: Transitions<T>,
+ starts: StartTable<T>,
+ special: Special,
+}
+
+#[cfg(feature = "alloc")]
+impl DFA<Vec<u8>> {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding sparse DFA.
+ ///
+ /// If you want a non-default configuration, then use
+ /// the [`dense::Builder`](crate::dfa::dense::Builder)
+ /// to set your own configuration, and then call
+ /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
+ /// a sparse DFA.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
+ ///
+ /// let expected = HalfMatch::must(0, 11);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> {
+ dense::Builder::new()
+ .build(pattern)
+ .and_then(|dense| dense.to_sparse())
+ }
+
+ /// Parse the given regular expressions using a default configuration and
+ /// return the corresponding multi-DFA.
+ ///
+ /// If you want a non-default configuration, then use
+ /// the [`dense::Builder`](crate::dfa::dense::Builder)
+ /// to set your own configuration, and then call
+ /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
+ /// a sparse DFA.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+ /// let expected = HalfMatch::must(1, 3);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<DFA<Vec<u8>>, Error> {
+ dense::Builder::new()
+ .build_many(patterns)
+ .and_then(|dense| dense.to_sparse())
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl DFA<Vec<u8>> {
+ /// Create a new DFA that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let dfa = sparse::DFA::always_match()?;
+ ///
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<DFA<Vec<u8>>, Error> {
+ dense::DFA::always_match()?.to_sparse()
+ }
+
+ /// Create a new sparse DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, sparse};
+ ///
+ /// let dfa = sparse::DFA::never_match()?;
+ /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
+ /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<DFA<Vec<u8>>, Error> {
+ dense::DFA::never_match()?.to_sparse()
+ }
+
+ /// The implementation for constructing a sparse DFA from a dense DFA.
+ pub(crate) fn from_dense<T: AsRef<[u32]>>(
+ dfa: &dense::DFA<T>,
+ ) -> Result<DFA<Vec<u8>>, Error> {
+ // In order to build the transition table, we need to be able to write
+ // state identifiers for each of the "next" transitions in each state.
+ // Our state identifiers correspond to the byte offset in the
+ // transition table at which the state is encoded. Therefore, we do not
+ // actually know what the state identifiers are until we've allocated
+ // exactly as much space as we need for each state. Thus, construction
+ // of the transition table happens in two passes.
+ //
+ // In the first pass, we fill out the shell of each state, which
+ // includes the transition count, the input byte ranges and zero-filled
+ // space for the transitions and accelerators, if present. In this
+ // first pass, we also build up a map from the state identifier index
+ // of the dense DFA to the state identifier in this sparse DFA.
+ //
+ // In the second pass, we fill in the transitions based on the map
+ // built in the first pass.
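+        //
+        // As a sketch of the encoding built below, each state is laid out
+        // as the following byte sequence:
+        //
+        //   ntrans (2 bytes)
+        //   | input byte ranges (2 bytes per transition)
+        //   | next state IDs (StateID::SIZE bytes per transition)
+        //   | pattern ID count (4 bytes) + pattern IDs (4 bytes each),
+        //     present only for match states
+        //   | accelerator length (1 byte) + accelerator bytes (up to 3)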
+
+ // The capacity given here reflects a minimum. (Well, the true minimum
+ // is likely even bigger, but hopefully this saves a few reallocs.)
+ let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count());
+ // This maps state indices from the dense DFA to StateIDs in the sparse
+ // DFA. We build out this map on the first pass, and then use it in the
+ // second pass to back-fill our transitions.
+ let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()];
+ for state in dfa.states() {
+ let pos = sparse.len();
+
+ remap[dfa.to_index(state.id())] =
+ StateID::new(pos).map_err(|_| Error::too_many_states())?;
+ // zero-filled space for the transition count
+ sparse.push(0);
+ sparse.push(0);
+
+ let mut transition_count = 0;
+ for (unit1, unit2, _) in state.sparse_transitions() {
+ match (unit1.as_u8(), unit2.as_u8()) {
+ (Some(b1), Some(b2)) => {
+ transition_count += 1;
+ sparse.push(b1);
+ sparse.push(b2);
+ }
+ (None, None) => {}
+ (Some(_), None) | (None, Some(_)) => {
+ // can never occur because sparse_transitions never
+ // groups EOI with any other transition.
+ unreachable!()
+ }
+ }
+ }
+ // Add dummy EOI transition. This is never actually read while
+ // searching, but having space equivalent to the total number
+ // of transitions is convenient. Otherwise, we'd need to track
+ // a different number of transitions for the byte ranges as for
+ // the 'next' states.
+ //
+ // N.B. The loop above is not guaranteed to yield the EOI
+ // transition, since it may point to a DEAD state. By putting
+ // it here, we always write the EOI transition, and thus
+            // guarantee that our transition count is >0. Why do we always
+            // need the EOI transition? Because it lets the implementation of
+            // Automaton::next_eoi_state simply ask for the last transition.
+            // There are probably other/better ways to do this.
+ transition_count += 1;
+ sparse.push(0);
+ sparse.push(0);
+
+ // Check some assumptions about transition count.
+ assert_ne!(
+ transition_count, 0,
+ "transition count should be non-zero",
+ );
+ assert!(
+ transition_count <= 257,
+ "expected transition count {} to be <= 257",
+ transition_count,
+ );
+
+ // Fill in the transition count.
+ // Since transition count is always <= 257, we use the most
+ // significant bit to indicate whether this is a match state or
+ // not.
+ let ntrans = if dfa.is_match_state(state.id()) {
+ transition_count | (1 << 15)
+ } else {
+ transition_count
+ };
+ bytes::NE::write_u16(ntrans, &mut sparse[pos..]);
+
+ // zero-fill the actual transitions.
+            // Unwraps are OK since transition_count <= 257 and our minimum
+            // supported usize size is 16 bits.
+ let zeros = usize::try_from(transition_count)
+ .unwrap()
+ .checked_mul(StateID::SIZE)
+ .unwrap();
+ sparse.extend(iter::repeat(0).take(zeros));
+
+ // If this is a match state, write the pattern IDs matched by this
+ // state.
+ if dfa.is_match_state(state.id()) {
+ let plen = dfa.match_pattern_len(state.id());
+ // Write the actual pattern IDs with a u32 length prefix.
+ // First, zero-fill space.
+ let mut pos = sparse.len();
+ // Unwraps are OK since it's guaranteed that plen <=
+ // PatternID::LIMIT, which is in turn guaranteed to fit into a
+ // u32.
+ let zeros = size_of::<u32>()
+ .checked_mul(plen)
+ .unwrap()
+ .checked_add(size_of::<u32>())
+ .unwrap();
+ sparse.extend(iter::repeat(0).take(zeros));
+
+ // Now write the length prefix.
+ bytes::NE::write_u32(
+                    // Will never fail since u32::MAX is an invalid pattern ID.
+ // Thus, the number of pattern IDs is representable by a
+ // u32.
+ plen.try_into().expect("pattern ID count fits in u32"),
+ &mut sparse[pos..],
+ );
+ pos += size_of::<u32>();
+
+ // Now write the pattern IDs.
+ for &pid in dfa.pattern_id_slice(state.id()) {
+ pos += bytes::write_pattern_id::<bytes::NE>(
+ pid,
+ &mut sparse[pos..],
+ );
+ }
+ }
+
+ // And now add the accelerator, if one exists. An accelerator is
+ // at most 4 bytes and at least 1 byte. The first byte is the
+ // length, N. N bytes follow the length. The set of bytes that
+ // follow correspond (exhaustively) to the bytes that must be seen
+ // to leave this state.
+ let accel = dfa.accelerator(state.id());
+ sparse.push(accel.len().try_into().unwrap());
+ sparse.extend_from_slice(accel);
+ }
+
+ let mut new = DFA {
+ trans: Transitions {
+ sparse,
+ classes: dfa.byte_classes().clone(),
+ count: dfa.state_count(),
+ patterns: dfa.pattern_count(),
+ },
+ starts: StartTable::from_dense_dfa(dfa, &remap)?,
+ special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
+ };
+ // And here's our second pass. Iterate over all of the dense states
+ // again, and update the transitions in each of the states in the
+ // sparse DFA.
+ for old_state in dfa.states() {
+ let new_id = remap[dfa.to_index(old_state.id())];
+ let mut new_state = new.trans.state_mut(new_id);
+ let sparse = old_state.sparse_transitions();
+ for (i, (_, _, next)) in sparse.enumerate() {
+ let next = remap[dfa.to_index(next)];
+ new_state.set_next_at(i, next);
+ }
+ }
+ trace!(
+ "created sparse DFA, memory usage: {} (dense memory usage: {})",
+ new.memory_usage(),
+ dfa.memory_usage(),
+ );
+ Ok(new)
+ }
+}
+
+impl<T: AsRef<[u8]>> DFA<T> {
+ /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
+ /// DFA returned always uses `&[u8]` for its transitions.
+ pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
+ DFA {
+ trans: self.trans.as_ref(),
+ starts: self.starts.as_ref(),
+ special: self.special,
+ }
+ }
+
+ /// Return an owned version of this sparse DFA. Specifically, the DFA
+ /// returned always uses `Vec<u8>` for its transitions.
+ ///
+ /// Effectively, this returns a sparse DFA whose transitions live on the
+ /// heap.
+ #[cfg(feature = "alloc")]
+ pub fn to_owned(&self) -> DFA<Vec<u8>> {
+ DFA {
+ trans: self.trans.to_owned(),
+ starts: self.starts.to_owned(),
+ special: self.special,
+ }
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent this DFA.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.trans.memory_usage() + self.starts.memory_usage()
+ }
+
+    /// Returns true if and only if this DFA has starting states for each
+    /// pattern.
+ ///
+ /// When a DFA has starting states for each pattern, then a search with the
+ /// DFA can be configured to only look for anchored matches of a specific
+ /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
+ /// can accept a non-None `pattern_id` if and only if this method returns
+ /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
+ ///
+ /// Note that if the DFA is empty, this always returns false.
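+    ///
+    /// # Example
+    ///
+    /// A sketch of building a sparse DFA with start states for each pattern
+    /// by configuring the dense DFA it is converted from:
+    ///
+    /// ```
+    /// use regex_automata::dfa::dense;
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().starts_for_each_pattern(true))
+    ///     .build("foo[0-9]+")?
+    ///     .to_sparse()?;
+    /// assert!(dfa.has_starts_for_each_pattern());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```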
+ pub fn has_starts_for_each_pattern(&self) -> bool {
+ self.starts.patterns > 0
+ }
+}
+
+/// Routines for converting a sparse DFA to other representations, such as raw
+/// bytes suitable for persistent storage.
+impl<T: AsRef<[u8]>> DFA<T> {
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_little_endian would work on a little endian target.
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_little_endian(&self) -> Vec<u8> {
+ self.to_bytes::<bytes::LE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_big_endian would work on a big endian target.
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_big_endian(&self) -> Vec<u8> {
+ self.to_bytes::<bytes::BE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the target you're compiling the DFA for matches the
+    /// endianness of the target on which you're compiling the DFA. For example,
+ /// if serialization and deserialization happen in the same process or on
+ /// the same machine. Otherwise, when serializing a DFA for use in a
+ /// portable environment, you'll almost certainly want to serialize _both_
+ /// a little endian and a big endian version and then load the correct one
+ /// based on the target's configuration.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_native_endian(&self) -> Vec<u8> {
+ self.to_bytes::<bytes::NE>()
+ }
+
+ /// The implementation of the public `to_bytes` serialization methods,
+ /// which is generic over endianness.
+ #[cfg(feature = "alloc")]
+ fn to_bytes<E: Endian>(&self) -> Vec<u8> {
+ let mut buf = vec![0; self.write_to_len()];
+ // This should always succeed since the only possible serialization
+ // error is providing a buffer that's too small, but we've ensured that
+ // `buf` is big enough here.
+ self.write_to::<E>(&mut buf).unwrap();
+ buf
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in little endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_little_endian would work on a little endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_little_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.write_to::<bytes::LE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in big endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_big_endian would work on a big endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_big_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.write_to::<bytes::BE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in native endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the target you're compiling the DFA for matches the
+ /// endianness of the target on which you're compiling the DFA. For example,
+ /// if serialization and deserialization happen in the same process or on
+ /// the same machine. Otherwise, when serializing a DFA for use in a
+ /// portable environment, you'll almost certainly want to serialize _both_
+ /// a little endian and a big endian version and then load the correct one
+ /// based on the target's configuration.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_native_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.write_to::<bytes::NE>(dst)
+ }
+
+ /// The implementation of the public `write_to` serialization methods,
+ /// which is generic over endianness.
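+ ///
+ /// The serialized form, in order, is: the label, an endianness check
+ /// word, the format version, an unused u32 reserved for future
+ /// flexibility, the transition table, the start state table and
+ /// finally the special state information. `from_bytes_unchecked` reads
+ /// the components back in the same order.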
+ fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let mut nw = 0;
+ nw += bytes::write_label(LABEL, &mut dst[nw..])?;
+ nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
+ nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
+ nw += {
+ // Currently unused, intended for future flexibility
+ E::write_u32(0, &mut dst[nw..]);
+ size_of::<u32>()
+ };
+ nw += self.trans.write_to::<E>(&mut dst[nw..])?;
+ nw += self.starts.write_to::<E>(&mut dst[nw..])?;
+ nw += self.special.write_to::<E>(&mut dst[nw..])?;
+ Ok(nw)
+ }
+
+ /// Return the total number of bytes required to serialize this DFA.
+ ///
+ /// This is useful for determining the size of the buffer required to pass
+ /// to one of the serialization routines:
+ ///
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// Passing a buffer smaller than the size returned by this method will
+ /// result in a serialization error.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to dynamically allocate enough room to serialize
+ /// a sparse DFA.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let mut buf = vec![0; original_dfa.write_to_len()];
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_len(&self) -> usize {
+ bytes::write_label_len(LABEL)
+ + bytes::write_endianness_check_len()
+ + bytes::write_version_len()
+ + size_of::<u32>() // unused, intended for future flexibility
+ + self.trans.write_to_len()
+ + self.starts.write_to_len()
+ + self.special.write_to_len()
+ }
+}
+
+impl<'a> DFA<&'a [u8]> {
+ /// Safely deserialize a sparse DFA. Upon success, this returns both the
+ /// deserialized DFA and the number of bytes read from the given slice.
+ /// Namely, the contents of the slice beyond the DFA are not read.
+ ///
+ /// Deserializing a DFA using this routine will never allocate heap memory.
+ /// For safety purposes, the DFA's transitions will be verified such that
+ /// every transition points to a valid state. If this verification is too
+ /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
+ /// will always execute in constant time.
+ ///
+ /// The bytes given must be generated by one of the serialization APIs
+ /// of a `DFA` using a semver compatible release of this crate. Those
+ /// include:
+ ///
+ /// * [`DFA::to_bytes_little_endian`]
+ /// * [`DFA::to_bytes_big_endian`]
+ /// * [`DFA::to_bytes_native_endian`]
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
+ /// `write_to` methods do not allocate and write to an existing slice
+ /// (which may be on the stack). Since deserialization always uses the
+ /// native endianness of the target platform, the serialization API you use
+ /// should match the endianness of the target platform. (It's often a good
+ /// idea to generate serialized DFAs for both forms of endianness and then
+ /// load the correct one based on the target's endianness.)
+ ///
+ /// # Errors
+ ///
+ /// Generally speaking, it's easier to state the conditions in which an
+ /// error is _not_ returned. All of the following must be true:
+ ///
+ /// * The bytes given must be produced by one of the serialization APIs
+ /// on this DFA, as mentioned above.
+ /// * The endianness of the target platform matches the endianness used to
+ /// serialize the provided DFA.
+ ///
+ /// If any of the above are not true, then an error will be returned.
+ ///
+ /// Note that unlike deserializing a
+ /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
+ /// no alignment requirements. That is, an alignment of `1` is valid.
+ ///
+ /// # Panics
+ ///
+ /// This routine will never panic for any input.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize a DFA to raw bytes, deserialize it
+ /// and then use it for searching.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let bytes = initial.to_bytes_native_endian();
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: loading a DFA from static memory
+ ///
+ /// One use case this library supports is the ability to serialize a
+ /// DFA to disk and then use `include_bytes!` to store it in a compiled
+ /// Rust program. Those bytes can then be cheaply deserialized into a
+ /// `DFA` structure at runtime and used for searching without having to
+ /// re-compile the DFA (which can be quite costly).
+ ///
+ /// We can show this in two parts. The first part is serializing the DFA to
+ /// a file:
+ ///
+ /// ```no_run
+ /// use regex_automata::dfa::{Automaton, sparse::DFA};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Write a big endian serialized version of this DFA to a file.
+ /// let bytes = dfa.to_bytes_big_endian();
+ /// std::fs::write("foo.bigendian.dfa", &bytes)?;
+ ///
+ /// // Do it again, but this time for little endian.
+ /// let bytes = dfa.to_bytes_little_endian();
+ /// std::fs::write("foo.littleendian.dfa", &bytes)?;
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And now the second part is embedding the DFA into the compiled program
+ /// and deserializing it at runtime on first use. We use conditional
+ /// compilation to choose the correct endianness. As mentioned above, we
+ /// do not need to employ any special tricks to ensure a proper alignment,
+ /// since a sparse DFA has no alignment requirements.
+ ///
+ /// ```no_run
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse},
+ /// HalfMatch,
+ /// };
+ ///
+ /// type DFA = sparse::DFA<&'static [u8]>;
+ ///
+ /// fn get_foo() -> &'static DFA {
+ /// use std::cell::Cell;
+ /// use std::mem::MaybeUninit;
+ /// use std::sync::Once;
+ ///
+ /// # const _: &str = stringify! {
+ /// #[cfg(target_endian = "big")]
+ /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
+ /// #[cfg(target_endian = "little")]
+ /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
+ /// # };
+ /// # static BYTES: &[u8] = b"";
+ ///
+ /// struct Lazy(Cell<MaybeUninit<DFA>>);
+ /// // SAFETY: This is safe because DFA impls Sync.
+ /// unsafe impl Sync for Lazy {}
+ ///
+ /// static INIT: Once = Once::new();
+ /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
+ ///
+ /// INIT.call_once(|| {
+ /// let (dfa, _) = DFA::from_bytes(BYTES)
+ /// .expect("serialized DFA should be valid");
+ /// // SAFETY: This is guaranteed to only execute once, and all
+ /// // we do with the pointer is write the DFA to it.
+ /// unsafe {
+ /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
+ /// }
+ /// });
+ /// // SAFETY: DFA is guaranteed to be initialized via INIT and is
+ /// // stored in static memory.
+ /// unsafe {
+ /// let dfa = (*DFA.0.as_ptr()).as_ptr();
+ /// std::mem::transmute::<*const DFA, &'static DFA>(dfa)
+ /// }
+ /// }
+ ///
+ /// let dfa = get_foo();
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
+ /// ```
+ ///
+ /// Alternatively, consider using
+ /// [`lazy_static`](https://crates.io/crates/lazy_static)
+ /// or
+ /// [`once_cell`](https://crates.io/crates/once_cell),
+ /// which will guarantee safety for you.
+ pub fn from_bytes(
+ slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
+ // SAFETY: This is safe because we validate both the sparse transitions
+ // (by trying to decode every state) and start state ID list below. If
+ // either validation fails, then we return an error.
+ let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
+ dfa.trans.validate()?;
+ dfa.starts.validate(&dfa.trans)?;
+ // N.B. dfa.special doesn't have a way to do unchecked deserialization,
+ // so it has already been validated.
+ Ok((dfa, nread))
+ }
+
+ /// Deserialize a DFA in constant time by omitting the verification of
+ /// the validity of the sparse transitions.
+ ///
+ /// This is just like [`DFA::from_bytes`], except it can potentially return
+ /// a DFA that exhibits undefined behavior if its transitions contain
+ /// invalid state identifiers.
+ ///
+ /// This routine is useful if you need to deserialize a DFA cheaply and
+ /// cannot afford the transition validation performed by `from_bytes`.
+ ///
+ /// # Safety
+ ///
+ /// This routine is unsafe because it permits callers to provide
+ /// arbitrary transitions with possibly incorrect state identifiers. While
+ /// the various serialization routines will never return an incorrect
+ /// DFA, there is no guarantee that the bytes provided here
+ /// are correct. While `from_bytes_unchecked` will still do several forms
+ /// of basic validation, this routine does not check that the transitions
+ /// themselves are correct. Given an incorrect transition table, it is
+ /// possible for the search routines to access out-of-bounds memory because
+ /// of explicit bounds check elision.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let bytes = initial.to_bytes_native_endian();
+ /// // SAFETY: This is guaranteed to be safe since the bytes given come
+ /// // directly from a compatible serialization routine.
+ /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub unsafe fn from_bytes_unchecked(
+ slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
+ let mut nr = 0;
+
+ nr += bytes::read_label(&slice[nr..], LABEL)?;
+ nr += bytes::read_endianness_check(&slice[nr..])?;
+ nr += bytes::read_version(&slice[nr..], VERSION)?;
+
+ let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
+ nr += size_of::<u32>();
+
+ let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
+ nr += nread;
+
+ let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
+ nr += nread;
+
+ let (special, nread) = Special::from_bytes(&slice[nr..])?;
+ nr += nread;
+ if special.max.as_usize() >= trans.sparse().len() {
+ return Err(DeserializeError::generic(
+ "max should not be greater than or equal to sparse bytes",
+ ));
+ }
+
+ Ok((DFA { trans, starts, special }, nr))
+ }
+}
+
+impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "sparse::DFA(")?;
+ for state in self.trans.states() {
+ fmt_state_indicator(f, self, state.id())?;
+ writeln!(f, "{:06?}: {:?}", state.id(), state)?;
+ }
+ writeln!(f, "")?;
+ for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() {
+ if i % self.starts.stride == 0 {
+ match pid {
+ None => writeln!(f, "START-GROUP(ALL)")?,
+ Some(pid) => {
+ writeln!(f, "START_GROUP(pattern: {:?})", pid)?
+ }
+ }
+ }
+ writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?;
+ }
+ writeln!(f, "state count: {:?}", self.trans.count)?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
+ #[inline]
+ fn is_special_state(&self, id: StateID) -> bool {
+ self.special.is_special_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: StateID) -> bool {
+ self.special.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_quit_state(&self, id: StateID) -> bool {
+ self.special.is_quit_state(id)
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: StateID) -> bool {
+ self.special.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_start_state(&self, id: StateID) -> bool {
+ self.special.is_start_state(id)
+ }
+
+ #[inline]
+ fn is_accel_state(&self, id: StateID) -> bool {
+ self.special.is_accel_state(id)
+ }
+
+ // This is marked as inline to help dramatically boost sparse searching,
+ // which decodes each state it enters to follow the next transition.
+ #[inline(always)]
+ fn next_state(&self, current: StateID, input: u8) -> StateID {
+ let input = self.trans.classes.get(input);
+ self.trans.state(current).next(input)
+ }
+
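+ // Following a transition in a sparse DFA requires decoding the state,
+ // which is done with bounds-checked slicing anyway, so there is no
+ // separate unchecked fast path: this simply defers to `next_state`.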
+ #[inline]
+ unsafe fn next_state_unchecked(
+ &self,
+ current: StateID,
+ input: u8,
+ ) -> StateID {
+ self.next_state(current, input)
+ }
+
+ #[inline]
+ fn next_eoi_state(&self, current: StateID) -> StateID {
+ self.trans.state(current).next_eoi()
+ }
+
+ #[inline]
+ fn pattern_count(&self) -> usize {
+ self.trans.patterns
+ }
+
+ #[inline]
+ fn match_count(&self, id: StateID) -> usize {
+ self.trans.state(id).pattern_count()
+ }
+
+ #[inline]
+ fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
+ // This is an optimization for the very common case of a DFA with a
+ // single pattern. This conditional avoids a somewhat more costly path
+ // that finds the pattern ID from the state machine, which requires
+ // a bit of slicing/pointer-chasing. This optimization tends to only
+ // matter when matches are frequent.
+ if self.trans.patterns == 1 {
+ return PatternID::ZERO;
+ }
+ self.trans.state(id).pattern_id(match_index)
+ }
+
+ #[inline]
+ fn start_state_forward(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> StateID {
+ let index = Start::from_position_fwd(bytes, start, end);
+ self.starts.start(index, pattern_id)
+ }
+
+ #[inline]
+ fn start_state_reverse(
+ &self,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> StateID {
+ let index = Start::from_position_rev(bytes, start, end);
+ self.starts.start(index, pattern_id)
+ }
+
+ #[inline]
+ fn accelerator(&self, id: StateID) -> &[u8] {
+ self.trans.state(id).accelerator()
+ }
+}
+
+/// The transition table portion of a sparse DFA.
+///
+/// The transition table is the core part of the DFA in that it describes how
+/// to move from one state to another based on the input sequence observed.
+///
+/// Unlike a typical dense table based DFA, states in a sparse transition
+/// table have variable size. That is, states with more transitions use more
+/// space than states with fewer transitions. This means that finding the next
+/// transition takes more work than with a dense DFA, but also typically uses
+/// much less space.
+#[derive(Clone)]
+struct Transitions<T> {
+ /// The raw encoding of each state in this DFA.
+ ///
+ /// Each state has the following information:
+ ///
+ /// * A set of transitions to subsequent states. Transitions to the dead
+ /// state are omitted.
+ /// * If the state can be accelerated, then any additional accelerator
+ /// information.
+ /// * If the state is a match state, then the state contains all pattern
+ /// IDs that match when in that state.
+ ///
+ /// To decode a state, use Transitions::state.
+ ///
+ /// In practice, T is either Vec<u8> or &[u8].
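+ ///
+ /// The encoding of a single state, in the order it appears in
+ /// `sparse` (see `Transitions::try_state` for the decoding logic), is:
+ ///
+ /// * A u16 giving the number of transitions, with its high bit set if
+ /// and only if this is a match state.
+ /// * `ntrans` pairs of bytes, each giving the inclusive byte range of
+ /// one transition.
+ /// * `ntrans` next state IDs, each occupying `StateID::SIZE` bytes in
+ /// native endian format.
+ /// * For match states only: a u32 count of pattern IDs, followed by
+ /// that many 32-bit native endian pattern IDs.
+ /// * One byte giving the accelerator length (at most 3), followed by
+ /// that many accelerator bytes.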
+ sparse: T,
+ /// A set of equivalence classes, where a single equivalence class
+ /// represents a set of bytes that never discriminate between a match
+ /// and a non-match in the DFA. Each equivalence class corresponds to a
+ /// single character in this DFA's alphabet, where the maximum number of
+ /// characters is 257 (each possible value of a byte plus the special
+ /// EOI transition). Consequently, the number of equivalence classes
+ /// corresponds to the maximum number of transitions a DFA state can
+ /// have. Note though that, unlike in a dense DFA, a sparse DFA has no
+ /// fixed stride: each state stores only the transitions it actually
+ /// has, so the space used varies from state to state.
+ ///
+ /// The only time the number of equivalence classes is fewer than 257 is
+ /// when byte classes are enabled, which is the default. Equivalence
+ /// classes should generally only be disabled when debugging, so that
+ /// the transitions themselves aren't obscured. Disabling them has no
+ /// other benefit, since the equivalence class map is always used while
+ /// searching. In the vast majority of cases, the number of equivalence
+ /// classes is substantially smaller than 257, particularly when large
+ /// Unicode classes aren't used.
+ ///
+ /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
+ /// in the current implementation, since equivalence classes generally tend
+ /// to correspond to contiguous ranges of bytes that map to the same
+ /// transition. So in a sparse DFA, equivalence classes don't really lead
+ /// to a space savings. In the future, it would be good to try and remove
+ /// them from sparse DFAs entirely, but that requires a bit of work since sparse
+ /// DFAs are built from dense DFAs, which are in turn built on top of
+ /// equivalence classes.
+ classes: ByteClasses,
+ /// The total number of states in this DFA. Note that a DFA always has at
+ /// least one state---the dead state---even for the empty DFA. In particular,
+ /// the dead state always has ID 0 and is correspondingly always the first
+ /// state. The dead state is never a match state.
+ count: usize,
+ /// The total number of unique patterns represented by these match states.
+ patterns: usize,
+}
+
+impl<'a> Transitions<&'a [u8]> {
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ let (state_count, nr) =
+ bytes::try_read_u32_as_usize(&slice, "state count")?;
+ slice = &slice[nr..];
+
+ let (pattern_count, nr) =
+ bytes::try_read_u32_as_usize(&slice, "pattern count")?;
+ slice = &slice[nr..];
+
+ let (classes, nr) = ByteClasses::from_bytes(&slice)?;
+ slice = &slice[nr..];
+
+ let (len, nr) =
+ bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?;
+ slice = &slice[nr..];
+
+ bytes::check_slice_len(slice, len, "sparse states byte length")?;
+ let sparse = &slice[..len];
+ slice = &slice[len..];
+
+ let trans = Transitions {
+ sparse,
+ classes,
+ count: state_count,
+ patterns: pattern_count,
+ };
+ Ok((trans, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+impl<T: AsRef<[u8]>> Transitions<T> {
+ /// Writes a serialized form of this transition table to the buffer given.
+ /// If the buffer is too small, then an error is returned. To determine
+ /// how big the buffer must be, use `write_to_len`.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small(
+ "sparse transition table",
+ ));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write state count
+ E::write_u32(u32::try_from(self.count).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write pattern count
+ E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write byte class map
+ let n = self.classes.write_to(dst)?;
+ dst = &mut dst[n..];
+
+ // write number of bytes in sparse transitions
+ E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write actual transitions
+ dst.copy_from_slice(self.sparse());
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of this transition
+ /// table will use.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // state count
+ + size_of::<u32>() // pattern count
+ + self.classes.write_to_len()
+ + size_of::<u32>() // sparse transitions length
+ + self.sparse().len()
+ }
+
+ /// Validates that every state ID in this transition table is valid.
+ ///
+ /// That is, every state ID can be used to correctly index a state in this
+ /// table.
+ fn validate(&self) -> Result<(), DeserializeError> {
+ // In order to validate everything, we not only need to make sure we
+ // can decode every state, but that every transition in every state
+ // points to a valid state. There are many duplicative transitions, so
+ // we record state IDs that we've verified so that we don't redo the
+ // decoding work.
+ //
+ // Except, when in no_std mode, we don't have dynamic memory allocation
+ // available to us, so we skip this optimization. It's not clear
+ // whether doing something more clever is worth it just yet. If you're
+ // profiling this code and need it to run faster, please file an issue.
+ //
+ // ---AG
+ struct Seen {
+ #[cfg(feature = "alloc")]
+ set: BTreeSet<StateID>,
+ #[cfg(not(feature = "alloc"))]
+ set: core::marker::PhantomData<StateID>,
+ }
+
+ #[cfg(feature = "alloc")]
+ impl Seen {
+ fn new() -> Seen {
+ Seen { set: BTreeSet::new() }
+ }
+ fn insert(&mut self, id: StateID) {
+ self.set.insert(id);
+ }
+ fn contains(&self, id: &StateID) -> bool {
+ self.set.contains(id)
+ }
+ }
+
+ #[cfg(not(feature = "alloc"))]
+ impl Seen {
+ fn new() -> Seen {
+ Seen { set: core::marker::PhantomData }
+ }
+ fn insert(&mut self, _id: StateID) {}
+ fn contains(&self, _id: &StateID) -> bool {
+ false
+ }
+ }
+
+ let mut verified: Seen = Seen::new();
+ // We need to make sure that we decode the correct number of states.
+ // Otherwise, an empty set of transitions would validate even if the
+ // recorded state count is non-empty.
+ let mut count = 0;
+ // We can't use the self.states() iterator because it assumes the state
+ // encodings are valid. It could panic if they aren't.
+ let mut id = DEAD;
+ while id.as_usize() < self.sparse().len() {
+ let state = self.try_state(id)?;
+ verified.insert(id);
+ // The next ID should be the offset immediately following `state`.
+ id = StateID::new(bytes::add(
+ id.as_usize(),
+ state.bytes_len(),
+ "next state ID offset",
+ )?)
+ .map_err(|err| {
+ DeserializeError::state_id_error(err, "next state ID offset")
+ })?;
+ count += 1;
+
+ // Now check that all transitions in this state are correct.
+ for i in 0..state.ntrans {
+ let to = state.next_at(i);
+ if verified.contains(&to) {
+ continue;
+ }
+ let _ = self.try_state(to)?;
+ // Record the destination as verified so we don't decode it again.
+ verified.insert(to);
+ }
+ }
+ if count != self.count {
+ return Err(DeserializeError::generic(
+ "mismatching sparse state count",
+ ));
+ }
+ Ok(())
+ }
+
+ /// Converts these transitions to a borrowed value.
+ fn as_ref(&self) -> Transitions<&'_ [u8]> {
+ Transitions {
+ sparse: self.sparse(),
+ classes: self.classes.clone(),
+ count: self.count,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Converts these transitions to an owned value.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> Transitions<Vec<u8>> {
+ Transitions {
+ sparse: self.sparse().to_vec(),
+ classes: self.classes.clone(),
+ count: self.count,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Return a convenient representation of the given state.
+ ///
+ /// This panics if the state is invalid.
+ ///
+ /// This is marked as inline to help dramatically boost sparse searching,
+ /// which decodes each state it enters to follow the next transition. Other
+ /// functions involved are also inlined, which should hopefully eliminate
+ /// a lot of the extraneous decoding that is never needed just to follow
+ /// the next transition.
+ #[inline(always)]
+ fn state(&self, id: StateID) -> State<'_> {
+ let mut state = &self.sparse()[id.as_usize()..];
+ let mut ntrans = bytes::read_u16(&state) as usize;
+ let is_match = (1 << 15) & ntrans != 0;
+ ntrans &= !(1 << 15);
+ state = &state[2..];
+
+ let (input_ranges, state) = state.split_at(ntrans * 2);
+ let (next, state) = state.split_at(ntrans * StateID::SIZE);
+ let (pattern_ids, state) = if is_match {
+ let npats = bytes::read_u32(&state) as usize;
+ state[4..].split_at(npats * 4)
+ } else {
+ (&[][..], state)
+ };
+
+ let accel_len = state[0] as usize;
+ let accel = &state[1..accel_len + 1];
+ State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
+ }
+
+ /// Like `state`, but will return an error if the state encoding is
+ /// invalid. This is useful for verifying states after deserialization,
+ /// which is required for a safe deserialization API.
+ ///
+ /// Note that this only verifies that this state is decodable and that
+ /// all of its data is consistent. It does not verify that its state ID
+ /// transitions point to valid states themselves, nor does it verify that
+ /// every pattern ID is valid.
+ fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> {
+ if id.as_usize() > self.sparse().len() {
+ return Err(DeserializeError::generic("invalid sparse state ID"));
+ }
+ let mut state = &self.sparse()[id.as_usize()..];
+ // Encoding format starts with a u16 that stores the total number of
+ // transitions in this state.
+ let (mut ntrans, _) =
+ bytes::try_read_u16_as_usize(state, "state transition count")?;
+ let is_match = ((1 << 15) & ntrans) != 0;
+ ntrans &= !(1 << 15);
+ state = &state[2..];
+ if ntrans > 257 || ntrans == 0 {
+ return Err(DeserializeError::generic("invalid transition count"));
+ }
+
+ // Each transition has two pieces: an inclusive range of bytes on which
+ // it is defined, and the state ID that those bytes transition to. The
+ // pairs come first, followed by a corresponding sequence of state IDs.
+ let input_ranges_len = ntrans.checked_mul(2).unwrap();
+ bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
+ let (input_ranges, state) = state.split_at(input_ranges_len);
+ // Every range should be of the form A-B, where A<=B.
+ for pair in input_ranges.chunks(2) {
+ let (start, end) = (pair[0], pair[1]);
+ if start > end {
+ return Err(DeserializeError::generic("invalid input range"));
+ }
+ }
+
+ // And now extract the corresponding sequence of state IDs. We leave
+ // this sequence as a &[u8] instead of a &[StateID] because sparse DFAs do
+ // not have any alignment requirements.
+ let next_len = ntrans
+ .checked_mul(self.id_len())
+ .expect("state size * #trans should always fit in a usize");
+ bytes::check_slice_len(state, next_len, "sparse trans state IDs")?;
+ let (next, state) = state.split_at(next_len);
+ // We can at least verify that every state ID is in bounds.
+ for idbytes in next.chunks(self.id_len()) {
+ let (id, _) =
+ bytes::read_state_id(idbytes, "sparse state ID in try_state")?;
+ bytes::check_slice_len(
+ self.sparse(),
+ id.as_usize(),
+ "invalid sparse state ID",
+ )?;
+ }
+
+ // If this is a match state, then read the pattern IDs for this state.
+ // Pattern IDs is a u32-length prefixed sequence of native endian
+ // encoded 32-bit integers.
+ let (pattern_ids, state) = if is_match {
+ let (npats, nr) =
+ bytes::try_read_u32_as_usize(state, "pattern ID count")?;
+ let state = &state[nr..];
+
+ let pattern_ids_len =
+ bytes::mul(npats, 4, "sparse pattern ID byte length")?;
+ bytes::check_slice_len(
+ state,
+ pattern_ids_len,
+ "sparse pattern IDs",
+ )?;
+ let (pattern_ids, state) = state.split_at(pattern_ids_len);
+ for patbytes in pattern_ids.chunks(PatternID::SIZE) {
+ bytes::read_pattern_id(
+ patbytes,
+ "sparse pattern ID in try_state",
+ )?;
+ }
+ (pattern_ids, state)
+ } else {
+ (&[][..], state)
+ };
+
+ // Now read this state's accelerator info. The first byte is the length
+ // of the accelerator, which is typically 0 (for no acceleration) but
+ // is no bigger than 3. The length indicates the number of bytes that
+ // follow, where each byte corresponds to a transition out of this
+ // state.
+ if state.is_empty() {
+ return Err(DeserializeError::generic("no accelerator length"));
+ }
+ let (accel_len, state) = (state[0] as usize, &state[1..]);
+
+ if accel_len > 3 {
+ return Err(DeserializeError::generic(
+ "sparse invalid accelerator length",
+ ));
+ }
+ bytes::check_slice_len(
+ state,
+ accel_len,
+ "sparse corrupt accelerator length",
+ )?;
+ let (accel, _) = (&state[..accel_len], &state[accel_len..]);
+
+ Ok(State {
+ id,
+ is_match,
+ ntrans,
+ input_ranges,
+ next,
+ pattern_ids,
+ accel,
+ })
+ }
+
+ /// Return an iterator over all of the states in this DFA.
+ ///
+ /// The iterator returned yields tuples, where the first element is the
+ /// state ID and the second element is the state itself.
+ fn states(&self) -> StateIter<'_, T> {
+ StateIter { trans: self, id: DEAD.as_usize() }
+ }
+
+ /// Returns the sparse transitions as raw bytes.
+ fn sparse(&self) -> &[u8] {
+ self.sparse.as_ref()
+ }
+
+ /// Returns the number of bytes represented by a single state ID.
+ fn id_len(&self) -> usize {
+ StateID::SIZE
+ }
+
+ /// Return the memory usage, in bytes, of these transitions.
+ ///
+ /// This does not include the size of a `Transitions` value itself.
+ fn memory_usage(&self) -> usize {
+ self.sparse().len()
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u8]>> Transitions<T> {
+ /// Return a convenient mutable representation of the given state.
+ /// This panics if the state is invalid.
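+ ///
+ /// This mirrors `Transitions::state`, except that the slices it
+ /// returns are mutable (apart from the pattern IDs).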
+ fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
+ let mut state = &mut self.sparse_mut()[id.as_usize()..];
+ let mut ntrans = bytes::read_u16(&state) as usize;
+ let is_match = (1 << 15) & ntrans != 0;
+ ntrans &= !(1 << 15);
+ state = &mut state[2..];
+
+ let (input_ranges, state) = state.split_at_mut(ntrans * 2);
+ let (next, state) = state.split_at_mut(ntrans * StateID::SIZE);
+ let (pattern_ids, state) = if is_match {
+ let npats = bytes::read_u32(&state) as usize;
+ state[4..].split_at_mut(npats * 4)
+ } else {
+ (&mut [][..], state)
+ };
+
+ let accel_len = state[0] as usize;
+ let accel = &mut state[1..accel_len + 1];
+ StateMut {
+ id,
+ is_match,
+ ntrans,
+ input_ranges,
+ next,
+ pattern_ids,
+ accel,
+ }
+ }
+
+ /// Returns the sparse transitions as raw mutable bytes.
+ fn sparse_mut(&mut self) -> &mut [u8] {
+ self.sparse.as_mut()
+ }
+}
+
+/// The set of all possible starting states in a DFA.
+///
+/// See the eponymous type in the `dense` module for more details. This type
+/// is very similar to `dense::StartTable`, except that its underlying
+ /// representation is `&[u8]` instead of `&[StateID]`. (The latter would require
+/// sparse DFAs to be aligned, which is explicitly something we do not require
+/// because we don't really need it.)
+#[derive(Clone)]
+struct StartTable<T> {
+ /// The initial start state IDs as a contiguous table of native endian
+ /// encoded `StateID` integers.
+ ///
+ /// In practice, T is either Vec<u8> or &[u8] and has no alignment
+ /// requirements.
+ ///
+ /// The first `stride` (currently always 4) entries always correspond to
+ /// the start states for the entire DFA. After that, there are
+ /// `stride * patterns` state IDs, where `patterns` may be zero in the
+ /// case of a DFA with no patterns or in the case where the DFA was built
+ /// without enabling starting states for each pattern.
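+ ///
+ /// For example, with the current stride of 4 and two patterns, the
+ /// table holds 12 state IDs: 4 for the DFA as a whole, followed by 4
+ /// for each of the two patterns.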
+ table: T,
+ /// The number of starting state IDs per pattern.
+ stride: usize,
+ /// The total number of patterns for which starting states are encoded.
+ /// This may be zero for non-empty DFAs when the DFA was built without
+ /// start states for each pattern.
+ patterns: usize,
+}
+
+#[cfg(feature = "alloc")]
+impl StartTable<Vec<u8>> {
+ fn new(patterns: usize) -> StartTable<Vec<u8>> {
+ let stride = Start::count();
+ // This is OK since the only way we're here is if a dense DFA could be
+ // constructed successfully, which uses the same space.
+ let len = stride
+ .checked_mul(patterns)
+ .unwrap()
+ .checked_add(stride)
+ .unwrap()
+ .checked_mul(StateID::SIZE)
+ .unwrap();
+ StartTable { table: vec![0; len], stride, patterns }
+ }
+
+ fn from_dense_dfa<T: AsRef<[u32]>>(
+ dfa: &dense::DFA<T>,
+ remap: &[StateID],
+ ) -> Result<StartTable<Vec<u8>>, Error> {
+ // Unless the DFA has start states compiled for each pattern, then
+ // as far as the starting state table is concerned, there are zero
+ // patterns to account for. It will instead only store starting states
+ // for the entire DFA.
+ let start_pattern_count = if dfa.has_starts_for_each_pattern() {
+ dfa.pattern_count()
+ } else {
+ 0
+ };
+ let mut sl = StartTable::new(start_pattern_count);
+ for (old_start_id, sty, pid) in dfa.starts() {
+ let new_start_id = remap[dfa.to_index(old_start_id)];
+ sl.set_start(sty, pid, new_start_id);
+ }
+ Ok(sl)
+ }
+}
+
+impl<'a> StartTable<&'a [u8]> {
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ let (stride, nr) =
+ bytes::try_read_u32_as_usize(slice, "sparse start table stride")?;
+ slice = &slice[nr..];
+
+ let (patterns, nr) = bytes::try_read_u32_as_usize(
+ slice,
+ "sparse start table patterns",
+ )?;
+ slice = &slice[nr..];
+
+ if stride != Start::count() {
+ return Err(DeserializeError::generic(
+ "invalid sparse starting table stride",
+ ));
+ }
+ if patterns > PatternID::LIMIT {
+ return Err(DeserializeError::generic(
+ "sparse invalid number of patterns",
+ ));
+ }
+ let pattern_table_size =
+ bytes::mul(stride, patterns, "sparse invalid pattern count")?;
+ // Our start table always begins with a single stride of start states
+ // for the entire automaton, which permit matching any pattern. What
+ // follows is an optional set of start states for each pattern.
+ let start_state_count = bytes::add(
+ stride,
+ pattern_table_size,
+ "sparse invalid 'any' pattern starts size",
+ )?;
+ let table_bytes_len = bytes::mul(
+ start_state_count,
+ StateID::SIZE,
+ "sparse pattern table bytes length",
+ )?;
+ bytes::check_slice_len(
+ slice,
+ table_bytes_len,
+ "sparse start ID table",
+ )?;
+ let table_bytes = &slice[..table_bytes_len];
+ slice = &slice[table_bytes_len..];
+
+ let sl = StartTable { table: table_bytes, stride, patterns };
+ Ok((sl, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+impl<T: AsRef<[u8]>> StartTable<T> {
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small(
+ "sparse starting table ids",
+ ));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write stride
+ E::write_u32(u32::try_from(self.stride).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+ // write pattern count
+ E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+ // write start IDs
+ dst.copy_from_slice(self.table());
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of this transition
+ /// table will use.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // stride
+ + size_of::<u32>() // # patterns
+ + self.table().len()
+ }
+
+ /// Validates that every starting state ID in this table is valid.
+ ///
+ /// That is, every starting state ID can be used to correctly decode a
+ /// state in the DFA's sparse transitions.
+ fn validate(
+ &self,
+ trans: &Transitions<T>,
+ ) -> Result<(), DeserializeError> {
+ for (id, _, _) in self.iter() {
+ let _ = trans.try_state(id)?;
+ }
+ Ok(())
+ }
+
+ /// Converts this start list to a borrowed value.
+ fn as_ref(&self) -> StartTable<&'_ [u8]> {
+ StartTable {
+ table: self.table(),
+ stride: self.stride,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Converts this start list to an owned value.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> StartTable<Vec<u8>> {
+ StartTable {
+ table: self.table().to_vec(),
+ stride: self.stride,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Return the start state for the given index and pattern ID. If the
+ /// pattern ID is None, then the corresponding start state for the entire
+ /// DFA is returned. If the pattern ID is not None, then the corresponding
+ /// starting state for the given pattern is returned. If this start table
+ /// does not have individual starting states for each pattern, then this
+ /// panics.
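+ ///
+ /// For example, with the current stride of 4, the slot for the start
+ /// state of pattern `pid` at start index `i` is
+ /// `stride * pid + stride + i`: slots 0..4 belong to the DFA as a
+ /// whole, pattern 0 occupies slots 4..8, pattern 1 occupies slots
+ /// 8..12, and so on.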
+ fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
+ let start_index = index.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => {
+ let pid = pid.as_usize();
+ assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+ self.stride
+ .checked_mul(pid)
+ .unwrap()
+ .checked_add(self.stride)
+ .unwrap()
+ .checked_add(start_index)
+ .unwrap()
+ }
+ };
+ let start = index * StateID::SIZE;
+ // This is OK since we're allowed to assume that the start table contains
+ // valid StateIDs.
+ bytes::read_state_id_unchecked(&self.table()[start..]).0
+ }
+
+ /// Return an iterator over all start IDs in this table.
+ fn iter(&self) -> StartStateIter<'_, T> {
+ StartStateIter { st: self, i: 0 }
+ }
+
+ /// Returns the total number of start state IDs in this table.
+ fn len(&self) -> usize {
+ self.table().len() / StateID::SIZE
+ }
+
+ /// Returns the table as a raw slice of bytes.
+ fn table(&self) -> &[u8] {
+ self.table.as_ref()
+ }
+
+ /// Return the memory usage, in bytes, of this start list.
+ ///
+ /// This does not include the size of a `StartTable` value itself.
+ fn memory_usage(&self) -> usize {
+ self.table().len()
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u8]>> StartTable<T> {
+ /// Set the start state for the given index and pattern.
+ ///
+ /// If the pattern ID or state ID are not valid, then this will panic.
+ fn set_start(
+ &mut self,
+ index: Start,
+ pattern_id: Option<PatternID>,
+ id: StateID,
+ ) {
+ let start_index = index.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => {
+ let pid = pid.as_usize();
+ assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+ self.stride
+ .checked_mul(pid)
+ .unwrap()
+ .checked_add(self.stride)
+ .unwrap()
+ .checked_add(start_index)
+ .unwrap()
+ }
+ };
+ let start = index * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ bytes::write_state_id::<bytes::NE>(
+ id,
+ &mut self.table.as_mut()[start..end],
+ );
+ }
+}
+
+/// An iterator over all start state IDs in a sparse DFA.
+struct StartStateIter<'a, T> {
+ st: &'a StartTable<T>,
+ i: usize,
+}
+
+impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
+ type Item = (StateID, Start, Option<PatternID>);
+
+ fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
+ let i = self.i;
+ if i >= self.st.len() {
+ return None;
+ }
+ self.i += 1;
+
+ // This unwrap is okay since the stride of any DFA must always match
+ // the number of start state types.
+ let start_type = Start::from_usize(i % self.st.stride).unwrap();
+ let pid = if i < self.st.stride {
+ // This means we don't have start states for each pattern.
+ None
+ } else {
+ // These unwraps are OK since we may assume our table and stride
+ // are correct.
+ let pid = i
+ .checked_sub(self.st.stride)
+ .unwrap()
+ .checked_div(self.st.stride)
+ .unwrap();
+ Some(PatternID::new(pid).unwrap())
+ };
+ let start = i * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ let bytes = self.st.table()[start..end].try_into().unwrap();
+ // This is OK since we're allowed to assume that any IDs in this start
+ // table are correct and valid for this DFA.
+ let id = StateID::from_ne_bytes_unchecked(bytes);
+ Some((id, start_type, pid))
+ }
+}
+
+impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("StartStateIter").field("i", &self.i).finish()
+ }
+}
+
+/// An iterator over all states in a sparse DFA.
+///
+/// This iterator yields tuples, where the first element is the state ID and
+/// the second element is the state itself.
+struct StateIter<'a, T> {
+ trans: &'a Transitions<T>,
+ id: usize,
+}
+
+impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
+ type Item = State<'a>;
+
+ fn next(&mut self) -> Option<State<'a>> {
+ if self.id >= self.trans.sparse().len() {
+ return None;
+ }
+ let state = self.trans.state(StateID::new_unchecked(self.id));
+ self.id = self.id + state.bytes_len();
+ Some(state)
+ }
+}
+
+impl<'a, T> fmt::Debug for StateIter<'a, T> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("StateIter").field("id", &self.id).finish()
+ }
+}
+
+/// A representation of a sparse DFA state that can be cheaply materialized
+/// from a state identifier.
+#[derive(Clone)]
+struct State<'a> {
+ /// The identifier of this state.
+ id: StateID,
+ /// Whether this is a match state or not.
+ is_match: bool,
+ /// The number of transitions in this state.
+ ntrans: usize,
+ /// Pairs of input ranges, where there is one pair for each transition.
+ /// Each pair specifies an inclusive start and end byte range for the
+ /// corresponding transition.
+ input_ranges: &'a [u8],
+ /// Transitions to the next state. This slice contains native endian
+ /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
+ /// bytes in this slice.
+ next: &'a [u8],
+ /// If this is a match state, then this contains the pattern IDs that match
+ /// when the DFA is in this state.
+ ///
+ /// This is a contiguous sequence of 32-bit native endian encoded integers.
+ pattern_ids: &'a [u8],
+ /// An accelerator for this state, if present. If this state has no
+ /// accelerator, then this is an empty slice. When non-empty, this slice
+ /// has length at most 3 and corresponds to the exhaustive set of bytes
+ /// that must be seen in order to transition out of this state.
+ accel: &'a [u8],
+}
+
+impl<'a> State<'a> {
+ /// Searches for the next transition given an input byte. If no such
+ /// transition could be found, then a dead state is returned.
+ ///
+ /// This is marked as inline to help dramatically boost sparse searching,
+ /// which decodes each state it enters to follow the next transition.
+ #[inline(always)]
+ fn next(&self, input: u8) -> StateID {
+ // This straight linear search was observed to be much better than
+ // binary search on ASCII haystacks, likely because a binary search
+ // visits the ASCII case last but a linear search sees it first. A
+ // binary search does do a little better on non-ASCII haystacks, but
+ // not by much. There might be a better trade off lurking here.
+ for i in 0..(self.ntrans - 1) {
+ let (start, end) = self.range(i);
+ if start <= input && input <= end {
+ return self.next_at(i);
+ }
+ // We could bail early with an extra branch: if input < b1, then
+ // we know we'll never find a matching transition. Interestingly,
+ // this extra branch seems to not help performance, or will even
+ // hurt it. It's likely very dependent on the DFA itself and what
+ // is being searched.
+ }
+ DEAD
+ }
+
+ /// Returns the next state ID for the special EOI transition.
+ fn next_eoi(&self) -> StateID {
+ self.next_at(self.ntrans - 1)
+ }
+
+ /// Returns the identifier for this state.
+ fn id(&self) -> StateID {
+ self.id
+ }
+
+ /// Returns the inclusive input byte range for the ith transition in this
+ /// state.
+ fn range(&self, i: usize) -> (u8, u8) {
+ (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
+ }
+
+ /// Returns the next state for the ith transition in this state.
+ fn next_at(&self, i: usize) -> StateID {
+ let start = i * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ let bytes = self.next[start..end].try_into().unwrap();
+ StateID::from_ne_bytes_unchecked(bytes)
+ }
+
+ /// Returns the pattern ID for the given match index. If the match index
+ /// is invalid, then this panics.
+ fn pattern_id(&self, match_index: usize) -> PatternID {
+ let start = match_index * PatternID::SIZE;
+ bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
+ }
+
+ /// Returns the total number of pattern IDs for this state. This is always
+ /// zero when `is_match` is false.
+ fn pattern_count(&self) -> usize {
+ assert_eq!(0, self.pattern_ids.len() % 4);
+ self.pattern_ids.len() / 4
+ }
+
+ /// Return the total number of bytes that this state consumes in its
+ /// encoded form.
+ fn bytes_len(&self) -> usize {
+ let mut len = 2
+ + (self.ntrans * 2)
+ + (self.ntrans * StateID::SIZE)
+ + (1 + self.accel.len());
+ if self.is_match {
+ len += size_of::<u32>() + self.pattern_ids.len();
+ }
+ len
+ }
+
+ /// Return an accelerator for this state.
+ fn accelerator(&self) -> &'a [u8] {
+ self.accel
+ }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let mut printed = false;
+ for i in 0..(self.ntrans - 1) {
+ let next = self.next_at(i);
+ if next == DEAD {
+ continue;
+ }
+
+ if printed {
+ write!(f, ", ")?;
+ }
+ let (start, end) = self.range(i);
+ if start == end {
+ write!(f, "{:?} => {:?}", DebugByte(start), next)?;
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next,
+ )?;
+ }
+ printed = true;
+ }
+ let eoi = self.next_at(self.ntrans - 1);
+ if eoi != DEAD {
+ if printed {
+ write!(f, ", ")?;
+ }
+ write!(f, "EOI => {:?}", eoi)?;
+ }
+ Ok(())
+ }
+}
+
+/// A representation of a mutable sparse DFA state that can be cheaply
+/// materialized from a state identifier.
+#[cfg(feature = "alloc")]
+struct StateMut<'a> {
+ /// The identifier of this state.
+ id: StateID,
+ /// Whether this is a match state or not.
+ is_match: bool,
+ /// The number of transitions in this state.
+ ntrans: usize,
+ /// Pairs of input ranges, where there is one pair for each transition.
+ /// Each pair specifies an inclusive start and end byte range for the
+ /// corresponding transition.
+ input_ranges: &'a mut [u8],
+ /// Transitions to the next state. This slice contains native endian
+ /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
+ /// bytes in this slice.
+ next: &'a mut [u8],
+ /// If this is a match state, then this contains the pattern IDs that match
+ /// when the DFA is in this state.
+ ///
+ /// This is a contiguous sequence of 32-bit native endian encoded integers.
+ pattern_ids: &'a [u8],
+ /// An accelerator for this state, if present. If this state has no
+ /// accelerator, then this is an empty slice. When non-empty, this slice
+ /// has length at most 3 and corresponds to the exhaustive set of bytes
+ /// that must be seen in order to transition out of this state.
+ accel: &'a mut [u8],
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> StateMut<'a> {
+ /// Sets the ith transition to the given state.
+ fn set_next_at(&mut self, i: usize, next: StateID) {
+ let start = i * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ bytes::write_state_id::<bytes::NE>(next, &mut self.next[start..end]);
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> fmt::Debug for StateMut<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let state = State {
+ id: self.id,
+ is_match: self.is_match,
+ ntrans: self.ntrans,
+ input_ranges: self.input_ranges,
+ next: self.next,
+ pattern_ids: self.pattern_ids,
+ accel: self.accel,
+ };
+ fmt::Debug::fmt(&state, f)
+ }
+}
+
+/// A binary search routine specialized to a sparse DFA state's
+/// transitions. Specifically, the transitions are defined as a set of pairs
+/// of input bytes that delineate an inclusive range of bytes. If the input
+/// byte is in the range, then the corresponding transition is a match.
+///
+/// This binary search accepts a slice of these pairs and returns the position
+/// of the matching pair (the ith transition), or None if no matching pair
+/// could be found.
+///
+/// Note that this routine is not currently used, since it was observed either
+/// to decrease performance when searching ASCII haystacks or to provide too
+/// little of a boost on non-ASCII haystacks to be worth it. However, we leave
+/// it here
+/// for posterity in case we can find a way to use it.
+///
+/// In theory, we could use the standard library's search routine if we could
+/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
+/// guaranteed to be safe, and is thus potentially UB (since I don't think the in-memory
+/// representation of `(u8, u8)` has been nailed down). One could define a
+/// repr(C) type, but the casting doesn't seem justified.
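+///
+/// For example, given the two encoded ranges `0-9` and `a-z`, i.e.,
+/// `ranges = &[b'0', b'9', b'a', b'z']`, a needle of `b'5'` returns
+/// `Some(0)` while a needle of `b'A'`, which falls between the two
+/// ranges, returns `None`.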
+#[allow(dead_code)]
+#[inline(always)]
+fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
+ debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
+ debug_assert!(ranges.len() <= 512, "ranges should be short");
+
+ let (mut left, mut right) = (0, ranges.len() / 2);
+ while left < right {
+ let mid = (left + right) / 2;
+ let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
+ if needle < b1 {
+ right = mid;
+ } else if needle > b2 {
+ left = mid + 1;
+ } else {
+ return Some(mid);
+ }
+ }
+ None
+}
diff --git a/vendor/regex-automata/src/dfa/special.rs b/vendor/regex-automata/src/dfa/special.rs
new file mode 100644
index 000000000..3db95a707
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/special.rs
@@ -0,0 +1,477 @@
+use crate::{
+ dfa::DEAD,
+ util::{
+ bytes::{self, DeserializeError, Endian, SerializeError},
+ id::StateID,
+ },
+};
+
+macro_rules! err {
+ ($msg:expr) => {
+ return Err(DeserializeError::generic($msg));
+ };
+}
+
+// Special represents the identifiers in a DFA that correspond to "special"
+// states. If a state is one or more of the following, then it is considered
+// special:
+//
+// * dead - A non-matching state where all outgoing transitions lead back to
+// itself. There is only one of these, regardless of whether minimization
+// has run. The dead state always has an ID of 0. i.e., It is always the
+// first state in a DFA.
+// * quit - A state that is entered whenever a byte is seen that should cause
+// a DFA to give up and stop searching. This results in a MatchError::Quit
+// error being returned at search time. The default configuration for a DFA
+// has no quit bytes, which means this state is unreachable by default,
+// although it is always present for reasons of implementation simplicity.
+// This state is only reachable when the caller configures the DFA to quit
+// on certain bytes. There is always exactly one of these states and it
+// is always the second state. (Its actual ID depends on the size of the
+// alphabet in dense DFAs, since state IDs are premultiplied in order to
+// allow them to be used directly as indices into the transition table.)
+// * match - An accepting state, i.e., indicative of a match. There may be
+// zero or more of these states.
+// * accelerated - A state where all of its outgoing transitions, except a
+// few, loop back to itself. These states are candidates for acceleration
+// via memchr during search. There may be zero or more of these states.
+// * start - A non-matching state that indicates where the automaton should
+// start during a search. There is always at least one starting state and
+// all are guaranteed to be non-match states. (A start state cannot be a
+// match state because the DFAs in this crate delay all matches by one byte.
+// So every search that finds a match must move through one transition to
+// some other match state, even when searching an empty string.)
+//
+// These are not mutually exclusive categories. Namely, the following
+// overlaps can occur:
+//
+// * {dead, start} - If a DFA can never lead to a match and it is minimized,
+// then it will typically compile to something where all starting IDs point
+// to the DFA's dead state.
+// * {match, accelerated} - It is possible for a match state to have the
+// majority of its transitions loop back to itself, which means it's
+// possible for a match state to be accelerated.
+// * {start, accelerated} - Similarly, it is possible for a start state to be
+// accelerated. Note that it is possible for an accelerated state to be
+//   neither a match nor a start state. Also note that just because both match
+// and start states overlap with accelerated states does not mean that
+// match and start states overlap with each other. In fact, they are
+// guaranteed not to overlap.
+//
+// As a special mention, every DFA always has a dead and a quit state, even
+// though from the perspective of the DFA, they are equivalent. (Indeed,
+// minimization special cases them to ensure they don't get merged.) The
+// purpose of keeping them distinct is to use the quit state as a sentinel to
+// distinguish between whether a search finished successfully without finding
+// anything or whether it gave up before finishing.
+//
+// So the main problem we want to solve here is the *fast* detection of whether
+// a state is special or not. And we also want to do this while storing as
+// little extra data as possible. AND we want to be able to quickly determine
+// which categories a state falls into above if it is special.
+//
+// We achieve this by essentially shuffling all special states to the beginning
+// of a DFA. That is, all special states appear before every other non-special
+// state. By representing special states this way, we can determine whether a
+// state is special or not by a single comparison, where special.max is the
+// identifier of the last special state in the DFA:
+//
+// if current_state <= special.max:
+// ... do something with special state
+//
+// The only thing left to do is to determine what kind of special state
+// it is. Because what we do next depends on that. Since special states
+// are typically rare, we can afford to do a bit more extra work, but we'd
+// still like this to be as fast as possible. The trick we employ here is to
+// continue shuffling states even within the special state range, such that
+// one contiguous region corresponds to match states, another to start states
+// and an overlapping range to accelerated states. At a high level, our
+// special state detection might look like this (for leftmost searching, where
+// we continue searching even after seeing a match):
+//
+// byte = input[offset]
+// current_state = next_state(current_state, byte)
+// offset += 1
+// if current_state <= special.max:
+// if current_state == 0:
+// # We can never leave a dead state, so this always marks the
+// # end of our search.
+// return last_match
+// if current_state == special.quit_id:
+//         # A quit state means we give up. If the DFA has no quit state,
+// # then special.quit_id == 0 == dead, which is handled by the
+// # conditional above.
+// return Err(MatchError::Quit { byte, offset: offset - 1 })
+// if special.min_match <= current_state <= special.max_match:
+// last_match = Some(offset)
+// if special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+// last_match = Some(offset)
+// elif special.min_start <= current_state <= special.max_start:
+// offset = prefilter.find(input, offset)
+// if special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+// elif special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+//
+// There are some small details left out of the logic above. For example,
+// in order to accelerate a state, we need to know which bytes to search for.
+// This in turn implies some extra data we need to store in the DFA. To keep
+// things compact, we would ideally only store
+//
+// N = special.max_accel - special.min_accel + 1
+//
+// items. But state IDs are premultiplied, which means they are not contiguous.
+// So in order to take a state ID and index an array of accelerated structures,
+// we need to do:
+//
+// i = (state_id - special.min_accel) / stride
+//
+// (N.B. 'stride' is always a power of 2, so the above can be implemented via
+// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
+// 2^x=stride.)
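+//
+// As a concrete sketch of that shift (with illustrative numbers): if
+// stride = 512 = 2^9, then stride2 = 9 and
+//
+//     i = (state_id - special.min_accel) >> 9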
+//
+// Moreover, some of these specialty categories may be empty. For example,
+// DFAs are not required to have any match states or any accelerated states.
+// In that case, the lower and upper bounds are both set to 0 (the dead state
+// ID) and the first `current_state == 0` check subsumes cases where the
+// ranges are empty.
+//
+// Loop unrolling, if applicable, has also been left out of the logic above.
+//
+// Graphically, the ranges look like this, where asterisks indicate ranges
+// that can be empty. Each 'x' is a state.
+//
+// quit
+// dead|
+// ||
+// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+// | | | | start | |
+// | |-------------| |-------| |
+// | match* | | | |
+// | | | | |
+// | |----------| | |
+// | accel* | |
+// | | |
+// | | |
+// |----------------------------|------------------------
+// special non-special*
+#[derive(Clone, Copy, Debug)]
+pub struct Special {
+ /// The identifier of the last special state in a DFA. A state is special
+ /// if and only if its identifier is less than or equal to `max`.
+ pub max: StateID,
+ /// The identifier of the quit state in a DFA. (There is no analogous field
+ /// for the dead state since the dead state's ID is always zero, regardless
+ /// of state ID size.)
+ pub quit_id: StateID,
+ /// The identifier of the first match state.
+ pub min_match: StateID,
+ /// The identifier of the last match state.
+ pub max_match: StateID,
+ /// The identifier of the first accelerated state.
+ pub min_accel: StateID,
+ /// The identifier of the last accelerated state.
+ pub max_accel: StateID,
+ /// The identifier of the first start state.
+ pub min_start: StateID,
+ /// The identifier of the last start state.
+ pub max_start: StateID,
+}
+
+impl Special {
+ /// Creates a new set of special ranges for a DFA. All ranges are initially
+ /// set to only contain the dead state. This is interpreted as an empty
+ /// range.
+ #[cfg(feature = "alloc")]
+ pub fn new() -> Special {
+ Special {
+ max: DEAD,
+ quit_id: DEAD,
+ min_match: DEAD,
+ max_match: DEAD,
+ min_accel: DEAD,
+ max_accel: DEAD,
+ min_start: DEAD,
+ max_start: DEAD,
+ }
+ }
+
+ /// Remaps all of the special state identifiers using the function given.
+ #[cfg(feature = "alloc")]
+ pub fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
+ Special {
+ max: map(self.max),
+ quit_id: map(self.quit_id),
+ min_match: map(self.min_match),
+ max_match: map(self.max_match),
+ min_accel: map(self.min_accel),
+ max_accel: map(self.max_accel),
+ min_start: map(self.min_start),
+ max_start: map(self.max_start),
+ }
+ }
+
+ /// Deserialize the given bytes into special state ranges. If the slice
+ /// given is not big enough, then this returns an error. Similarly, if
+ /// any of the expected invariants around special state ranges aren't
+    /// upheld, an error is returned. Note that this does not guarantee that
+    /// the information returned is correct; it only checks that it is
+    /// internally consistent.
+ ///
+ /// Upon success, this returns the number of bytes read in addition to the
+ /// special state IDs themselves.
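+    ///
+    /// The layout read here is eight state IDs in the same fixed order
+    /// written by `write_to`: max, quit_id, min_match, max_match, min_accel,
+    /// max_accel, min_start, max_start.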
+ pub fn from_bytes(
+ mut slice: &[u8],
+ ) -> Result<(Special, usize), DeserializeError> {
+ bytes::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
+
+ let mut nread = 0;
+ let mut read_id = |what| -> Result<StateID, DeserializeError> {
+ let (id, nr) = bytes::try_read_state_id(slice, what)?;
+ nread += nr;
+ slice = &slice[StateID::SIZE..];
+ Ok(id)
+ };
+
+ let max = read_id("special max id")?;
+ let quit_id = read_id("special quit id")?;
+ let min_match = read_id("special min match id")?;
+ let max_match = read_id("special max match id")?;
+ let min_accel = read_id("special min accel id")?;
+ let max_accel = read_id("special max accel id")?;
+ let min_start = read_id("special min start id")?;
+ let max_start = read_id("special max start id")?;
+
+ let special = Special {
+ max,
+ quit_id,
+ min_match,
+ max_match,
+ min_accel,
+ max_accel,
+ min_start,
+ max_start,
+ };
+ special.validate()?;
+ assert_eq!(nread, special.write_to_len());
+ Ok((special, nread))
+ }
+
+ /// Validate that the information describing special states satisfies
+ /// all known invariants.
+ pub fn validate(&self) -> Result<(), DeserializeError> {
+ // Check that both ends of the range are DEAD or neither are.
+ if self.min_match == DEAD && self.max_match != DEAD {
+ err!("min_match is DEAD, but max_match is not");
+ }
+ if self.min_match != DEAD && self.max_match == DEAD {
+ err!("max_match is DEAD, but min_match is not");
+ }
+ if self.min_accel == DEAD && self.max_accel != DEAD {
+ err!("min_accel is DEAD, but max_accel is not");
+ }
+ if self.min_accel != DEAD && self.max_accel == DEAD {
+ err!("max_accel is DEAD, but min_accel is not");
+ }
+ if self.min_start == DEAD && self.max_start != DEAD {
+ err!("min_start is DEAD, but max_start is not");
+ }
+ if self.min_start != DEAD && self.max_start == DEAD {
+ err!("max_start is DEAD, but min_start is not");
+ }
+
+ // Check that ranges are well formed.
+ if self.min_match > self.max_match {
+ err!("min_match should not be greater than max_match");
+ }
+ if self.min_accel > self.max_accel {
+ err!("min_accel should not be greater than max_accel");
+ }
+ if self.min_start > self.max_start {
+ err!("min_start should not be greater than max_start");
+ }
+
+ // Check that ranges are ordered with respect to one another.
+ if self.matches() && self.quit_id >= self.min_match {
+ err!("quit_id should not be greater than min_match");
+ }
+ if self.accels() && self.quit_id >= self.min_accel {
+ err!("quit_id should not be greater than min_accel");
+ }
+ if self.starts() && self.quit_id >= self.min_start {
+ err!("quit_id should not be greater than min_start");
+ }
+ if self.matches() && self.accels() && self.min_accel < self.min_match {
+ err!("min_match should not be greater than min_accel");
+ }
+ if self.matches() && self.starts() && self.min_start < self.min_match {
+ err!("min_match should not be greater than min_start");
+ }
+ if self.accels() && self.starts() && self.min_start < self.min_accel {
+ err!("min_accel should not be greater than min_start");
+ }
+
+ // Check that max is at least as big as everything else.
+ if self.max < self.quit_id {
+ err!("quit_id should not be greater than max");
+ }
+ if self.max < self.max_match {
+ err!("max_match should not be greater than max");
+ }
+ if self.max < self.max_accel {
+ err!("max_accel should not be greater than max");
+ }
+ if self.max < self.max_start {
+ err!("max_start should not be greater than max");
+ }
+
+ Ok(())
+ }
+
+ /// Validate that the special state information is compatible with the
+ /// given state count.
+ pub fn validate_state_count(
+ &self,
+ count: usize,
+ stride2: usize,
+ ) -> Result<(), DeserializeError> {
+ // We assume that 'validate' has already passed, so we know that 'max'
+ // is truly the max. So all we need to check is that the max state
+ // ID is less than the state ID count. The max legal value here is
+ // count-1, which occurs when there are no non-special states.
+ if (self.max.as_usize() >> stride2) >= count {
+ err!("max should not be greater than or equal to state count");
+ }
+ Ok(())
+ }
+
+ /// Write the IDs and ranges for special states to the given byte buffer.
+ /// The buffer given must have enough room to store all data, otherwise
+ /// this will return an error. The number of bytes written is returned
+ /// on success. The number of bytes written is guaranteed to be a multiple
+ /// of 8.
+ pub fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ use crate::util::bytes::write_state_id as write;
+
+ if dst.len() < self.write_to_len() {
+ return Err(SerializeError::buffer_too_small("special state ids"));
+ }
+
+ let mut nwrite = 0;
+ nwrite += write::<E>(self.max, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);
+
+ assert_eq!(
+ self.write_to_len(),
+ nwrite,
+            "expected to write a certain number of bytes",
+ );
+ assert_eq!(
+ nwrite % 8,
+ 0,
+ "expected to write multiple of 8 bytes for special states",
+ );
+ Ok(nwrite)
+ }
+
+ /// Returns the total number of bytes written by `write_to`.
+ pub fn write_to_len(&self) -> usize {
+ 8 * StateID::SIZE
+ }
+
+ /// Sets the maximum special state ID based on the current values. This
+ /// should be used once all possible state IDs are set.
+ #[cfg(feature = "alloc")]
+ pub fn set_max(&mut self) {
+ use core::cmp::max;
+ self.max = max(
+ self.quit_id,
+ max(self.max_match, max(self.max_accel, self.max_start)),
+ );
+ }
+
+ /// Returns true if and only if the given state ID is a special state.
+ #[inline]
+ pub fn is_special_state(&self, id: StateID) -> bool {
+ id <= self.max
+ }
+
+ /// Returns true if and only if the given state ID is a dead state.
+ #[inline]
+ pub fn is_dead_state(&self, id: StateID) -> bool {
+ id == DEAD
+ }
+
+ /// Returns true if and only if the given state ID is a quit state.
+ #[inline]
+ pub fn is_quit_state(&self, id: StateID) -> bool {
+ !self.is_dead_state(id) && self.quit_id == id
+ }
+
+ /// Returns true if and only if the given state ID is a match state.
+ #[inline]
+ pub fn is_match_state(&self, id: StateID) -> bool {
+ !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
+ }
+
+ /// Returns true if and only if the given state ID is an accel state.
+ #[inline]
+ pub fn is_accel_state(&self, id: StateID) -> bool {
+ !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
+ }
+
+ /// Returns true if and only if the given state ID is a start state.
+ #[inline]
+ pub fn is_start_state(&self, id: StateID) -> bool {
+ !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
+ }
+
+ /// Returns the total number of match states for a dense table based DFA.
+ #[inline]
+ pub fn match_len(&self, stride: usize) -> usize {
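+        // For example (illustrative numbers): with stride=512, min_match=512
+        // and max_match=1536, this computes (1536 - 512 + 512) / 512 = 3,
+        // i.e., the three match states with IDs 512, 1024 and 1536.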
+ if self.matches() {
+ (self.max_match.as_usize() - self.min_match.as_usize() + stride)
+ / stride
+ } else {
+ 0
+ }
+ }
+
+ /// Returns true if and only if there is at least one match state.
+ #[inline]
+ pub fn matches(&self) -> bool {
+ self.min_match != DEAD
+ }
+
+ /// Returns the total number of accel states.
+ #[cfg(feature = "alloc")]
+ pub fn accel_len(&self, stride: usize) -> usize {
+ if self.accels() {
+ (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
+ / stride
+ } else {
+ 0
+ }
+ }
+
+ /// Returns true if and only if there is at least one accel state.
+ #[inline]
+ pub fn accels(&self) -> bool {
+ self.min_accel != DEAD
+ }
+
+ /// Returns true if and only if there is at least one start state.
+ #[inline]
+ pub fn starts(&self) -> bool {
+ self.min_start != DEAD
+ }
+}
diff --git a/vendor/regex-automata/src/dfa/transducer.rs b/vendor/regex-automata/src/dfa/transducer.rs
new file mode 100644
index 000000000..58b34e00a
--- /dev/null
+++ b/vendor/regex-automata/src/dfa/transducer.rs
@@ -0,0 +1,207 @@
+use crate::{
+ dfa::{automaton::Automaton, dense, sparse},
+ util::id::StateID,
+};
+
+impl<T: AsRef<[u32]>> fst::Automaton for dense::DFA<T> {
+ type State = StateID;
+
+ #[inline]
+ fn start(&self) -> StateID {
+ self.start_state_forward(None, &[], 0, 0)
+ }
+
+ #[inline]
+ fn is_match(&self, state: &StateID) -> bool {
+ self.is_match_state(*state)
+ }
+
+ #[inline]
+ fn accept(&self, state: &StateID, byte: u8) -> StateID {
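+        // Note: match states are treated as absorbing here. Once a match
+        // state is entered, every subsequent byte maps back to that same
+        // state, so any key with a matching prefix remains a match for the
+        // fst search.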
+ if fst::Automaton::is_match(self, state) {
+ return *state;
+ }
+ self.next_state(*state, byte)
+ }
+
+ #[inline]
+ fn accept_eof(&self, state: &StateID) -> Option<StateID> {
+ if fst::Automaton::is_match(self, state) {
+ return Some(*state);
+ }
+ Some(self.next_eoi_state(*state))
+ }
+
+ #[inline]
+ fn can_match(&self, state: &StateID) -> bool {
+ !self.is_dead_state(*state)
+ }
+}
+
+impl<T: AsRef<[u8]>> fst::Automaton for sparse::DFA<T> {
+ type State = StateID;
+
+ #[inline]
+ fn start(&self) -> StateID {
+ self.start_state_forward(None, &[], 0, 0)
+ }
+
+ #[inline]
+ fn is_match(&self, state: &StateID) -> bool {
+ self.is_match_state(*state)
+ }
+
+ #[inline]
+ fn accept(&self, state: &StateID, byte: u8) -> StateID {
+ if fst::Automaton::is_match(self, state) {
+ return *state;
+ }
+ self.next_state(*state, byte)
+ }
+
+ #[inline]
+ fn accept_eof(&self, state: &StateID) -> Option<StateID> {
+ if fst::Automaton::is_match(self, state) {
+ return Some(*state);
+ }
+ Some(self.next_eoi_state(*state))
+ }
+
+ #[inline]
+ fn can_match(&self, state: &StateID) -> bool {
+ !self.is_dead_state(*state)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use bstr::BString;
+ use fst::{Automaton, IntoStreamer, Set, Streamer};
+
+ use crate::dfa::{dense, sparse};
+
+ fn search<A: Automaton, D: AsRef<[u8]>>(
+ set: &Set<D>,
+ aut: A,
+ ) -> Vec<BString> {
+ let mut stream = set.search(aut).into_stream();
+
+ let mut results = vec![];
+ while let Some(key) = stream.next() {
+ results.push(BString::from(key));
+ }
+ results
+ }
+
+ #[test]
+ fn dense_anywhere() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = dense::DFA::new("ba.*").unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
+ }
+
+ #[test]
+ fn dense_anchored() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = dense::Builder::new()
+ .configure(dense::Config::new().anchored(true))
+ .build("ba.*")
+ .unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz"]);
+ }
+
+ #[test]
+ fn dense_assertions_start() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = dense::Builder::new().build("^ba.*").unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz"]);
+ }
+
+ #[test]
+ fn dense_assertions_end() {
+ let set =
+ Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = dense::Builder::new().build(".*x$").unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bax", "xbax"]);
+ }
+
+ #[test]
+ fn dense_assertions_word() {
+ let set =
+ Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap();
+ let dfa = dense::Builder::new().build(r"(?-u)\bfoo\b").unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["foo", "zzz foo zzz"]);
+ }
+
+ #[test]
+ fn sparse_anywhere() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = sparse::DFA::new("ba.*").unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
+ }
+
+ #[test]
+ fn sparse_anchored() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa = dense::Builder::new()
+ .configure(dense::Config::new().anchored(true))
+ .build("ba.*")
+ .unwrap()
+ .to_sparse()
+ .unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz"]);
+ }
+
+ #[test]
+ fn sparse_assertions_start() {
+ let set =
+ Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa =
+ dense::Builder::new().build("^ba.*").unwrap().to_sparse().unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bar", "baz"]);
+ }
+
+ #[test]
+ fn sparse_assertions_end() {
+ let set =
+ Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"])
+ .unwrap();
+ let dfa =
+ dense::Builder::new().build(".*x$").unwrap().to_sparse().unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["bax", "xbax"]);
+ }
+
+ #[test]
+ fn sparse_assertions_word() {
+ let set =
+ Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap();
+ let dfa = dense::Builder::new()
+ .build(r"(?-u)\bfoo\b")
+ .unwrap()
+ .to_sparse()
+ .unwrap();
+ let got = search(&set, &dfa);
+ assert_eq!(got, vec!["foo", "zzz foo zzz"]);
+ }
+}
diff --git a/vendor/regex-automata/src/error.rs b/vendor/regex-automata/src/error.rs
deleted file mode 100644
index 70fe436ea..000000000
--- a/vendor/regex-automata/src/error.rs
+++ /dev/null
@@ -1,150 +0,0 @@
-use std::error;
-use std::fmt;
-use std::result;
-
-use regex_syntax;
-
-pub type Result<T> = result::Result<T, Error>;
-
-/// An error that occurred during the construction of a DFA.
-#[derive(Clone, Debug)]
-pub struct Error {
- kind: ErrorKind,
-}
-
-/// The kind of error that occurred.
-#[derive(Clone, Debug)]
-pub enum ErrorKind {
- /// An error that occurred while parsing a regular expression. Note that
- /// this error may be printed over multiple lines, and is generally
- /// intended to be end user readable on its own.
- Syntax(String),
- /// An error that occurred because an unsupported regex feature was used.
- /// The message string describes which unsupported feature was used.
- ///
- /// The primary regex features that are unsupported are those that require
- /// look-around, such as the `^` and `$` anchors and the word boundary
- /// assertion `\b`. These may be supported in the future.
- Unsupported(String),
- /// An error that occurred when attempting to serialize a DFA to bytes.
- Serialize(String),
- /// An error that occurs when constructing a DFA would require the use of
- /// a state ID that overflows the chosen state ID representation. For
- /// example, if one is using `u8` for state IDs and builds a DFA with
- /// 257 states, then the last state's ID will be `256` which cannot be
- /// represented with `u8`.
- ///
- /// Typically, this error occurs in the determinization process of building
- /// a DFA (the conversion step from NFA to DFA). It can also occur when
- /// trying to build a smaller DFA from an existing one.
- StateIDOverflow {
- /// The maximum possible state ID.
- max: usize,
- },
- /// An error that occurs when premultiplication of state IDs is requested,
- /// but doing so would overflow the chosen state ID representation.
- ///
- /// When `max == requested_max`, then the state ID would overflow `usize`.
- PremultiplyOverflow {
- /// The maximum possible state id.
- max: usize,
- /// The maximum ID required by premultiplication.
- requested_max: usize,
- },
-}
-
-impl Error {
- /// Return the kind of this error.
- pub fn kind(&self) -> &ErrorKind {
- &self.kind
- }
-
- pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
- Error { kind: ErrorKind::Syntax(err.to_string()) }
- }
-
- pub(crate) fn unsupported_anchor() -> Error {
- let msg = r"anchors such as ^, $, \A and \z are not supported";
- Error { kind: ErrorKind::Unsupported(msg.to_string()) }
- }
-
- pub(crate) fn unsupported_word() -> Error {
- let msg = r"word boundary assertions (\b and \B) are not supported";
- Error { kind: ErrorKind::Unsupported(msg.to_string()) }
- }
-
- pub(crate) fn unsupported_longest_match() -> Error {
-        let msg = "unanchored searches with longest match \
- semantics are not supported";
- Error { kind: ErrorKind::Unsupported(msg.to_string()) }
- }
-
- pub(crate) fn serialize(message: &str) -> Error {
- Error { kind: ErrorKind::Serialize(message.to_string()) }
- }
-
- pub(crate) fn state_id_overflow(max: usize) -> Error {
- Error { kind: ErrorKind::StateIDOverflow { max } }
- }
-
- pub(crate) fn premultiply_overflow(
- max: usize,
- requested_max: usize,
- ) -> Error {
- Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } }
- }
-}
-
-impl error::Error for Error {
- fn description(&self) -> &str {
- match self.kind {
- ErrorKind::Syntax(_) => "syntax error",
- ErrorKind::Unsupported(_) => "unsupported syntax",
- ErrorKind::Serialize(_) => "serialization error",
- ErrorKind::StateIDOverflow { .. } => {
- "state id representation too small"
- }
- ErrorKind::PremultiplyOverflow { .. } => {
- "state id representation too small for premultiplication"
- }
- }
- }
-}
-
-impl fmt::Display for Error {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- match self.kind {
- ErrorKind::Syntax(ref msg) => write!(f, "{}", msg),
- ErrorKind::Unsupported(ref msg) => write!(f, "{}", msg),
- ErrorKind::Serialize(ref msg) => {
- write!(f, "DFA serialization error: {}", msg)
- }
- ErrorKind::StateIDOverflow { max } => write!(
- f,
- "building the DFA failed because it required building \
-                 more states than can be identified, where the maximum \
- ID for the chosen representation is {}",
- max,
- ),
- ErrorKind::PremultiplyOverflow { max, requested_max } => {
- if max == requested_max {
- write!(
- f,
- "premultiplication of states requires the ability to \
- represent a state ID greater than what can fit on \
- this platform's usize, which is {}",
- ::std::usize::MAX,
- )
- } else {
- write!(
- f,
- "premultiplication of states requires the ability to \
- represent at least a state ID of {}, but the chosen \
- representation only permits a maximum state ID of {}",
- requested_max, max,
- )
- }
- }
- }
- }
-}
diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs
new file mode 100644
index 000000000..1fbce5f5f
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/dfa.rs
@@ -0,0 +1,3817 @@
+/*!
+Types and routines specific to lazy DFAs.
+
+This module is the home of [`hybrid::dfa::DFA`](DFA).
+
+This module also contains a [`hybrid::dfa::Builder`](Builder) and a
+[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
+*/
+
+use core::{borrow::Borrow, iter, mem::size_of};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::{
+ hybrid::{
+ error::{BuildError, CacheError},
+ id::{LazyStateID, LazyStateIDError, OverlappingState},
+ search,
+ },
+ nfa::thompson,
+ util::{
+ alphabet::{self, ByteClasses, ByteSet},
+ determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
+ id::{PatternID, StateID as NFAStateID},
+ matchtypes::{HalfMatch, MatchError, MatchKind},
+ prefilter,
+ sparse_set::SparseSets,
+ start::Start,
+ },
+};
+
+/// The minimum number of states that a lazy DFA's cache size must support.
+///
+/// This is checked at time of construction to ensure that at least some small
+/// number of states can fit in the given capacity allotment. If we can't fit
+/// at least this number of states, then the thinking is that it's pretty
+/// senseless to use the lazy DFA. More to the point, parts of the code do
+/// assume that the cache can fit at least some small number of states.
+const MIN_STATES: usize = 5;
+
+/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
+///
+/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
+/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
+/// Indeed, both support precisely the same regex features with precisely the
+/// same semantics.
+///
+/// Whereas a `dense::DFA` must be completely built to handle any input before
+/// it may be used for search, a lazy DFA starts off effectively empty. During
+/// a search, a lazy DFA will build itself depending on whether it has already
+/// computed the next transition or not. If it has, then it looks a lot like
+/// a `dense::DFA` internally: it does a very fast table based access to find
+/// the next transition. Otherwise, if the state hasn't been computed, then it
+/// does determinization _for that specific transition_ to compute the next DFA
+/// state.
+///
+/// The main selling point of a lazy DFA is that, in practice, it has
+/// the performance profile of a `dense::DFA` without the weakness of it
+/// taking worst case exponential time to build. Indeed, for each byte of
+/// input, the lazy DFA will construct at most one new DFA state. Thus, a
+/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
+/// pattern.len()` and `n ~ haystack.len()`).
+///
+/// The main downsides of a lazy DFA are:
+///
+/// 1. It requires mutable "cache" space during search. This is where the
+/// transition table, among other things, is stored.
+/// 2. In pathological cases (e.g., if the cache is too small), it will run
+/// out of room and either require a bigger cache capacity or will repeatedly
+/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
+/// will tend to be slower than a typical NFA simulation.
+///
+/// # Capabilities
+///
+/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
+/// operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of a match.
+/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
+/// is reported as well.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of
+/// a match, *two* lazy DFAs are required. This functionality is provided by a
+/// [`Regex`](crate::hybrid::regex::Regex).
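+///
+/// As a brief sketch (using the `hybrid::regex::Regex` API from this crate;
+/// see that module's docs for the authoritative examples), finding both
+/// offsets might look like this:
+///
+/// ```
+/// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+///
+/// let re = Regex::new("foo[0-9]+")?;
+/// let mut cache = re.create_cache();
+/// assert_eq!(
+///     Some(MultiMatch::must(0, 3, 11)),
+///     re.find_leftmost(&mut cache, b"zzzfoo12345zzz")?,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```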
+///
+/// # Example
+///
+/// This shows how to build a lazy DFA with the default configuration and
+/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
+/// a cache and pass it to our search routine.
+///
+/// ```
+/// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let mut cache = dfa.create_cache();
+///
+/// let expected = Some(HalfMatch::must(0, 8));
+/// assert_eq!(expected, dfa.find_leftmost_fwd(&mut cache, b"foo12345")?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct DFA {
+ nfa: Arc<thompson::NFA>,
+ stride2: usize,
+ classes: ByteClasses,
+ quitset: ByteSet,
+ anchored: bool,
+ match_kind: MatchKind,
+ starts_for_each_pattern: bool,
+ cache_capacity: usize,
+ minimum_cache_clear_count: Option<usize>,
+}
+
+impl DFA {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding lazy DFA.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+bar")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let expected = HalfMatch::must(0, 11);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?,
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<DFA, BuildError> {
+ DFA::builder().build(pattern)
+ }
+
+ /// Parse the given regular expressions using a default configuration and
+ /// return the corresponding lazy multi-DFA.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let expected = HalfMatch::must(1, 3);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?,
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
+ DFA::builder().build_many(patterns)
+ }
+
+ /// Create a new lazy DFA that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::always_match()?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"")?);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"foo")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<DFA, BuildError> {
+ let nfa = thompson::NFA::always_match();
+ Builder::new().build_from_nfa(Arc::new(nfa))
+ }
+
+ /// Create a new lazy DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::never_match()?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"")?);
+ /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"foo")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<DFA, BuildError> {
+ let nfa = thompson::NFA::never_match();
+ Builder::new().build_from_nfa(Arc::new(nfa))
+ }
+
+ /// Return a default configuration for a `DFA`.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a lazy DFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a lazy DFA that only executes searches
+ /// in anchored mode.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().anchored(true))
+ /// .build(r"[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "abc123xyz".as_bytes();
+ /// assert_eq!(None, re.find_leftmost_fwd(&mut cache, haystack)?);
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 3)),
+ /// re.find_leftmost_fwd(&mut cache, &haystack[3..6])?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere for lazy DFAs. This includes disabling it for both the
+ /// concrete syntax (e.g., `.` matches any byte and Unicode character
+ /// classes like `\p{Letter}` are not allowed) and for the unanchored
+ /// search prefix. The latter enables the regex to match anywhere in a
+ /// sequence of arbitrary bytes. (Typically, the unanchored search prefix
+ /// will only permit matching valid UTF-8.)
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// nfa::thompson,
+ /// HalfMatch, SyntaxConfig,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .syntax(SyntaxConfig::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(HalfMatch::must(0, 9));
+ /// let got = re.find_leftmost_fwd(&mut cache, haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Create a new cache for this lazy DFA.
+ ///
+ /// The cache returned should only be used for searches for this
+ /// lazy DFA. If you want to reuse the cache for another DFA, then
+ /// you must call [`Cache::reset`] with that DFA (or, equivalently,
+ /// [`DFA::reset_cache`]).
+ pub fn create_cache(&self) -> Cache {
+ Cache::new(self)
+ }
+
+ /// Reset the given cache such that it can be used for searching with the
+ /// this lazy DFA (and only this DFA).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different lazy DFA.
+ ///
+ /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+ /// lazy DFA has been configured to "give up" after it has cleared the
+ /// cache a certain number of times.
+ ///
+ /// Any lazy state ID generated by the cache prior to resetting it is
+ /// invalid after the reset.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different DFA.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa1 = DFA::new(r"\w")?;
+ /// let dfa2 = DFA::new(r"\W")?;
+ ///
+ /// let mut cache = dfa1.create_cache();
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 2)),
+ /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+ /// );
+ ///
+ /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the DFA we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 'dfa1' is also not
+ /// // allowed.
+ /// dfa2.reset_cache(&mut cache);
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 3)),
+ /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset_cache(&self, cache: &mut Cache) {
+ Lazy::new(self, cache).reset_cache()
+ }
+
+ /// Returns the total number of patterns compiled into this lazy DFA.
+ ///
+ /// In the case of a DFA that contains no patterns, this returns `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the pattern count for a DFA that never matches:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::never_match()?;
+ /// assert_eq!(dfa.pattern_count(), 0);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And another example for a DFA that matches at every position:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::always_match()?;
+ /// assert_eq!(dfa.pattern_count(), 1);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And finally, a DFA that was constructed from multiple patterns:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(dfa.pattern_count(), 3);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_count(&self) -> usize {
+ self.nfa.pattern_len()
+ }
+
+ /// Returns a reference to the underlying NFA.
+ pub fn nfa(&self) -> &Arc<thompson::NFA> {
+ &self.nfa
+ }
+
+ /// Returns the stride, as a base-2 exponent, required for these
+ /// equivalence classes.
+ ///
+ /// The stride is always the smallest power of 2 that is greater than or
+ /// equal to the alphabet length. This is done so that converting between
+ /// state IDs and indices can be done with shifts alone, which is much
+ /// faster than integer division.
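+    ///
+    /// For example (with illustrative numbers): an alphabet length of 5
+    /// yields a stride of 8 and a `stride2` of 3, so an index is recovered
+    /// from a premultiplied ID via `id >> 3` instead of `id / 8`.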
+ fn stride2(&self) -> usize {
+ self.stride2
+ }
+
+ /// Returns the total stride for every state in this lazy DFA. This
+ /// corresponds to the total number of transitions used by each state in
+ /// this DFA's transition table.
+ fn stride(&self) -> usize {
+ 1 << self.stride2()
+ }
+
+ /// Returns the total number of elements in the alphabet for this
+ /// transition table. This is always less than or equal to `self.stride()`.
+ /// It is only equal when the alphabet length is a power of 2. Otherwise,
+ /// it is always strictly less.
+ fn alphabet_len(&self) -> usize {
+ self.classes.alphabet_len()
+ }
+
+ /// Returns the memory usage, in bytes, of this lazy DFA.
+ ///
+ /// This does **not** include the stack size used up by this lazy DFA. To
+ /// compute that, use `std::mem::size_of::<DFA>()`. This also does
+ /// not include the size of the `Cache` used.
+ pub fn memory_usage(&self) -> usize {
+ // Everything else is on the stack.
+ self.nfa.memory_usage()
+ }
+}
+
+impl DFA {
+ /// Executes a forward search and returns the end position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state. This is useful for implementing boolean `is_match`-like
+ /// routines, where as little work is done as possible.
+ ///
+ /// See [`DFA::find_earliest_fwd_at`] for additional functionality, such as
+ /// providing a prefilter, a specific pattern to match and the bounds of
+ /// the search within the haystack. This routine is meant as a convenience
+ /// for common cases where the additional functionality is not needed.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates how the position returned might differ from
+ /// what one might expect when executing a traditional leftmost search.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 8,
+ /// // corresponding to the end of the input. But the "earliest" semantics
+    /// // of this routine cause it to stop as soon as a match is known, which
+ /// // occurs once 'foo[0-9]' has matched.
+ /// let expected = HalfMatch::must(0, 4);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_earliest_fwd(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// let dfa = DFA::new("abc|a")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 3,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let expected = HalfMatch::must(0, 1);
+ /// assert_eq!(Some(expected), dfa.find_earliest_fwd(&mut cache, b"abc")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_earliest_fwd(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_earliest_fwd_at(cache, None, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a reverse search and returns the start position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state.
+ ///
+ /// Note that while it is not technically necessary to build a reverse
+ /// automaton to use a reverse search, it is likely that you'll want to do
+ /// so. Namely, the typical use of a reverse search is to find the starting
+ /// location of a match once its end is discovered from a forward search. A
+ /// reverse DFA automaton can be built by configuring the intermediate NFA
+ /// to be reversed via
+ /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates how the position returned might differ from
+ /// what one might expect when executing a traditional leftmost reverse
+ /// search.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("[a-z]+[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 0,
+ /// // corresponding to the beginning of the input. But the "earliest"
+ /// // semantics of this routine cause it to stop as soon as a match is
+ /// // known, which occurs once '[a-z][0-9]+' has matched.
+ /// let expected = HalfMatch::must(0, 2);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_earliest_rev(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("abc|c")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 0,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let expected = HalfMatch::must(0, 2);
+ /// assert_eq!(Some(expected), dfa.find_earliest_rev(&mut cache, b"abc")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_earliest_rev(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_earliest_rev_at(cache, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a forward search and returns the end position of the leftmost
+ /// match that is found. If no match exists, then `None` is returned.
+ ///
+ /// In particular, this method continues searching even after it enters
+ /// a match state. The search only terminates once it has reached the
+ /// end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// Leftmost first match semantics corresponds to the match with the
+ /// smallest starting offset, but where the end offset is determined by
+ /// preferring earlier branches in the original regular expression. For
+ /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+ /// will match `Samwise` in `Samwise`.
+ ///
+ /// Generally speaking, the "leftmost first" match is how most backtracking
+ /// regular expressions tend to work. This is in contrast to POSIX-style
+ /// regular expressions that yield "leftmost longest" matches. Namely,
+ /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+ /// leftmost longest semantics. (This crate does not currently support
+ /// leftmost longest semantics.)
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_fwd(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = DFA::new("abc|a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 3);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_leftmost_fwd(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_leftmost_fwd_at(cache, None, None, bytes, 0, bytes.len())
+ }
+
+    /// Executes a reverse search and returns the start position of the
+ /// leftmost match that is found. If no match exists, then `None` is
+ /// returned.
+ ///
+ /// In particular, this method continues searching even after it enters
+ /// a match state. The search only terminates once it has reached the
+ /// end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// In particular, this routine is principally
+ /// useful when used in conjunction with the
+    /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+    /// configuration. In general, it's unlikely to be correct to use both
+ /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since
+ /// any particular DFA will only support searching in one direction with
+ /// respect to the pattern.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson, hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_rev(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// // Even though a match is found after reading the last byte (`c`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("abc|c")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_rev(&mut cache, b"abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_leftmost_rev(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_leftmost_rev_at(cache, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes an overlapping forward search and returns the end position of
+ /// matches as they are found. If no match exists, then `None` is returned.
+ ///
+ /// This routine is principally only useful when searching for multiple
+ /// patterns on inputs where multiple patterns may match the same regions
+ /// of text. In particular, callers must preserve the automaton's search
+ /// state from prior calls so that the implementation knows where the last
+ /// match occurred.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run a basic overlapping search. Notice
+ /// that we build the automaton with a `MatchKind::All` configuration.
+ /// Overlapping searches are unlikely to work as one would expect when
+ /// using the default `MatchKind::LeftmostFirst` match semantics, since
+ /// leftmost-first matching is fundamentally incompatible with overlapping
+ /// searches. Namely, overlapping searches need to report matches as they
+    /// are seen, whereas leftmost-first searches will continue searching even
+ /// after a match has been observed in order to find the conventional end
+ /// position of the match. More concretely, leftmost-first searches use
+ /// dead states to terminate a search after a specific match can no longer
+ /// be extended. Overlapping searches instead do the opposite by continuing
+ /// the search to find totally new matches (potentially of other patterns).
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, OverlappingState},
+ /// HalfMatch,
+ /// MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(HalfMatch::must(1, 4));
+ /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+ /// // pattern begins its match before the first, is therefore an earlier
+ /// // match and is thus reported first.
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_overlapping_fwd(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_overlapping_fwd_at(
+ cache,
+ None,
+ None,
+ bytes,
+ 0,
+ bytes.len(),
+ state,
+ )
+ }
+
+ /// Executes a forward search and returns the end position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state. This is useful for implementing boolean `is_match`-like
+ /// routines, where as little work is done as possible.
+ ///
+ /// This is like [`DFA::find_earliest_fwd`], except it provides some
+ /// additional control over how the search is executed:
+ ///
+ /// * `pre` is a prefilter scanner that, when given, is used whenever the
+ /// DFA enters its starting state. This is meant to speed up searches where
+ /// one or a small number of literal prefixes are known.
+ /// * `pattern_id` specifies a specific pattern in the DFA to run an
+ /// anchored search for. If not given, then a search for any pattern is
+ /// performed. For lazy DFAs, [`Config::starts_for_each_pattern`] must be
+ /// enabled to use this functionality.
+ /// * `start` and `end` permit searching a specific region of the haystack
+ /// `bytes`. This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `bytes`. (This is because look-around
+    /// operations such as `\b`, `^` and `$` need to take the surrounding
+    /// context into account, which cannot be done if the haystack doesn't
+    /// contain it.)
+ ///
+ /// The examples below demonstrate each of these additional parameters.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and this lazy DFA does
+ /// not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
+ ///
+ /// # Example: prefilter
+ ///
+ /// This example shows how to provide a prefilter for a pattern where all
+ /// matches start with a `z` byte.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// util::prefilter::{Candidate, Prefilter, Scanner, State},
+ /// HalfMatch,
+ /// };
+ ///
+ /// #[derive(Debug)]
+ /// pub struct ZPrefilter;
+ ///
+ /// impl Prefilter for ZPrefilter {
+ /// fn next_candidate(
+ /// &self,
+ /// _: &mut State,
+ /// haystack: &[u8],
+ /// at: usize,
+ /// ) -> Candidate {
+ /// // Try changing b'z' to b'q' and observe this test fail since
+ /// // the prefilter will skip right over the match.
+ /// match haystack[at..].iter().position(|&b| b == b'z') {
+ /// None => Candidate::None,
+ /// Some(i) => Candidate::PossibleStartOfMatch(at + i),
+ /// }
+ /// }
+ ///
+ /// fn heap_bytes(&self) -> usize {
+ /// 0
+ /// }
+ /// }
+ ///
+ /// let dfa = DFA::new("z[0-9]{3}")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "foobar z123 q123".as_bytes();
+ /// // A scanner executes a prefilter while tracking some state that helps
+ /// // determine whether a prefilter is still "effective" or not.
+ /// let mut scanner = Scanner::new(&ZPrefilter);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 11));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// Some(&mut scanner),
+ /// None,
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a lazy multi-DFA that permits searching
+ /// for specific patterns.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// HalfMatch,
+ /// PatternID,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "foo123".as_bytes();
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// None,
+ /// None,
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let expected = Some(HalfMatch::must(1, 6));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// None,
+ /// Some(PatternID::must(1)),
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// // N.B. We disable Unicode here so that we use a simple ASCII word
+ /// // boundary. Alternatively, we could enable heuristic support for
+ /// // Unicode word boundaries since our haystack is pure ASCII.
+ /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about the
+ /// // larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `3` instead of `6`.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// None,
+ /// None,
+ /// &haystack[3..6],
+ /// 0,
+ /// haystack[3..6].len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// None,
+ /// None,
+ /// haystack,
+ /// 3,
+ /// 6,
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_earliest_fwd_at(
+ &self,
+ cache: &mut Cache,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_earliest_fwd(
+ pre, self, cache, pattern_id, bytes, start, end,
+ )
+ }
+
+ /// Executes a reverse search and returns the start position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state.
+ ///
+ /// This is like [`DFA::find_earliest_rev`], except it provides some
+ /// additional control over how the search is executed. See the
+ /// documentation of [`DFA::find_earliest_fwd_at`] for more details
+ /// on the additional parameters along with examples of their usage.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
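+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of a reverse search over the full haystack range,
+ /// assuming a lazy DFA compiled in reverse via the Thompson NFA
+ /// `reverse` option:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "foo12345".as_bytes();
+ ///
+ /// // A reverse search reports the start offset of the match.
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// let got = dfa.find_earliest_rev_at(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```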
+ #[inline]
+ pub fn find_earliest_rev_at(
+ &self,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_earliest_rev(self, cache, pattern_id, bytes, start, end)
+ }
+
+ /// Executes a forward search and returns the end position of the leftmost
+ /// match that is found. If no match exists, then `None` is returned.
+ ///
+ /// This is like [`DFA::find_leftmost_fwd`], except it provides some
+ /// additional control over how the search is executed. See the
+ /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the
+ /// additional parameters along with examples of their usage.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
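+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of a leftmost forward search over the full haystack
+ /// range, without a prefilter or a specific pattern:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "zzzfoo12345zzz".as_bytes();
+ ///
+ /// // Unlike the "earliest" routines, a leftmost search continues until
+ /// // the match can no longer be extended: it ends at 11, not 7.
+ /// let expected = Some(HalfMatch::must(0, 11));
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```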
+ #[inline]
+ pub fn find_leftmost_fwd_at(
+ &self,
+ cache: &mut Cache,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_leftmost_fwd(
+ pre, self, cache, pattern_id, bytes, start, end,
+ )
+ }
+
+ /// Executes a reverse search and returns the start position of the
+ /// leftmost match that is found. If no match exists, then `None` is
+ /// returned.
+ ///
+ /// This is like [`DFA::find_leftmost_rev`], except it provides some
+ /// additional control over how the search is executed. See the
+ /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the
+ /// additional parameters along with examples of their usage.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
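+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of a leftmost reverse search, assuming a lazy DFA
+ /// compiled in reverse via the Thompson NFA `reverse` option:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "foo12345".as_bytes();
+ ///
+ /// // A reverse search reports the start offset of the leftmost match.
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// let got = dfa.find_leftmost_rev_at(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```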
+ #[inline]
+ pub fn find_leftmost_rev_at(
+ &self,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_leftmost_rev(self, cache, pattern_id, bytes, start, end)
+ }
+
+ /// Executes an overlapping forward search and returns the end position of
+ /// matches as they are found. If no match exists, then `None` is returned.
+ ///
+ /// This routine is principally useful when searching for multiple
+ /// patterns on inputs where multiple patterns may match the same regions
+ /// of text. In particular, callers must preserve the automaton's search
+ /// state from prior calls so that the implementation knows where the last
+ /// match occurred.
+ ///
+ /// This is like [`DFA::find_overlapping_fwd`], except it provides
+ /// some additional control over how the search is executed. See the
+ /// documentation of [`DFA::find_earliest_fwd_at`] for more details
+ /// on the additional parameters along with examples of their usage.
+ ///
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should always be set to the end
+ /// of the last match. If more patterns match at the previous location,
+ /// then they will be immediately returned. (This is tracked by the given
+ /// overlapping state.) Otherwise, the search continues at the starting
+ /// position given.
+ ///
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
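+ ///
+ /// # Example
+ ///
+ /// A minimal sketch that mirrors the example for
+ /// [`DFA::find_overlapping_fwd`], but drives this lower level routine
+ /// directly with an explicit haystack range and overlapping state:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, OverlappingState},
+ /// HalfMatch,
+ /// MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let got = dfa.find_overlapping_fwd_at(
+ /// &mut cache, None, None, haystack, 0, haystack.len(), &mut state,
+ /// )?;
+ /// assert_eq!(Some(HalfMatch::must(1, 4)), got);
+ ///
+ /// // Another pattern matches at the same position, so re-running the
+ /// // search with the same state yields it as well.
+ /// let got = dfa.find_overlapping_fwd_at(
+ /// &mut cache, None, None, haystack, 0, haystack.len(), &mut state,
+ /// )?;
+ /// assert_eq!(Some(HalfMatch::must(0, 4)), got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```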
+ #[inline]
+ pub fn find_overlapping_fwd_at(
+ &self,
+ cache: &mut Cache,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_overlapping_fwd(
+ pre, self, cache, pattern_id, bytes, start, end, state,
+ )
+ }
+}
+
+impl DFA {
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input.
+ ///
+ /// The given cache is used to either reuse pre-computed state
+ /// transitions, or to store this newly computed transition for future
+ /// reuse. Thus, this routine guarantees that it will never return a state
+ /// ID that has an "unknown" tag.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is the lazy state ID returned
+ /// by the most recent call to `next_state`, `next_state_untagged`,
+ /// `next_state_untagged_unchecked`, `start_state_forward` or
+ /// `start_state_reverse` for the given `cache`. Any state ID returned from
+ /// prior calls to these routines (with the same `cache`) is considered
+ /// invalid (even if it gives an appearance of working). State IDs returned
+ /// from _any_ prior call for different `cache` values are also always
+ /// invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID. Moreover, this routine is defined for all possible values of
+ /// `input`.
+ ///
+ /// These validity rules are not checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid or
+ /// incorrect ID.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a lazy DFA for a given
+ /// haystack by using the `next_state` method.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// sid = dfa.next_state(&mut cache, sid, b)?;
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+ /// // special "EOI" transition at the end of the search.
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn next_state(
+ &self,
+ cache: &mut Cache,
+ current: LazyStateID,
+ input: u8,
+ ) -> Result<LazyStateID, CacheError> {
+ let class = usize::from(self.classes.get(input));
+ let offset = current.as_usize_untagged() + class;
+ let sid = cache.trans[offset];
+ if !sid.is_unknown() {
+ return Ok(sid);
+ }
+ let unit = alphabet::Unit::u8(input);
+ Lazy::new(self, cache).cache_next_state(current, unit)
+ }
+
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input and a state ID that is not tagged.
+ ///
+ /// The only reason to use this routine is performance. In particular, the
+ /// `next_state` method needs to do some additional checks, among them is
+ /// to account for identifiers to states that are not yet computed. In
+ /// such a case, the transition is computed on the fly. However, if it is
+ /// known that the `current` state ID is untagged, then these checks can be
+ /// omitted.
+ ///
+ /// Since this routine does not compute states on the fly, it does not
+ /// modify the cache and thus cannot return an error. Consequently, `cache`
+ /// does not need to be mutable and it is possible for this routine to
+ /// return a state ID corresponding to the special "unknown" state. In
+ /// this case, it is the caller's responsibility to use the prior state
+ /// ID and `input` with `next_state` in order to force the computation of
+ /// the unknown transition. Otherwise, trying to use the "unknown" state
+ /// ID will just result in transitioning back to itself, and thus never
+ /// terminating. (This is technically a special exemption to the state ID
+ /// validity rules, but is permissible since this routine is guaranteed to
+ /// never mutate the given `cache`, and thus the identifier is guaranteed
+ /// to remain valid.)
+ ///
+ /// See [`LazyStateID`] for more details on what it means for a state ID
+ /// to be tagged. Also, see
+ /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
+ /// for this same idea, but with bounds checks forcefully elided.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is an **untagged** lazy
+ /// state ID returned by the most recent call to `next_state`,
+ /// `next_state_untagged`, `next_state_untagged_unchecked`,
+ /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+ /// Any state ID returned from prior calls to these routines (with the
+ /// same `cache`) is considered invalid (even if it gives an appearance
+ /// of working). State IDs returned from _any_ prior call for different
+ /// `cache` values are also always invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID, although it may be tagged. Moreover, this routine is defined for
+ /// all possible values of `input`.
+ ///
+ /// Not all validity rules are checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid or
+ /// incorrect ID.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a lazy DFA for a given
+ /// haystack by using the `next_state_untagged` method where possible.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // Walk all the bytes in the haystack.
+ /// let mut at = 0;
+ /// while at < haystack.len() {
+ /// if sid.is_tagged() {
+ /// sid = dfa.next_state(&mut cache, sid, haystack[at])?;
+ /// at += 1;
+ /// } else {
+ /// let mut prev_sid = sid;
+ /// // We attempt to chew through as much as we can while moving
+ /// // through untagged state IDs. Thus, the transition function
+ /// // does less work on average per byte. (Unrolling this loop
+ /// // may help even more.)
+ /// while at < haystack.len() {
+ /// prev_sid = sid;
+ /// sid = dfa.next_state_untagged(
+ /// &mut cache, sid, haystack[at],
+ /// );
+ /// at += 1;
+ /// if sid.is_tagged() {
+ /// break;
+ /// }
+ /// }
+ /// // We must ensure that we never proceed to the next iteration
+ /// // with an unknown state ID. If we don't account for this
+ /// // case, then search isn't guaranteed to terminate since all
+ /// // transitions on unknown states loop back to itself.
+ /// if sid.is_unknown() {
+ /// sid = dfa.next_state(
+ /// &mut cache, prev_sid, haystack[at - 1],
+ /// )?;
+ /// }
+ /// }
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+ /// // special "EOI" transition at the end of the search.
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn next_state_untagged(
+ &self,
+ cache: &Cache,
+ current: LazyStateID,
+ input: u8,
+ ) -> LazyStateID {
+ debug_assert!(!current.is_tagged());
+ let class = usize::from(self.classes.get(input));
+ let offset = current.as_usize_unchecked() + class;
+ cache.trans[offset]
+ }
+
+ /// Transitions from the current state to the next state, eliding bounds
+ /// checks, given the next byte of input and a state ID that is not tagged.
+ ///
+ /// The only reason to use this routine is performance. In particular, the
+ /// `next_state` method needs to do some additional checks, among them is
+ /// to account for identifiers to states that are not yet computed. In
+ /// such a case, the transition is computed on the fly. However, if it is
+ /// known that the `current` state ID is untagged, then these checks can be
+ /// omitted.
+ ///
+ /// Since this routine does not compute states on the fly, it does not
+ /// modify the cache and thus cannot return an error. Consequently, `cache`
+ /// does not need to be mutable and it is possible for this routine to
+ /// return a state ID corresponding to the special "unknown" state. In
+ /// this case, it is the caller's responsibility to use the prior state
+ /// ID and `input` with `next_state` in order to force the computation of
+ /// the unknown transition. Otherwise, trying to use the "unknown" state
+ /// ID will just result in transitioning back to itself, and thus never
+ /// terminating. (This is technically a special exemption to the state ID
+ /// validity rules, but is permissible since this routine is guaranteed to
+ /// never mutate the given `cache`, and thus the identifier is guaranteed
+ /// to remain valid.)
+ ///
+ /// See [`LazyStateID`] for more details on what it means for a state ID
+ /// to be tagged. Also, see
+ /// [`next_state_untagged`](DFA::next_state_untagged)
+ /// for this same idea, but with memory safety guaranteed by retaining
+ /// bounds checks.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is an **untagged** lazy
+ /// state ID returned by the most recent call to `next_state`,
+ /// `next_state_untagged`, `next_state_untagged_unchecked`,
+ /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+ /// Any state ID returned from prior calls to these routines (with the
+ /// same `cache`) is considered invalid (even if it gives an appearance
+ /// of working). State IDs returned from _any_ prior call for different
+ /// `cache` values are also always invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID, although it may be tagged. Moreover, this routine is defined for
+ /// all possible values of `input`.
+ ///
+ /// Not all validity rules are checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Safety
+ ///
+ /// Callers of this method must guarantee that `current` refers to a valid
+ /// state ID according to the rules described above. If `current` is not a
+ /// valid state ID for this automaton, then calling this routine may result
+ /// in undefined behavior.
+ ///
+ /// If `current` is valid, then the ID returned is valid for all possible
+ /// values of `input`.
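+ ///
+ /// # Example
+ ///
+ /// A minimal sketch (under the validity rules above) that walks a
+ /// haystack with the unchecked transition function, falling back to
+ /// [`DFA::next_state`] whenever the state is tagged or the transition
+ /// is unknown:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// let mut at = 0;
+ /// while at < haystack.len() {
+ /// if sid.is_tagged() {
+ /// sid = dfa.next_state(&mut cache, sid, haystack[at])?;
+ /// } else {
+ /// // SAFETY: 'sid' is untagged and is the most recently returned
+ /// // state ID for this cache, so it is valid here.
+ /// let next = unsafe {
+ /// dfa.next_state_untagged_unchecked(&cache, sid, haystack[at])
+ /// };
+ /// if next.is_unknown() {
+ /// // The transition hasn't been computed yet, so use the
+ /// // checked API to compute and cache it.
+ /// sid = dfa.next_state(&mut cache, sid, haystack[at])?;
+ /// } else {
+ /// sid = next;
+ /// }
+ /// }
+ /// at += 1;
+ /// }
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```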
+ #[inline]
+ pub unsafe fn next_state_untagged_unchecked(
+ &self,
+ cache: &Cache,
+ current: LazyStateID,
+ input: u8,
+ ) -> LazyStateID {
+ debug_assert!(!current.is_tagged());
+ let class = usize::from(self.classes.get(input));
+ let offset = current.as_usize_unchecked() + class;
+ *cache.trans.get_unchecked(offset)
+ }
+
+ /// Transitions from the current state to the next state for the special
+ /// EOI symbol.
+ ///
+ /// The given cache is used to either reuse pre-computed state
+ /// transitions, or to store this newly computed transition for future
+ /// reuse. Thus, this routine guarantees that it will never return a state
+ /// ID that has an "unknown" tag.
+ ///
+ /// A correct search implementation must call this routine at the end of
+ /// every search. Namely, lazy DFAs in this crate delay matches
+ /// by one byte in order to support look-around operators. Thus, after
+ /// reaching the end of a haystack, a search implementation must follow one
+ /// last EOI transition.
+ ///
+ /// It is best to think of EOI as an additional symbol in the alphabet of a
+ /// DFA that is distinct from every other symbol. That is, the alphabet of
+ /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
+ /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+ /// physical alphabet size may be smaller because of alphabet compression
+ /// via equivalence classes, but EOI is always represented somehow in the
+ /// alphabet.)
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is the lazy state ID returned
+ /// by the most recent call to `next_state`, `next_state_untagged`,
+ /// `next_state_untagged_unchecked`, `start_state_forward` or
+ /// `start_state_reverse` for the given `cache`. Any state ID returned from
+ /// prior calls to these routines (with the same `cache`) is considered
+ /// invalid (even if it gives an appearance of working). State IDs returned
+ /// from _any_ prior call for different `cache` values are also always
+ /// invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID.
+ ///
+ /// These validity rules are not checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid or
+ /// incorrect ID.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a DFA for a given haystack,
+ /// and then finishing the search with the final EOI transition.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// sid = dfa.next_state(&mut cache, sid, b)?;
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk
+ /// // the special "EOI" transition at the end of the search. Without this
+ /// // final transition, the assert below will fail since the DFA will not
+ /// // have entered a match state yet!
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn next_eoi_state(
+ &self,
+ cache: &mut Cache,
+ current: LazyStateID,
+ ) -> Result<LazyStateID, CacheError> {
+ let eoi = self.classes.eoi().as_usize();
+ let offset = current.as_usize_untagged() + eoi;
+ let sid = cache.trans[offset];
+ if !sid.is_unknown() {
+ return Ok(sid);
+ }
+ let unit = self.classes.eoi();
+ Lazy::new(self, cache).cache_next_state(current, unit)
+ }
+
+ /// Return the ID of the start state for this lazy DFA when executing a
+ /// forward search.
+ ///
+ /// Unlike typical DFA implementations, the start state for DFAs in this
+ /// crate is dependent on a few different factors:
+ ///
+ /// * The pattern ID, if present. When the underlying DFA has been
+ /// configured with multiple patterns _and_ the DFA has been configured to
+ /// build an anchored start state for each pattern, then a pattern ID may
+ /// be specified to execute an anchored search for that specific pattern.
+ /// If `pattern_id` is invalid or if the DFA isn't configured to build
+ /// start states for each pattern, then implementations must panic. DFAs in
+ /// this crate can be configured to build start states for each pattern via
+ /// [`Config::starts_for_each_pattern`].
+ /// * When `start > 0`, the byte at index `start - 1` may influence the
+ /// start state if the regex uses `^` or `\b`.
+ /// * Similarly, when `start == 0`, it may influence the start state when
+ /// the regex uses `^` or `\A`.
+ /// * Currently, `end` is unused.
+ /// * Whether the search is a forward or reverse search. This routine can
+ /// only be used for forward searches.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
+ /// also panics if `pattern_id` is non-None and does not refer to a valid
+ /// pattern, or if the DFA was not configured to build anchored start
+ /// states for each pattern.
+ #[inline]
+ pub fn start_state_forward(
+ &self,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<LazyStateID, CacheError> {
+ let mut lazy = Lazy::new(self, cache);
+ let start_type = Start::from_position_fwd(bytes, start, end);
+ let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
+ if !sid.is_unknown() {
+ return Ok(sid);
+ }
+ lazy.cache_start_group(pattern_id, start_type)
+ }
+
+ /// Return the ID of the start state for this lazy DFA when executing a
+ /// reverse search.
+ ///
+ /// Unlike typical DFA implementations, the start state for DFAs in this
+ /// crate is dependent on a few different factors:
+ ///
+ /// * The pattern ID, if present. When the underlying DFA has been
+ /// configured with multiple patterns _and_ the DFA has been configured to
+ /// build an anchored start state for each pattern, then a pattern ID may
+ /// be specified to execute an anchored search for that specific pattern.
+ /// If `pattern_id` is invalid or if the DFA isn't configured to build
+ /// start states for each pattern, then implementations must panic. DFAs in
+ /// this crate can be configured to build start states for each pattern via
+ /// [`Config::starts_for_each_pattern`].
+ /// * When `end < bytes.len()`, the byte at index `end` may influence the
+ /// start state if the regex uses `$` or `\b`.
+ /// * Similarly, when `end == bytes.len()`, it may influence the start
+ /// state when the regex uses `$` or `\z`.
+ /// * Currently, `start` is unused.
+ /// * Whether the search is a forward or reverse search. This routine can
+ /// only be used for reverse searches.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
+ /// also panics if `pattern_id` is non-None and does not refer to a valid
+ /// pattern, or if the DFA was not configured to build anchored start
+ /// states for each pattern.
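+ ///
+ /// # Example
+ ///
+ /// A minimal sketch that walks a haystack in reverse, assuming a lazy
+ /// DFA compiled in reverse via the Thompson NFA `reverse` option:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson};
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// let mut sid = dfa.start_state_reverse(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // A reverse search walks the haystack from the end to the start.
+ /// for &b in haystack.iter().rev() {
+ /// sid = dfa.next_state(&mut cache, sid, b)?;
+ /// }
+ /// // Matches are delayed by one byte, so walk the EOI transition too.
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```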
+ #[inline]
+ pub fn start_state_reverse(
+ &self,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<LazyStateID, CacheError> {
+ let mut lazy = Lazy::new(self, cache);
+ let start_type = Start::from_position_rev(bytes, start, end);
+ let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
+ if !sid.is_unknown() {
+ return Ok(sid);
+ }
+ lazy.cache_start_group(pattern_id, start_type)
+ }
+
+ /// Returns the total number of patterns that match in this state.
+ ///
+ /// If the lazy DFA was compiled with one pattern, then this must
+ /// necessarily always return `1` for all match states.
+ ///
+ /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
+ /// indices up to (but not including) the count returned by this routine
+ /// without panicking.
+ ///
+ /// If the given state is not a match state, then this may either panic
+ /// or return an incorrect result.
+ ///
+ /// # Example
+ ///
+ /// This example shows a simple instance of implementing overlapping
+ /// matches. In particular, it shows not only how to determine how many
+ /// patterns have matched in a particular state, but also how to access
+ /// which specific patterns have matched.
+ ///
+ /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All)
+ /// when building the DFA. If we used
+ /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
+ /// instead, then the DFA would not be constructed in a way that supports
+ /// overlapping matches. (It would only report a single pattern that
+ /// matches at any particular point in time.)
+ ///
+ /// Another thing to take note of is the patterns used and the order in
+ /// which the pattern IDs are reported. In the example below, pattern `3`
+ /// is yielded first. Why? Because it corresponds to the match that
+ /// appears first. Namely, the `@` symbol is part of `\S+` but not part
+ /// of any of the other patterns. Since the `\S+` pattern has a match that
+ /// starts to the left of any other pattern, its ID is returned before any
+ /// other.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, MatchKind};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[
+ /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
+ /// ])?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "@bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// sid = dfa.next_state(&mut cache, sid, b)?;
+ /// }
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ ///
+ /// assert!(sid.is_match());
+ /// assert_eq!(dfa.match_count(&mut cache, sid), 3);
+ /// // The following calls are guaranteed to not panic since `match_count`
+ /// // returned `3` above.
+ /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
+ /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
+ /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn match_count(&self, cache: &Cache, id: LazyStateID) -> usize {
+ assert!(id.is_match());
+ LazyRef::new(self, cache).get_cached_state(id).match_count()
+ }
+
+ /// Returns the pattern ID corresponding to the given match index in the
+ /// given state.
+ ///
+ /// See [`DFA::match_count`] for an example of how to use this method
+ /// correctly. Note that if you know your lazy DFA is configured with a
+ /// single pattern, then this routine is never necessary since it will
+ /// always return a pattern ID of `0` for an index of `0` when `id`
+ /// corresponds to a match state.
+ ///
+ /// Typically, this routine is used when implementing an overlapping
+ /// search, as the example for `DFA::match_count` does.
+ ///
+ /// # Panics
+ ///
+ /// If the state ID is not a match state or if the match index is out
+ /// of bounds for the given state, then this routine may either panic
+ /// or produce an incorrect result. If the state ID is correct and the
+ /// match index is correct, then this routine always produces a valid
+ /// `PatternID`.
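+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the single pattern case, where the only possible
+ /// pattern ID is `0`:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new("foo")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "foo".as_bytes();
+ ///
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// for &b in haystack {
+ /// sid = dfa.next_state(&mut cache, sid, b)?;
+ /// }
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ ///
+ /// assert!(sid.is_match());
+ /// assert_eq!(dfa.match_pattern(&cache, sid, 0).as_usize(), 0);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```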
+ #[inline]
+ pub fn match_pattern(
+ &self,
+ cache: &Cache,
+ id: LazyStateID,
+ match_index: usize,
+ ) -> PatternID {
+ // This is an optimization for the very common case of a DFA with a
+ // single pattern. This conditional avoids a somewhat more costly path
+ // that finds the pattern ID from the corresponding `State`, which
+ // requires a bit of slicing/pointer-chasing. This optimization tends
+ // to only matter when matches are frequent.
+ if self.pattern_count() == 1 {
+ return PatternID::ZERO;
+ }
+ LazyRef::new(self, cache)
+ .get_cached_state(id)
+ .match_pattern(match_index)
+ }
+}
+
+/// A cache represents a partially computed DFA.
+///
+/// A cache is the key component that differentiates a classical DFA and a
+/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
+/// complete transition table that can handle all possible inputs, a hybrid
+/// NFA/DFA starts with an empty transition table and builds only the parts
+/// required during search. The parts that are built are stored in a cache. For
+/// this reason, a cache is a required parameter for nearly every operation on
+/// a [`DFA`].
+///
+/// Caches can be created from their corresponding DFA via
+/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
+/// created it, or the DFA that was most recently used to reset it with
+/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
+/// or incorrect results.
+#[derive(Clone, Debug)]
+pub struct Cache {
+ // N.B. If you're looking to understand how determinization works, it
+ // is probably simpler to first grok src/dfa/determinize.rs, since that
+ // doesn't have the "laziness" component.
+ /// The transition table.
+ ///
+ /// Given a `current` LazyStateID and an `input` byte, the next state can
+ /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
+ /// that no multiplication is used. That's because state identifiers are
+ /// "premultiplied."
+ ///
+ /// Note that the next state may be the "unknown" state. In this case, the
+ /// next state is not known and determinization for `current` on `input`
+ /// must be performed.
+ trans: Vec<LazyStateID>,
+ /// The starting states for this DFA.
+ ///
+ /// These are computed lazily. Initially, these are all set to "unknown"
+ /// lazy state IDs.
+ ///
+ /// When 'starts_for_each_pattern' is disabled (the default), then the size
+ /// of this is constrained to the possible starting configurations based
+ /// on the search parameters. (At time of writing, that's 4.) However,
+ /// when starting states for each pattern is enabled, then there are N
+ /// additional groups of starting states, where each group reflects the
+ /// different possible configurations and N is the number of patterns.
+ starts: Vec<LazyStateID>,
+ /// A sequence of NFA/DFA powerset states that have been computed for this
+ /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
+ /// tagged LazyStateID can be used to index this sequence by converting it
+ /// to its untagged form.)
+ states: Vec<State>,
+ /// A map from states to their corresponding IDs. This map may be accessed
+ /// via the raw byte representation of a state, which means that a `State`
+ /// does not need to be allocated to determine whether it already exists
+ /// in this map. Indeed, the existence of such a state is what determines
+ /// whether we allocate a new `State` or not.
+ ///
+ /// The higher level idea here is that we do just enough determinization
+ /// for a state to check whether we've already computed it. If we have,
+ /// then we can save a little (albeit not much) work. The real savings is
+ /// in memory usage. If we never checked for trivially duplicate states,
+ /// then our memory usage would explode to unreasonable levels.
+ states_to_id: StateMap,
+ /// Sparse sets used to track which NFA states have been visited during
+ /// various traversals.
+ sparses: SparseSets,
+ /// Scratch space for traversing the NFA graph. (We use space on the heap
+ /// instead of the call stack.)
+ stack: Vec<NFAStateID>,
+ /// Scratch space for building a NFA/DFA powerset state. This is used to
+ /// help amortize allocation since not every powerset state generated is
+ /// added to the cache. In particular, if it already exists in the cache,
+ /// then there is no need to allocate a new `State` for it.
+ scratch_state_builder: StateBuilderEmpty,
+ /// A simple abstraction for handling the saving of at most a single state
+ /// across a cache clearing. This is required for correctness. Namely, if
+ /// adding a new state after clearing the cache fails, then the caller
+ /// must retain the ability to continue using the state ID given. The
+ /// state corresponding to the state ID is what we preserve across cache
+ /// clearings.
+ state_saver: StateSaver,
+ /// The amount of memory, in bytes, used by 'states' and 'states_to_id'. We
+ /// track this as new states are added since states use a variable amount
+ /// of heap. Tracking this as we add states makes it possible to compute
+ /// the total amount of memory used by the determinizer in constant time.
+ memory_usage_state: usize,
+ /// The number of times the cache has been cleared. When a minimum cache
+ /// clear count is set, the lazy DFA returns an error instead of clearing
+ /// the cache once this count has been exceeded.
+ clear_count: usize,
+}
+
+impl Cache {
+ /// Create a new cache for the given lazy DFA.
+ ///
+ /// The cache returned should only be used for searches for the given DFA.
+ /// If you want to reuse the cache for another DFA, then you must call
+ /// [`Cache::reset`] with that DFA.
+ pub fn new(dfa: &DFA) -> Cache {
+ let mut cache = Cache {
+ trans: alloc::vec![],
+ starts: alloc::vec![],
+ states: alloc::vec![],
+ states_to_id: StateMap::new(),
+ sparses: SparseSets::new(dfa.nfa.len()),
+ stack: alloc::vec![],
+ scratch_state_builder: StateBuilderEmpty::new(),
+ state_saver: StateSaver::none(),
+ memory_usage_state: 0,
+ clear_count: 0,
+ };
+ Lazy { dfa, cache: &mut cache }.init_cache();
+ cache
+ }
+
+ /// Reset this cache such that it can be used for searching with the given
+ /// lazy DFA (and only that DFA).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different lazy DFA.
+ ///
+ /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+ /// lazy DFA has been configured to "give up" after it has cleared the
+ /// cache a certain number of times.
+ ///
+ /// Any lazy state ID generated by the cache prior to resetting it is
+ /// invalid after the reset.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different DFA.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa1 = DFA::new(r"\w")?;
+ /// let dfa2 = DFA::new(r"\W")?;
+ ///
+ /// let mut cache = dfa1.create_cache();
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 2)),
+ /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+ /// );
+ ///
+ /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the DFA we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 'dfa1' is also not
+ /// // allowed.
+ /// cache.reset(&dfa2);
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 3)),
+ /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset(&mut self, dfa: &DFA) {
+ Lazy::new(dfa, self).reset_cache()
+ }
+
+ /// Returns the total number of times this cache has been cleared since it
+ /// was either created or last reset.
+ ///
+ /// This is useful for informational purposes or if you want to change
+ /// search strategies based on the number of times the cache has been
+ /// cleared.
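+ ///
+ /// # Example
+ ///
+ /// A small sketch: a freshly created cache has never been cleared.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+")?;
+ /// let cache = dfa.create_cache();
+ /// assert_eq!(0, cache.clear_count());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```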
+ pub fn clear_count(&self) -> usize {
+ self.clear_count
+ }
+
+ /// Returns the heap memory usage, in bytes, of this cache.
+ ///
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
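+ ///
+ /// # Example
+ ///
+ /// A small sketch: even a fresh cache uses some heap, since it always
+ /// stores a few sentinel states and lazily computed start state slots.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+")?;
+ /// let cache = dfa.create_cache();
+ /// assert!(cache.memory_usage() > 0);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```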
+ pub fn memory_usage(&self) -> usize {
+ const ID_SIZE: usize = size_of::<LazyStateID>();
+ const STATE_SIZE: usize = size_of::<State>();
+
+ self.trans.len() * ID_SIZE
+ + self.starts.len() * ID_SIZE
+ + self.states.len() * STATE_SIZE
+ // Maps likely use more memory than this, but it's probably close.
+ + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
+ + self.sparses.memory_usage()
+ + self.stack.capacity() * ID_SIZE
+ + self.scratch_state_builder.capacity()
+ // Heap memory used by 'State' in both 'states' and 'states_to_id'.
+ + self.memory_usage_state
+ }
+}
+
+/// A map from states to state identifiers. When using std, we use a standard
+/// hashmap, since it's a bit faster for this use case. (Other maps, like
+/// ones based on FNV, have not yet been benchmarked.)
+///
+/// The main purpose of this map is to reuse states where possible. This won't
+/// fully minimize the DFA, but it works well in a lot of cases.
+#[cfg(feature = "std")]
+type StateMap = std::collections::HashMap<State, LazyStateID>;
+#[cfg(not(feature = "std"))]
+type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
+
+/// A type that groups methods that require the base NFA/DFA and writable
+/// access to the cache.
+#[derive(Debug)]
+struct Lazy<'i, 'c> {
+ dfa: &'i DFA,
+ cache: &'c mut Cache,
+}
+
+impl<'i, 'c> Lazy<'i, 'c> {
+ /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
+ fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
+ Lazy { dfa, cache }
+ }
+
+ /// Return an immutable view by downgrading a writable cache to a read-only
+ /// cache.
+ fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
+ LazyRef::new(self.dfa, self.cache)
+ }
+
+ /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
+ /// like 'next_state' and 'next_eoi_state' that are called in critical
+ /// areas. The idea is to let the optimizer focus on the other areas of
+ /// those methods as the hot path.
+ ///
+ /// Here's an example that justifies 'inline(never)':
+ ///
+ /// ```ignore
+ /// regex-cli find hybrid dfa \
+ /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000
+ /// ```
+ ///
+ /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
+ /// codepoint, in sequence, repeated 100 times.
+ ///
+ /// With 'inline(never)' hyperfine reports 1.1s per run. With
+ /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
+ #[inline(never)]
+ fn cache_next_state(
+ &mut self,
+ mut current: LazyStateID,
+ unit: alphabet::Unit,
+ ) -> Result<LazyStateID, CacheError> {
+ let stride2 = self.dfa.stride2();
+ let empty_builder = self.get_state_builder();
+ let builder = determinize::next(
+ &self.dfa.nfa,
+ self.dfa.match_kind,
+ &mut self.cache.sparses,
+ &mut self.cache.stack,
+ &self.cache.states[current.as_usize_untagged() >> stride2],
+ unit,
+ empty_builder,
+ );
+ let save_state = !self.as_ref().state_builder_fits_in_cache(&builder);
+ if save_state {
+ self.save_state(current);
+ }
+ let next = self.add_builder_state(builder, |sid| sid)?;
+ if save_state {
+ current = self.saved_state_id();
+ }
+ // This is the payoff. The next time 'next_state' is called with this
+ // state and alphabet unit, it will find this transition and avoid
+ // having to re-determinize this transition.
+ self.set_transition(current, unit, next);
+ Ok(next)
+ }
+
+ /// Compute and cache the starting state for the given pattern ID (if
+ /// present) and the starting configuration.
+ ///
+ /// This panics if a pattern ID is given and the DFA isn't configured to
+ /// build anchored start states for each pattern.
+ ///
+ /// This will never return an unknown lazy state ID.
+ ///
+ /// If caching this state would otherwise result in a cache that has been
+ /// cleared too many times, then an error is returned.
+ fn cache_start_group(
+ &mut self,
+ pattern_id: Option<PatternID>,
+ start: Start,
+ ) -> Result<LazyStateID, CacheError> {
+ let nfa_start_id = match pattern_id {
+ Some(pid) => {
+ assert!(
+ self.dfa.starts_for_each_pattern,
+ "attempted to search for a specific pattern \
+ without enabling starts_for_each_pattern",
+ );
+ self.dfa.nfa.start_pattern(pid)
+ }
+ None if self.dfa.anchored => self.dfa.nfa.start_anchored(),
+ None => self.dfa.nfa.start_unanchored(),
+ };
+
+ let id = self.cache_start_one(nfa_start_id, start)?;
+ self.set_start_state(pattern_id, start, id);
+ Ok(id)
+ }
+
+ /// Compute and cache the starting state for the given NFA state ID and the
+ /// starting configuration. The NFA state ID might be one of the following:
+ ///
+ /// 1) An unanchored start state to match any pattern.
+ /// 2) An anchored start state to match any pattern.
+ /// 3) An anchored start state for a particular pattern.
+ ///
+ /// This will never return an unknown lazy state ID.
+ ///
+ /// If caching this state would otherwise result in a cache that has been
+ /// cleared too many times, then an error is returned.
+ fn cache_start_one(
+ &mut self,
+ nfa_start_id: NFAStateID,
+ start: Start,
+ ) -> Result<LazyStateID, CacheError> {
+ let mut builder_matches = self.get_state_builder().into_matches();
+ determinize::set_lookbehind_from_start(&start, &mut builder_matches);
+ self.cache.sparses.set1.clear();
+ determinize::epsilon_closure(
+ self.dfa.nfa.borrow(),
+ nfa_start_id,
+ *builder_matches.look_have(),
+ &mut self.cache.stack,
+ &mut self.cache.sparses.set1,
+ );
+ let mut builder = builder_matches.into_nfa();
+ determinize::add_nfa_states(
+ self.dfa.nfa.borrow(),
+ &self.cache.sparses.set1,
+ &mut builder,
+ );
+ self.add_builder_state(builder, |id| id.to_start())
+ }
+
+ /// Either add the given builder state to this cache, or return an ID to an
+ /// equivalent state already in this cache.
+ ///
+ /// In the case where no equivalent state exists, the idmap function given
+ /// may be used to transform the identifier allocated. This is useful if
+ /// the caller needs to tag the ID with additional information.
+ ///
+ /// This will never return an unknown lazy state ID.
+ ///
+ /// If caching this state would otherwise result in a cache that has been
+ /// cleared too many times, then an error is returned.
+ fn add_builder_state(
+ &mut self,
+ builder: StateBuilderNFA,
+ idmap: impl Fn(LazyStateID) -> LazyStateID,
+ ) -> Result<LazyStateID, CacheError> {
+ if let Some(&cached_id) =
+ self.cache.states_to_id.get(builder.as_bytes())
+ {
+ // Since we have a cached state, put the constructed state's
+ // memory back into our scratch space, so that it can be reused.
+ self.put_state_builder(builder);
+ return Ok(cached_id);
+ }
+ let result = self.add_state(builder.to_state(), idmap);
+ self.put_state_builder(builder);
+ result
+ }
+
+ /// Allocate a new state ID and add the given state to this cache.
+ ///
+ /// The idmap function given may be used to transform the identifier
+ /// allocated. This is useful if the caller needs to tag the ID with
+ /// additional information.
+ ///
+ /// This will never return an unknown lazy state ID.
+ ///
+ /// If caching this state would otherwise result in a cache that has been
+ /// cleared too many times, then an error is returned.
+ fn add_state(
+ &mut self,
+ state: State,
+ idmap: impl Fn(LazyStateID) -> LazyStateID,
+ ) -> Result<LazyStateID, CacheError> {
+ if !self.as_ref().state_fits_in_cache(&state) {
+ self.try_clear_cache()?;
+ }
+ // It's important for this to come second, since the above may clear
+ // the cache. If we clear the cache after ID generation, then the ID
+ // is likely bunk since it would have been generated based on a larger
+ // transition table.
+ let mut id = idmap(self.next_state_id()?);
+ if state.is_match() {
+ id = id.to_match();
+ }
+ // Add room in the transition table. Since this is a fresh state, all
+ // of its transitions are unknown.
+ self.cache.trans.extend(
+ iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
+ );
+ // When we add a sentinel state, we never want to set any quit
+ // transitions. Technically, this is harmless, since sentinel states
+ // have all of their transitions set to loop back to themselves. But
+ // when creating sentinel states before the quit sentinel state,
+ // this will try to call 'set_transition' on a state ID that doesn't
+ // actually exist yet, which isn't allowed. So we just skip doing so
+ // entirely.
+ if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
+ let quit_id = self.as_ref().quit_id();
+ for b in self.dfa.quitset.iter() {
+ self.set_transition(id, alphabet::Unit::u8(b), quit_id);
+ }
+ }
+ self.cache.memory_usage_state += state.memory_usage();
+ self.cache.states.push(state.clone());
+ self.cache.states_to_id.insert(state, id);
+ Ok(id)
+ }
+
+ /// Allocate a new state ID.
+ ///
+ /// This will never return an unknown lazy state ID.
+ ///
+ /// If caching this state would otherwise result in a cache that has been
+ /// cleared too many times, then an error is returned.
+ fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
+ let sid = match LazyStateID::new(self.cache.trans.len()) {
+ Ok(sid) => sid,
+ Err(_) => {
+ self.try_clear_cache()?;
+ // This has to pass since we check that ID capacity at
+ // construction time can fit at least MIN_STATES states.
+ LazyStateID::new(self.cache.trans.len()).unwrap()
+ }
+ };
+ Ok(sid)
+ }
+
+ /// Attempt to clear the cache used by this lazy DFA.
+ ///
+ /// If clearing the cache exceeds the minimum number of required cache
+ /// clearings, then this will return a cache error. In this case,
+ /// callers should bubble this up as the cache can't be used until it is
+ /// reset. Implementations of search should convert this error into a
+ /// `MatchError::GaveUp`.
+ ///
+ /// If 'self.state_saver' is set to save a state, then this state is
+ /// persisted through cache clearing. Otherwise, the cache is returned to
+ /// its state after initialization with two exceptions: its clear count
+ /// is incremented and some of its memory likely has additional capacity.
+ /// That is, clearing a cache does _not_ release memory.
+ ///
+ /// In either case, any lazy state ID generated by the cache prior to
+ /// clearing it is invalid afterwards.
+ fn try_clear_cache(&mut self) -> Result<(), CacheError> {
+ // Currently, the only heuristic we use is the minimum cache clear
+ // count. If we pass that minimum, then we give up.
+ //
+ // It would be good to also add a heuristic based on "bytes searched
+ // per generated state," but this requires API design work. Namely,
+ // we really do not want to add a counter increment to the transition
+ // function, which implies we need to expose APIs to update the number
+ // of bytes searched by implementers of the search routines. And that
+ // doesn't seem great... But we should do it if this heuristic isn't
+ // enough. (The original lazy DFA implementation in the 'regex' crate
+ // had this heuristic, since the lazy DFA was coupled with the search
+ // routines.)
+ if let Some(min_count) = self.dfa.minimum_cache_clear_count {
+ if self.cache.clear_count >= min_count {
+ return Err(CacheError::too_many_cache_clears());
+ }
+ }
+ self.clear_cache();
+ Ok(())
+ }
+
+ /// Clears _and_ resets the cache. Resetting the cache means that no
+ /// states are persisted and the clear count is reset to 0. No heap memory
+ /// is released.
+ ///
+ /// Note that the caller may reset a cache with a different DFA than what
+ /// it was created from. In which case, the cache can now be used with the
+ /// new DFA (and not the old DFA).
+ fn reset_cache(&mut self) {
+ self.cache.state_saver = StateSaver::none();
+ self.clear_cache();
+ // If a new DFA is used, it might have a different number of NFA
+ // states, so we need to make sure our sparse sets have the appropriate
+ // size.
+ self.cache.sparses.resize(self.dfa.nfa.len());
+ self.cache.clear_count = 0;
+ }
+
+ /// Clear the cache used by this lazy DFA. Unlike 'try_clear_cache', this
+ /// is infallible: it does not consult the minimum cache clear count, so
+ /// it always succeeds.
+ ///
+ /// If 'self.state_saver' is set to save a state, then this state is
+ /// persisted through cache clearing. Otherwise, the cache is returned to
+ /// its state after initialization with two exceptions: its clear count
+ /// is incremented and some of its memory likely has additional capacity.
+ /// That is, clearing a cache does _not_ release memory.
+ ///
+ /// In either case, any lazy state ID generated by the cache prior to
+ /// clearing it is invalid afterwards.
+ fn clear_cache(&mut self) {
+ self.cache.trans.clear();
+ self.cache.starts.clear();
+ self.cache.states.clear();
+ self.cache.states_to_id.clear();
+ self.cache.memory_usage_state = 0;
+ self.cache.clear_count += 1;
+ trace!(
+ "lazy DFA cache has been cleared (count: {})",
+ self.cache.clear_count
+ );
+ self.init_cache();
+ // If the state we want to save is one of the sentinel
+ // (unknown/dead/quit) states, then 'init_cache' adds those back, and
+ // their identifier values remain invariant. So there's no need to add
+ // it again. (And indeed, doing so would be incorrect!)
+ if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
+ // If the state is one of the special sentinel states, then it is
+ // automatically added by cache initialization and its ID always
+ // remains the same. With that said, this should never occur since
+ // the sentinel states are all loop states back to themselves. So
+ // we should never be in a position where we're attempting to save
+ // a sentinel state since we never compute transitions out of a
+ // sentinel state.
+ assert!(
+ !self.as_ref().is_sentinel(old_id),
+ "cannot save sentinel state"
+ );
+ let new_id = self
+ .add_state(state, |id| {
+ if old_id.is_start() {
+ id.to_start()
+ } else {
+ id
+ }
+ })
+ // The unwrap here is OK because lazy DFA creation ensures that
+ // we have room in the cache to add MIN_STATES states. Since
+ // 'init_cache' above adds 3, this adds a 4th.
+ .expect("adding one state after cache clear must work");
+ self.cache.state_saver = StateSaver::Saved(new_id);
+ }
+ }
+
+ /// Initialize this cache from emptiness to a place where it can be used
+ /// for search.
+ ///
+ /// This is called both at cache creation time and after the cache has been
+ /// cleared.
+ ///
+ /// Primarily, this adds the three sentinel states and allocates some
+ /// initial memory.
+ fn init_cache(&mut self) {
+ let mut starts_len = Start::count();
+ if self.dfa.starts_for_each_pattern {
+ starts_len += Start::count() * self.dfa.pattern_count();
+ }
+ self.cache
+ .starts
+ .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
+ // This is the set of NFA states that corresponds to each of our three
+ // sentinel states: the empty set.
+ let dead = State::dead();
+ // This sets up some states that we use as sentinels that are present
+ // in every DFA. While it would be technically possible to implement
+ // this DFA without explicitly putting these states in the transition
+ // table, doing so is convenient: it makes `next_state` correct for all
+ // valid state IDs without needing explicit conditionals to special
+ // case these sentinel states.
+ //
+ // All three of these states are "dead" states. That is, all of
+ // them transition only to themselves. So once you enter one of
+ // these states, it's impossible to leave them. Thus, any correct
+ // search routine must explicitly check for these state types. (Sans
+ // `unknown`, since that is only used internally to represent missing
+ // states.)
+ let unk_id =
+ self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
+ let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
+ let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
+ assert_eq!(unk_id, self.as_ref().unknown_id());
+ assert_eq!(dead_id, self.as_ref().dead_id());
+ assert_eq!(quit_id, self.as_ref().quit_id());
+ // The idea here is that if you start in an unknown/dead/quit state and
+ // try to transition out of it, then you should end up where you started.
+ self.set_all_transitions(unk_id, unk_id);
+ self.set_all_transitions(dead_id, dead_id);
+ self.set_all_transitions(quit_id, quit_id);
+ // All of these states are technically equivalent from the FSM
+ // perspective, so putting all three of them in the state map isn't
+ // possible: the map is keyed by the state itself, and all three share
+ // the same (empty) dead state. (They are distinct merely because we
+ // use their identifiers as sentinels to mean something, as indicated
+ // by the names.) Moreover, we wouldn't want to do that. Unknown and
+ // quit states are special in that they are artificial constructions
+ // of this implementation. But dead states are a natural part of
+ // determinization. When you reach a point in the NFA where you cannot
+ // go anywhere else, a dead state will naturally arise and we MUST
+ // reuse the canonical dead state that we've created here. Why? Because
+ // it is the state ID that tells the search routine whether a state is
+ // dead or not, and thus, whether to stop the search. Having a bunch of
+ // distinct dead states would be quite wasteful!
+ self.cache.states_to_id.insert(dead, dead_id);
+ }
+
+ /// Save the state corresponding to the ID given such that the state
+ /// persists through a cache clearing.
+ ///
+ /// While the state may persist, the ID may not. In order to discover the
+ /// new state ID, one must call 'saved_state_id' after a cache clearing.
+ fn save_state(&mut self, id: LazyStateID) {
+ let state = self.as_ref().get_cached_state(id).clone();
+ self.cache.state_saver = StateSaver::ToSave { id, state };
+ }
+
+ /// Returns the updated lazy state ID for a state that was persisted
+ /// through a cache clearing.
+ ///
+ /// It is only correct to call this routine when both a state has been
+ /// saved and the cache has just been cleared. Otherwise, this panics.
+ fn saved_state_id(&mut self) -> LazyStateID {
+ self.cache
+ .state_saver
+ .take_saved()
+ .expect("state saver does not have saved state ID")
+ }
+
+ /// Set all transitions on the state 'from' to 'to'.
+ fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
+ for unit in self.dfa.classes.representatives() {
+ self.set_transition(from, unit, to);
+ }
+ }
+
+ /// Set the transition on 'from' for 'unit' to 'to'.
+ ///
+ /// This panics if either 'from' or 'to' is invalid.
+ ///
+ /// All unit values are OK.
+ fn set_transition(
+ &mut self,
+ from: LazyStateID,
+ unit: alphabet::Unit,
+ to: LazyStateID,
+ ) {
+ assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from);
+ assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to);
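+ // A valid lazy state ID is a pre-multiplied offset into the transition
+ // table (i.e., already multiplied by the DFA's stride), so adding the
+ // unit's equivalence class index selects the correct slot.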
+ let offset =
+ from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
+ self.cache.trans[offset] = to;
+ }
+
+ /// Set the start ID for the given pattern ID (if given) and starting
+ /// configuration to the ID given.
+ ///
+ /// This panics if 'id' is not valid or if a pattern ID is given and
+ /// 'starts_for_each_pattern' is not enabled.
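+ ///
+ /// For illustration, assuming `Start::count()` is 4 and the DFA has two
+ /// patterns: slots `0..4` of the start table hold the starting states
+ /// used when searching for any pattern, slots `4..8` hold the anchored
+ /// starting states for pattern `0` and slots `8..12` hold them for
+ /// pattern `1`.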
+ fn set_start_state(
+ &mut self,
+ pattern_id: Option<PatternID>,
+ start: Start,
+ id: LazyStateID,
+ ) {
+ assert!(self.as_ref().is_valid(id));
+ let start_index = start.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => {
+ assert!(
+ self.dfa.starts_for_each_pattern,
+ "attempted to search for a specific pattern \
+ without enabling starts_for_each_pattern",
+ );
+ let pid = pid.as_usize();
+ Start::count() + (Start::count() * pid) + start_index
+ }
+ };
+ self.cache.starts[index] = id;
+ }
+
+ /// Returns a state builder from this DFA that might have existing
+ /// capacity. This helps avoid allocs in cases where a state is built that
+ /// turns out to already be cached.
+ ///
+ /// Callers must put the state builder back with 'put_state_builder',
+ /// otherwise the allocation reuse won't work.
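+ ///
+ /// A sketch of the intended round trip (hypothetical caller code; the
+ /// builder changes type as it moves through its build stages):
+ ///
+ ///     let builder = self.get_state_builder();
+ ///     // ... fill the builder, converting it through its stages ...
+ ///     self.put_state_builder(builder_nfa);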
+ fn get_state_builder(&mut self) -> StateBuilderEmpty {
+ core::mem::replace(
+ &mut self.cache.scratch_state_builder,
+ StateBuilderEmpty::new(),
+ )
+ }
+
+ /// Puts the given state builder back into this DFA for reuse.
+ ///
+ /// Note that building a 'State' from a builder always creates a new alloc,
+ /// so callers should always put the builder back.
+ fn put_state_builder(&mut self, builder: StateBuilderNFA) {
+ let _ = core::mem::replace(
+ &mut self.cache.scratch_state_builder,
+ builder.clear(),
+ );
+ }
+}
+
+/// A type that groups methods that require the base NFA/DFA and read-only
+/// access to the cache.
+#[derive(Debug)]
+struct LazyRef<'i, 'c> {
+ dfa: &'i DFA,
+ cache: &'c Cache,
+}
+
+impl<'i, 'c> LazyRef<'i, 'c> {
+ /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
+ fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
+ LazyRef { dfa, cache }
+ }
+
+ /// Return the ID of the start state for the given configuration.
+ ///
+ /// If the start state has not yet been computed, then this returns an
+ /// unknown lazy state ID.
+ fn get_cached_start_id(
+ &self,
+ pattern_id: Option<PatternID>,
+ start: Start,
+ ) -> LazyStateID {
+ let start_index = start.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => {
+ let pid = pid.as_usize();
+ assert!(
+ pid < self.dfa.pattern_count(),
+ "invalid pattern ID: {:?}",
+ pid
+ );
+ Start::count() + (Start::count() * pid) + start_index
+ }
+ };
+ self.cache.starts[index]
+ }
+
+ /// Return the cached NFA/DFA powerset state for the given ID.
+ ///
+ /// This panics if the given ID does not address a valid state.
+ fn get_cached_state(&self, sid: LazyStateID) -> &State {
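+ // Lazy state IDs are pre-multiplied by the stride, so shifting right
+ // by stride2 recovers the state's sequential index into 'states'.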
+ let index = sid.as_usize_untagged() >> self.dfa.stride2();
+ &self.cache.states[index]
+ }
+
+ /// Returns true if and only if the given ID corresponds to a "sentinel"
+ /// state.
+ ///
+ /// A sentinel state is a state that signifies a special condition of
+ /// search, and where every transition maps back to itself. See LazyStateID
+ /// for more details. Note that start and match states are _not_ sentinels
+ /// since they may otherwise be real states with non-trivial transitions.
+ /// The purpose of a sentinel state is purely to indicate something; its
+ /// transitions are not meant to be followed.
+ fn is_sentinel(&self, id: LazyStateID) -> bool {
+ id == self.unknown_id() || id == self.dead_id() || id == self.quit_id()
+ }
+
+ /// Returns the ID of the unknown state for this lazy DFA.
+ fn unknown_id(&self) -> LazyStateID {
+ // This unwrap is OK since 0 is always a valid state ID.
+ LazyStateID::new(0).unwrap().to_unknown()
+ }
+
+ /// Returns the ID of the dead state for this lazy DFA.
+ fn dead_id(&self) -> LazyStateID {
+ // This unwrap is OK since the maximum value here is 1 * 512 = 512,
+ // which is <= 2047 (the maximum state ID on 16-bit systems), where
+ // 512 is the worst case for our equivalence classes (every byte in
+ // its own distinct class).
+ LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
+ }
+
+ /// Returns the ID of the quit state for this lazy DFA.
+ fn quit_id(&self) -> LazyStateID {
+ // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
+ // which is <= 2047 (the maximum state ID on 16-bit systems), where
+ // 512 is the worst case for our equivalence classes (every byte in
+ // its own distinct class).
+ LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
+ }
+
+ /// Returns true if and only if the given ID is valid.
+ ///
+ /// An ID is valid if it is both a valid index into the transition table
+ /// and is a multiple of the DFA's stride.
+ fn is_valid(&self, id: LazyStateID) -> bool {
+ let id = id.as_usize_untagged();
+ id < self.cache.trans.len() && id % self.dfa.stride() == 0
+ }
+
+ /// Returns true if adding the state given would fit in this cache.
+ fn state_fits_in_cache(&self, state: &State) -> bool {
+ let needed = self.cache.memory_usage()
+ + self.memory_usage_for_one_more_state(state.memory_usage());
+ needed <= self.dfa.cache_capacity
+ }
+
+ /// Returns true if adding the state to be built by the given builder would
+ /// fit in this cache.
+ fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
+ let needed = self.cache.memory_usage()
+ + self.memory_usage_for_one_more_state(state.as_bytes().len());
+ needed <= self.dfa.cache_capacity
+ }
+
+ /// Returns the additional memory usage, in bytes, required to add one more
+ /// state to this cache. The given size should be the heap size, in bytes,
+ /// that would be used by the new state being added.
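+ ///
+ /// As a rough worked example (assuming a worst case stride of 512 and
+ /// 4-byte lazy state IDs), each new state costs at least 512 * 4 = 2048
+ /// bytes in the transition table alone, before counting the state
+ /// itself, its entry in the state-to-ID map and its heap memory.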
+ fn memory_usage_for_one_more_state(
+ &self,
+ state_heap_size: usize,
+ ) -> usize {
+ const ID_SIZE: usize = size_of::<LazyStateID>();
+ const STATE_SIZE: usize = size_of::<State>();
+
+ self.dfa.stride() * ID_SIZE // additional space needed in trans table
+ + STATE_SIZE // space in cache.states
+ + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
+ + state_heap_size // heap memory used by state itself
+ }
+}
+
+/// A simple type that encapsulates the saving of a state ID through a cache
+/// clearing.
+///
+ /// A state ID can be marked for saving with 'ToSave', and a state ID
+ /// that has been persisted is stored in 'Saved'.
+#[derive(Clone, Debug)]
+enum StateSaver {
+ /// An empty state saver. In this case, no states (other than the special
+ /// sentinel states) are preserved after clearing the cache.
+ None,
+ /// An ID of a state (and the state itself) that should be preserved after
+ /// the lazy DFA's cache has been cleared. After clearing, the updated ID
+ /// is stored in 'Saved' since it may have changed.
+ ToSave { id: LazyStateID, state: State },
+ /// The ID of a state that has been persisted through a lazy DFA
+ /// cache clearing. The ID recorded here corresponds to an ID that was
+ /// once marked as ToSave. The IDs are likely not equivalent even though
+ /// the states they point to are.
+ Saved(LazyStateID),
+}
+
+impl StateSaver {
+ /// Create an empty state saver.
+ fn none() -> StateSaver {
+ StateSaver::None
+ }
+
+ /// Replace this state saver with an empty saver, and if this saver is a
+ /// request to save a state, return that request.
+ fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
+ match core::mem::replace(self, StateSaver::None) {
+ StateSaver::None | StateSaver::Saved(_) => None,
+ StateSaver::ToSave { id, state } => Some((id, state)),
+ }
+ }
+
+ /// Replace this state saver with an empty saver, and if this saver is a
+ /// saved state (or a request to save a state), return that state's ID.
+ ///
+ /// The idea here is that a request to save a state isn't necessarily
+ /// honored because it might not be needed. For example, some higher level code
+ /// might request a state to be saved on the off chance that the cache gets
+ /// cleared when a new state is added at a lower level. But if that new
+ /// state is never added, then the cache is never cleared and the state and
+ /// its ID remain unchanged.
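+ ///
+ /// A sketch of the overall save protocol (hypothetical caller code):
+ ///
+ ///     lazy.save_state(id); // request that 'id' survive a clearing
+ ///     // ... adding a new state may or may not clear the cache ...
+ ///     let id = lazy.saved_state_id(); // the possibly-updated ID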
+ fn take_saved(&mut self) -> Option<LazyStateID> {
+ match core::mem::replace(self, StateSaver::None) {
+ StateSaver::None => None,
+ StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id),
+ }
+ }
+}
+
+/// The configuration used for building a lazy DFA.
+///
+/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
+/// advantage of the former is that it often lets you avoid importing the
+/// `Config` type directly.
+///
+/// A lazy DFA configuration is a simple data object that is typically used
+/// with [`Builder::configure`].
+///
+/// The default configuration guarantees that a search will _never_ return
+/// a [`MatchError`] for any haystack or pattern. Setting a quit byte with
+/// [`Config::quit`], enabling heuristic support for Unicode word boundaries
+/// with [`Config::unicode_word_boundary`], or setting a minimum cache clear
+/// count with [`Config::minimum_cache_clear_count`] can in turn cause a search
+/// to return an error. See the corresponding configuration options for more
+/// details on when those error conditions arise.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+ // As with other configuration types in this crate, we put all our knobs
+ // in options so that we can distinguish between "default" and "not set."
+ // This makes it possible to easily combine multiple configurations
+ // without default values overwriting explicitly specified values. See the
+ // 'overwrite' method.
+ //
+ // For docs on the fields below, see the corresponding method setters.
+ anchored: Option<bool>,
+ match_kind: Option<MatchKind>,
+ starts_for_each_pattern: Option<bool>,
+ byte_classes: Option<bool>,
+ unicode_word_boundary: Option<bool>,
+ quitset: Option<ByteSet>,
+ cache_capacity: Option<usize>,
+ skip_cache_capacity_check: Option<bool>,
+ minimum_cache_clear_count: Option<Option<usize>>,
+}
+
+impl Config {
+ /// Return a new default lazy DFA builder configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Set whether matching must be anchored at the beginning of the input.
+ ///
+ /// When enabled, a match must begin at the start of a search. When
+ /// disabled (the default), the lazy DFA will act as if the pattern started
+ /// with a `(?s:.)*?`, which enables a match to appear anywhere.
+ ///
+ /// Note that if you want to run both anchored and unanchored
+ /// searches without building multiple automatons, you can enable the
+ /// [`Config::starts_for_each_pattern`] configuration instead. This will
+ /// permit unanchored any-pattern searches and pattern-specific anchored
+ /// searches. See the documentation for that configuration for an example.
+ ///
+ /// By default this is disabled.
+ ///
+ /// **WARNING:** this is subtly different than using a `^` at the start of
+ /// your regex. A `^` forces a regex to match exclusively at the start of
+ /// input, regardless of where you begin your search. In contrast, enabling
+ /// this option will allow your regex to match anywhere in your input,
+ /// but the match must start at the beginning of a search. (Most of the
+ /// higher level convenience search routines make "start of input" and
+ /// "start of search" equivalent, but some routines allow treating these as
+ /// orthogonal.)
+ ///
+ /// For example, consider the haystack `aba` and the following searches:
+ ///
+ /// 1. The regex `^a` is compiled with `anchored=false` and searches
+ /// `aba` starting at position `2`. Since `^` requires the match to
+ /// start at the beginning of the input and `2 > 0`, no match is found.
+ /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
+ /// starting at position `2`. This reports a match at `[2, 3]` since
+ /// the match starts where the search started. Since there is no `^`,
+ /// there is no requirement for the match to start at the beginning of
+ /// the input.
+ /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
+ /// starting at position `1`. Since `b` corresponds to position `1` and
+ /// since the regex is anchored, it finds no match.
+ /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
+ /// starting at position `1`. Since the regex is neither anchored nor
+ /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
+ /// prefix that permits it to match anywhere. Thus, it reports a match
+ /// at `[2, 3]`.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates the differences between an anchored search and
+ /// a pattern that begins with `^` (as described in the above warning
+ /// message).
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let haystack = "aba".as_bytes();
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().anchored(false)) // default
+ /// .build(r"^a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 2, 3,
+ /// )?;
+ /// // No match is found because 2 is not the beginning of the haystack,
+ /// // which is what ^ requires.
+ /// let expected = None;
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().anchored(true))
+ /// .build(r"a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 2, 3,
+ /// )?;
+ /// // An anchored search can still match anywhere in the haystack; it just
+ /// // must begin at the start of the search, which is '2' in this case.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().anchored(true))
+ /// .build(r"a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 1, 3,
+ /// )?;
+ /// // No match is found since we start searching at offset 1 which
+ /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
+ /// // is found.
+ /// let expected = None;
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().anchored(false))
+ /// .build(r"a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 1, 3,
+ /// )?;
+ /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
+ /// // pattern. Even though the search starts at 'b', the 'match anything'
+ /// // prefix allows the search to match 'a'.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn anchored(mut self, yes: bool) -> Config {
+ self.anchored = Some(yes);
+ self
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+ /// match semantics of Perl-like regex engines. That is, when multiple
+ /// patterns would match at the same leftmost position, the pattern that
+ /// appears first in the concrete syntax is chosen.
+ ///
+ /// Currently, the only other kind of match semantics supported is
+ /// [`MatchKind::All`]. This corresponds to classical DFA construction
+ /// where all possible matches are added to the lazy DFA.
+ ///
+ /// Typically, `All` is used when one wants to execute an overlapping
+ /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+ /// sense to use `All` with the various "leftmost" find routines, since the
+ /// leftmost routines depend on the `LeftmostFirst` automata construction
+ /// strategy. Specifically, `LeftmostFirst` adds dead states to the
+ /// lazy DFA as a way to terminate the search and report a match.
+ /// `LeftmostFirst` also supports non-greedy matches using this strategy,
+ /// whereas `All` does not.
+ ///
+ /// # Example: overlapping search
+ ///
+ /// This example shows the typical use of `MatchKind::All`, which is to
+ /// report overlapping matches.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, OverlappingState},
+ /// HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(HalfMatch::must(1, 4));
+ /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+ /// // pattern begins its match before the first, making it the earlier
+ /// // match, and it is thus reported first.
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: reverse automaton to find start of match
+ ///
+ /// Another example for using `MatchKind::All` is for constructing a
+ /// reverse automaton to find the start of a match. `All` semantics are
+ /// used for this in order to find the longest possible match, which
+ /// corresponds to the leftmost starting position.
+ ///
+ /// Note that if you need the starting position then
+ /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
+ /// for you, so it's usually not necessary to do this yourself.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchKind};
+ ///
+ /// let haystack = "123foobar456".as_bytes();
+ /// let pattern = r"[a-z]+";
+ ///
+ /// let dfa_fwd = DFA::new(pattern)?;
+ /// let dfa_rev = DFA::builder()
+ /// .configure(DFA::config()
+ /// .anchored(true)
+ /// .match_kind(MatchKind::All)
+ /// )
+ /// .build(pattern)?;
+ /// let mut cache_fwd = dfa_fwd.create_cache();
+ /// let mut cache_rev = dfa_rev.create_cache();
+ ///
+ /// let expected_fwd = HalfMatch::must(0, 9);
+ /// let expected_rev = HalfMatch::must(0, 3);
+ /// let got_fwd = dfa_fwd.find_leftmost_fwd(
+ /// &mut cache_fwd, haystack,
+ /// )?.unwrap();
+ /// // Here we don't specify the pattern to search for since there's only
+ /// // one pattern and we're doing a leftmost search. But if this were an
+ /// // overlapping search, you'd need to specify the pattern that matched
+ /// // in the forward direction. (Otherwise, you might wind up finding the
+ /// // starting position of a match of some other pattern.) That in turn
+ /// // requires building the reverse automaton with starts_for_each_pattern
+ /// // enabled. Indeed, this is what Regex does internally.
+ /// let got_rev = dfa_rev.find_leftmost_rev_at(
+ /// &mut cache_rev, None, haystack, 0, got_fwd.offset(),
+ /// )?.unwrap();
+ /// assert_eq!(expected_fwd, got_fwd);
+ /// assert_eq!(expected_rev, got_rev);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn match_kind(mut self, kind: MatchKind) -> Config {
+ self.match_kind = Some(kind);
+ self
+ }
+
+ /// Whether to compile a separate start state for each pattern in the
+ /// lazy DFA.
+ ///
+ /// When enabled, a separate **anchored** start state is added for each
+ /// pattern in the lazy DFA. When this start state is used, then the DFA
+ /// will only search for matches for the pattern specified, even if there
+ /// are other patterns in the DFA.
+ ///
+ /// The main downside of this option is that it can potentially increase
+ /// the size of the DFA and/or increase the time it takes to build the
+ /// DFA at search time. However, since this is a configuration for a lazy
+ /// DFA, these states aren't actually built unless they're used. Even so,
+ /// enabling this isn't necessarily free, as it may result in higher cache
+ /// usage.
+ ///
+ /// There are a few reasons one might want to enable this (it's disabled
+ /// by default):
+ ///
+ /// 1. When looking for the start of an overlapping match (using a reverse
+ /// DFA), doing it correctly requires starting the reverse search using the
+ /// starting state of the pattern that matched in the forward direction.
+ /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
+ /// will automatically enable this option when building the reverse DFA
+ /// internally.
+ /// 2. When you want to use a DFA with multiple patterns to both search
+ /// for matches of any pattern or to search for anchored matches of one
+ /// particular pattern while using the same DFA. (Otherwise, you would need
+ /// to compile a new DFA for each pattern.)
+ /// 3. Since the start states added for each pattern are anchored, if you
+ /// compile an unanchored DFA with one pattern while also enabling this
+ /// option, then you can use the same DFA to perform anchored or unanchored
+ /// searches. The latter you get with the standard search APIs. The former
+ /// you get from the various `_at` search methods that allow you to specify a
+ /// pattern ID to search for.
+ ///
+ /// By default this is disabled.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this option to permit the same lazy DFA
+ /// to run both anchored and unanchored searches for a single pattern.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, PatternID};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build(r"foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = b"quux foo123";
+ ///
+ /// // Here's a normal unanchored search. Notice that we use 'None' for the
+ /// // pattern ID. Since the DFA was built as an unanchored machine, it
+ /// // uses its default unanchored starting state.
+ /// let expected = HalfMatch::must(0, 11);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 0, haystack.len(),
+ /// )?);
+ /// // But now if we explicitly specify the pattern to search ('0' being
+ /// // the only pattern in the DFA), then it will use the starting state
+ /// // for that specific pattern which is always anchored. Since the
+ /// // pattern doesn't have a match at the beginning of the haystack, we
+ /// // find nothing.
+ /// assert_eq!(None, dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
+ /// )?);
+ /// // And finally, an anchored search is not the same as putting a '^'
+ /// // at the beginning of the pattern. An anchored search can only match
+ /// // at the beginning of the *search*, which we can change:
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
+ /// )?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
+ self.starts_for_each_pattern = Some(yes);
+ self
+ }
+
+ /// Whether to attempt to shrink the size of the lazy DFA's alphabet or
+ /// not.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging the lazy DFA.
+ ///
+ /// When enabled, the lazy DFA will use a map from all possible bytes
+ /// to their corresponding equivalence class. Each equivalence class
+ /// represents a set of bytes that does not discriminate between a match
+ /// and a non-match in the DFA. For example, the pattern `[ab]+` has at
+ /// least two equivalence classes: a set containing `a` and `b` and a set
+ /// containing every byte except for `a` and `b`. `a` and `b` are in the
+ /// same equivalence class because they never discriminate between a
+ /// match and a non-match.
+ ///
+ /// The advantage of this map is that the size of the transition table
+ /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
+ /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
+ /// equivalence classes (rounded up to the nearest power of 2). As a
+ /// result, total space usage can decrease substantially. Moreover, since a
+ /// smaller alphabet is used, DFA compilation during search becomes faster
+ /// as well since it will potentially be able to reuse a single transition
+ /// for multiple bytes.
+ ///
+ /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling
+ /// this does not yield any speed advantages. Namely, even when this is
+ /// disabled, a byte class map is still used while searching. The only
+ /// difference is that every byte will be forced into its own distinct
+ /// equivalence class. This is useful for debugging the actual generated
+ /// transitions because it lets one see the transitions defined on actual
+ /// bytes instead of the equivalence classes.
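+ ///
+ /// # Example
+ ///
+ /// This is a minimal sketch showing that disabling byte classes changes
+ /// only the representation of the transition table, not search results:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().byte_classes(false))
+ /// .build(r"[ab]+")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, b"abba")?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```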
+ pub fn byte_classes(mut self, yes: bool) -> Config {
+ self.byte_classes = Some(yes);
+ self
+ }
+
+ /// Heuristically enable Unicode word boundaries.
+ ///
+ /// When set, this will attempt to implement Unicode word boundaries as if
+ /// they were ASCII word boundaries. This only works when the search input
+ /// is ASCII only. If a non-ASCII byte is observed while searching, then a
+ /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
+ ///
+ /// A possible alternative to enabling this option is to simply use an
+ /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
+ /// option is if you absolutely need Unicode support. This option lets one
+ /// use a fast search implementation (a DFA) for some potentially very
+ /// common cases, while providing the option to fall back to some other
+ /// regex engine to handle the general case when an error is returned.
+ ///
+ /// If the pattern provided has no Unicode word boundary in it, then this
+ /// option has no effect. (That is, quitting on a non-ASCII byte only
+ /// occurs when this option is enabled _and_ a Unicode word boundary is
+ /// present in the pattern.)
+ ///
+ /// This is almost equivalent to setting all non-ASCII bytes to be quit
+ /// bytes. The only difference is that this will cause non-ASCII bytes to
+ /// be quit bytes _only_ when a Unicode word boundary is present in the
+ /// pattern.
+ ///
+ /// When enabling this option, callers _must_ be prepared to handle
+ /// a [`MatchError`](crate::MatchError) error during search.
+ /// When using a [`Regex`](crate::hybrid::regex::Regex), this
+ /// corresponds to using the `try_` suite of methods. Alternatively,
+ /// if callers can guarantee that their input is ASCII only, then a
+ /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
+ /// returned while searching.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to heuristically enable Unicode word boundaries
+ /// in a pattern. It also shows what happens when a search comes across a
+ /// non-ASCII byte.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// HalfMatch, MatchError, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().unicode_word_boundary(true))
+ /// .build(r"\b[0-9]+\b")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// // The match occurs before the search ever observes the snowman
+ /// // character, so no error occurs.
+ /// let haystack = "foo 123 ☃".as_bytes();
+ /// let expected = Some(HalfMatch::must(0, 7));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // Notice that this search fails, even though the snowman character
+ /// // occurs after the ending match offset. This is because search
+ /// // routines read one byte past the end of the search to account for
+ /// // look-around, and indeed, this is required here to determine whether
+ /// // the trailing \b matches.
+ /// let haystack = "foo 123☃".as_bytes();
+ /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack);
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
+ // We have a separate option for this instead of just setting the
+ // appropriate quit bytes here because we don't want to set quit bytes
+ // for every regex. We only want to set them when the regex contains a
+ // Unicode word boundary.
+ self.unicode_word_boundary = Some(yes);
+ self
+ }
+
+ /// Add a "quit" byte to the lazy DFA.
+ ///
+ /// When a quit byte is seen during search time, then search will return
+ /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
+ /// offset at which the search stopped.
+ ///
+ /// A quit byte will always overrule any other aspects of a regex. For
+ /// example, if the `x` byte is added as a quit byte and the regex `\w` is
+ /// used, then observing `x` will cause the search to quit immediately
+ /// despite the fact that `x` is in the `\w` class.
+ ///
+ /// This mechanism is primarily useful for heuristically enabling certain
+ /// features like Unicode word boundaries in a DFA. Namely, if the input
+ /// to search is ASCII, then a Unicode word boundary can be implemented
+ /// via an ASCII word boundary with no change in semantics. Thus, a DFA
+ /// can attempt to match a Unicode word boundary but give up as soon as it
+ /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
+ /// to be quit bytes, then Unicode word boundaries will be permitted when
+ /// building lazy DFAs. Of course, callers should enable
+ /// [`Config::unicode_word_boundary`] if they want this behavior instead.
+ /// (The advantage being that non-ASCII quit bytes will only be added if a
+ /// Unicode word boundary is in the pattern.)
+ ///
+ /// When enabling this option, callers _must_ be prepared to handle a
+ /// [`MatchError`](crate::MatchError) error during search. When using a
+ /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
+ /// `try_` suite of methods.
+ ///
+ /// By default, there are no quit bytes set.
+ ///
+ /// # Panics
+ ///
+ /// This panics if heuristic Unicode word boundaries are enabled and any
+ /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
+ /// Unicode word boundaries requires setting every non-ASCII byte to a quit
+ /// byte. So if the caller attempts to undo any of that, then this will
+ /// panic.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to cause a search to terminate if it sees a
+ /// `\n` byte. This could be useful if, for example, you wanted to prevent
+ /// a user supplied pattern from matching across a line boundary.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "foo\nbar".as_bytes();
+ /// // Normally this would produce a match, since \p{any} contains '\n'.
+ /// // But since we instructed the automaton to enter a quit state if a
+ /// // '\n' is observed, this produces a match error instead.
+ /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack).unwrap_err();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn quit(mut self, byte: u8, yes: bool) -> Config {
+ if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
+ panic!(
+ "cannot set non-ASCII byte to be non-quit when \
+ Unicode word boundaries are enabled"
+ );
+ }
+ if self.quitset.is_none() {
+ self.quitset = Some(ByteSet::empty());
+ }
+ if yes {
+ self.quitset.as_mut().unwrap().add(byte);
+ } else {
+ self.quitset.as_mut().unwrap().remove(byte);
+ }
+ self
+ }
+
+ /// Sets the maximum amount of heap memory, in bytes, to allocate to the
+ /// cache for use during a lazy DFA search. If the lazy DFA would otherwise
+ /// use more heap memory, then, depending on other configuration knobs, the
+ /// search will either stop and return an error or clear the cache and
+ /// continue the search.
+ ///
+ /// The default cache capacity is some "reasonable" number that will
+ /// accommodate most regular expressions. If you need to build a large
+ /// DFA, however, you may find it necessary to increase the cache
+ /// capacity.
+ ///
+ /// Note that while building a lazy DFA will do a "minimum" check to ensure
+ /// the capacity is big enough, this is more or less about correctness.
+ /// If the cache is bigger than the minimum but still too small, then the
+ /// lazy DFA could wind up spending a lot of time clearing the cache and
+ /// recomputing transitions, thus negating the performance benefits of a
+ /// lazy DFA. Thus, setting the cache capacity is mostly an experimental
+ /// endeavor. For most common patterns, however, the default should be
+ /// sufficient.
+ ///
+ /// For more details on how the lazy DFA's cache is used, see the
+ /// documentation for [`Cache`].
+ ///
+ /// # Example
+ ///
+ /// This example shows what happens if the configured cache capacity is
+ /// too small. In such cases, one can override the cache capacity to make
+ /// it bigger. Alternatively, one might want to use less memory by setting
+ /// a smaller cache capacity.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ ///
+ /// let pattern = r"\p{L}{1000}";
+ ///
+ /// // The default cache capacity is likely too small to deal with regexes
+ /// // that are very large. Large repetitions of large Unicode character
+ /// // classes are a common way to make very large regexes.
+ /// let _ = DFA::new(pattern).unwrap_err();
+ /// // Bump up the capacity to something bigger.
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
+ /// .build(pattern)?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
+ /// let expected = Some(HalfMatch::must(0, 2000));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn cache_capacity(mut self, bytes: usize) -> Config {
+ self.cache_capacity = Some(bytes);
+ self
+ }
+
+ /// Configures construction of a lazy DFA to use the minimum cache capacity
+ /// if the configured capacity is otherwise too small for the provided NFA.
+ ///
+ /// This is useful if you never want lazy DFA construction to fail because
+ /// of a capacity that is too small.
+ ///
+ /// In general, this option is typically not a good idea. In particular,
+ /// while a minimum cache capacity does permit the lazy DFA to function
+ /// where it otherwise couldn't, it's plausible that it may not function
+ /// well if it's constantly running out of room. In that case, the speed
+ /// advantages of the lazy DFA may be negated.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows what happens if the configured cache capacity is
+ /// too small. In such cases, one could override the capacity explicitly.
+ /// An alternative, demonstrated here, lets us force construction to use
+ /// the minimum cache capacity if the configured capacity is otherwise
+ /// too small.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ ///
+ /// let pattern = r"\p{L}{1000}";
+ ///
+ /// // The default cache capacity is likely too small to deal with regexes
+ /// // that are very large. Large repetitions of large Unicode character
+ /// // classes are a common way to make very large regexes.
+ /// let _ = DFA::new(pattern).unwrap_err();
+ /// // Configure construction such that it automatically selects the minimum
+ /// // cache capacity if it would otherwise be too small.
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().skip_cache_capacity_check(true))
+ /// .build(pattern)?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
+ /// let expected = Some(HalfMatch::must(0, 2000));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
+ self.skip_cache_capacity_check = Some(yes);
+ self
+ }
+
+ /// Configure a lazy DFA search to quit after a certain number of cache
+ /// clearings.
+ ///
+ /// When a minimum is set, then a lazy DFA search will "give up" after
+ /// the minimum number of cache clearings has occurred. This is typically
+ /// useful in scenarios where callers want to detect whether the lazy DFA
+ /// search is "efficient" or not. If the cache is cleared too many times,
+ /// this is a good indicator that it is not efficient, and thus, the caller
+ /// may wish to use some other regex engine.
+ ///
+ /// Note that the number of times a cache is cleared is a property of
+ /// the cache itself. Thus, if a cache is reused in a subsequent search
+ /// with a similarly configured lazy DFA, the clear count carries over,
+ /// and the search will "give up" if the cache needs to be cleared. The cache
+ /// clear count can only be reset to `0` via [`DFA::reset_cache`] (or
+ /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
+ /// you're using the `Regex` API).
+ ///
+ /// By default, no minimum is configured. Thus, a lazy DFA search will
+ /// never give up due to cache clearings.
+ ///
+ /// # Example
+ ///
+ /// This example uses a somewhat pathological configuration to demonstrate
+ /// the _possible_ behavior of cache clearing and how it might result
+ /// in a search that returns an error.
+ ///
+ /// It is important to note that the precise mechanics of how and when
+ /// a cache gets cleared is an implementation detail. Thus, the asserts
+ /// in the tests below with respect to the particular offsets at which a
+ /// search gave up should be viewed strictly as a demonstration. They are
+ /// not part of any API guarantees offered by this crate.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, MatchError};
+ ///
+ /// // This is a carefully chosen regex. The idea is to pick one
+ /// // that requires some decent number of states (hence the bounded
+ /// // repetition). But we specifically choose to create a class with an
+ /// // ASCII letter and a non-ASCII letter so that we can check that no new
+ /// // states are created once the cache is full. Namely, if we fill up the
+ /// // cache on a haystack of 'a's, then in order to match one 'β', a new
+ /// // state will need to be created since a 'β' is encoded with multiple
+ /// // bytes. Since there's no room for this state, the search should quit
+ /// // at the very first position.
+ /// let pattern = r"[aβ]{100}";
+ /// let dfa = DFA::builder()
+ /// .configure(
+ /// // Configure it so that we have the minimum cache capacity
+ /// // possible. And that if any clearings occur, the search quits.
+ /// DFA::config()
+ /// .skip_cache_capacity_check(true)
+ /// .cache_capacity(0)
+ /// .minimum_cache_clear_count(Some(0)),
+ /// )
+ /// .build(pattern)?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "a".repeat(101).into_bytes();
+ /// assert_eq!(
+ /// dfa.find_leftmost_fwd(&mut cache, &haystack),
+ /// Err(MatchError::GaveUp { offset: 25 }),
+ /// );
+ ///
+ /// // Now that we know the cache is full, if we search a haystack that we
+ /// // know will require creating at least one new state, it should not
+ /// // be able to make any progress.
+ /// let haystack = "β".repeat(101).into_bytes();
+ /// assert_eq!(
+ /// dfa.find_leftmost_fwd(&mut cache, &haystack),
+ /// Err(MatchError::GaveUp { offset: 0 }),
+ /// );
+ ///
+ /// // If we reset the cache, then we should be able to create more states
+ /// // and make more progress with searching for betas.
+ /// cache.reset(&dfa);
+ /// let haystack = "β".repeat(101).into_bytes();
+ /// assert_eq!(
+ /// dfa.find_earliest_fwd(&mut cache, &haystack),
+ /// Err(MatchError::GaveUp { offset: 26 }),
+ /// );
+ ///
+ /// // ... switching back to ASCII still makes progress since it just needs
+ /// // to set transitions on existing states!
+ /// let haystack = "a".repeat(101).into_bytes();
+ /// assert_eq!(
+ /// dfa.find_earliest_fwd(&mut cache, &haystack),
+ /// Err(MatchError::GaveUp { offset: 13 }),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
+ self.minimum_cache_clear_count = Some(min);
+ self
+ }
+
+ /// Returns whether this configuration has enabled anchored searches.
+ pub fn get_anchored(&self) -> bool {
+ self.anchored.unwrap_or(false)
+ }
+
+ /// Returns the match semantics set in this configuration.
+ pub fn get_match_kind(&self) -> MatchKind {
+ self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+ }
+
+ /// Returns whether this configuration has enabled anchored starting states
+ /// for every pattern in the DFA.
+ pub fn get_starts_for_each_pattern(&self) -> bool {
+ self.starts_for_each_pattern.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration has enabled byte classes or not.
+ /// This is typically a debugging-oriented option, as disabling it confers
+ /// no speed benefit.
+ pub fn get_byte_classes(&self) -> bool {
+ self.byte_classes.unwrap_or(true)
+ }
+
+ /// Returns whether this configuration has enabled heuristic Unicode word
+ /// boundary support. When enabled, it is possible for a search to return
+ /// an error.
+ pub fn get_unicode_word_boundary(&self) -> bool {
+ self.unicode_word_boundary.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration will instruct the DFA to enter a
+ /// quit state whenever the given byte is seen during a search. When at
+ /// least one byte has this enabled, it is possible for a search to return
+ /// an error.
+ pub fn get_quit(&self, byte: u8) -> bool {
+ self.quitset.map_or(false, |q| q.contains(byte))
+ }
+
+ /// Returns the cache capacity set on this configuration.
+ pub fn get_cache_capacity(&self) -> usize {
+ self.cache_capacity.unwrap_or(2 * (1 << 20))
+ }
+
+ /// Returns whether the cache capacity check should be skipped.
+ pub fn get_skip_cache_capacity_check(&self) -> bool {
+ self.skip_cache_capacity_check.unwrap_or(false)
+ }
+
+ /// Returns, if set, the minimum number of times the cache must be cleared
+ /// before a lazy DFA search can give up. When no minimum is set, then a
+ /// search will never quit and will always clear the cache whenever it
+ /// fills up.
+ pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
+ self.minimum_cache_clear_count.unwrap_or(None)
+ }
+
+ /// Returns the minimum lazy DFA cache capacity required for the given NFA.
+ ///
+ /// The cache capacity required for a particular NFA may change without
+ /// notice. Callers should not rely on it being stable.
+ ///
+ /// This is useful for informational purposes, but can also be useful for
+ /// other reasons. For example, one might want to check the minimum cache
+ /// capacity oneself, or set the capacity based on the minimum.
+ ///
+ /// This may return an error if this configuration does not support all of
+ /// the instructions used in the given NFA. For example, if the NFA has a
+ /// Unicode word boundary but this configuration does not enable heuristic
+ /// support for Unicode word boundaries.
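+ ///
+ /// # Example
+ ///
+ /// This is a minimal sketch of sizing a cache from the minimum. (The
+ /// pattern is arbitrary, and the exact minimum returned is an
+ /// implementation detail.)
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+ ///
+ /// let nfa = Arc::new(thompson::NFA::new(r"[0-9]+")?);
+ /// let min = DFA::config().get_minimum_cache_capacity(&nfa)?;
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().cache_capacity(min))
+ /// .build_from_nfa(nfa)?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, b"foo123")?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```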
+ pub fn get_minimum_cache_capacity(
+ &self,
+ nfa: &thompson::NFA,
+ ) -> Result<usize, BuildError> {
+ let quitset = self.quit_set_from_nfa(nfa)?;
+ let classes = self.byte_classes_from_nfa(nfa, &quitset);
+ let starts = self.get_starts_for_each_pattern();
+ Ok(minimum_cache_capacity(nfa, &classes, starts))
+ }
+
+ /// Returns the byte class map used during search from the given NFA.
+ ///
+ /// If byte classes are disabled on this configuration, then a map is
+ /// returned that puts each byte in its own equivalence class.
+ fn byte_classes_from_nfa(
+ &self,
+ nfa: &thompson::NFA,
+ quit: &ByteSet,
+ ) -> ByteClasses {
+ if !self.get_byte_classes() {
+ // The lazy DFA will always use the equivalence class map, but
+ // enabling this option is useful for debugging. Namely, this will
+ // cause all transitions to be defined over their actual bytes
+ // instead of an opaque equivalence class identifier. The former is
+ // much easier to grok as a human.
+ ByteClasses::singletons()
+ } else {
+ let mut set = nfa.byte_class_set().clone();
+ // It is important to distinguish any "quit" bytes from all other
+ // bytes. Otherwise, a non-quit byte may end up in the same class
+ // as a quit byte, and thus cause the DFA to stop when it shouldn't.
+ if !quit.is_empty() {
+ set.add_set(&quit);
+ }
+ set.byte_classes()
+ }
+ }
+
+ /// Return the quit set for this configuration and the given NFA.
+ ///
+ /// This may return an error if the NFA is incompatible with this
+ /// configuration's quit set. For example, if the NFA has a Unicode word
+ /// boundary and the quit set doesn't include non-ASCII bytes.
+ fn quit_set_from_nfa(
+ &self,
+ nfa: &thompson::NFA,
+ ) -> Result<ByteSet, BuildError> {
+ let mut quit = self.quitset.unwrap_or(ByteSet::empty());
+ if nfa.has_word_boundary_unicode() {
+ if self.get_unicode_word_boundary() {
+ for b in 0x80..=0xFF {
+ quit.add(b);
+ }
+ } else {
+ // If heuristic support for Unicode word boundaries wasn't
+ // enabled, then we can still check if our quit set is correct.
+ // If the caller set their quit bytes in a way that causes the
+ // DFA to quit on at least all non-ASCII bytes, then that's all
+ // we need for heuristic support to work.
+ if !quit.contains_range(0x80, 0xFF) {
+ return Err(
+ BuildError::unsupported_dfa_word_boundary_unicode(),
+ );
+ }
+ }
+ }
+ Ok(quit)
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ fn overwrite(self, o: Config) -> Config {
+ Config {
+ anchored: o.anchored.or(self.anchored),
+ match_kind: o.match_kind.or(self.match_kind),
+ starts_for_each_pattern: o
+ .starts_for_each_pattern
+ .or(self.starts_for_each_pattern),
+ byte_classes: o.byte_classes.or(self.byte_classes),
+ unicode_word_boundary: o
+ .unicode_word_boundary
+ .or(self.unicode_word_boundary),
+ quitset: o.quitset.or(self.quitset),
+ cache_capacity: o.cache_capacity.or(self.cache_capacity),
+ skip_cache_capacity_check: o
+ .skip_cache_capacity_check
+ .or(self.skip_cache_capacity_check),
+ minimum_cache_clear_count: o
+ .minimum_cache_clear_count
+ .or(self.minimum_cache_clear_count),
+ }
+ }
+}
+
+/// A builder for constructing a lazy deterministic finite automaton from
+/// regular expressions.
+///
+/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The
+/// advantage of the former is that it often lets you avoid importing the
+/// `Builder` type directly.
+///
+/// This builder provides two main things:
+///
+/// 1. It provides a few different `build` routines for actually constructing
+/// a DFA from different kinds of inputs. The most convenient is
+/// [`Builder::build`], which builds a DFA directly from a pattern string. The
+/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
+/// from an NFA.
+/// 2. The builder permits configuring a number of things.
+/// [`Builder::configure`] is used with [`Config`] to configure aspects of
+/// the DFA and the construction process itself. [`Builder::syntax`] and
+/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
+/// construction, respectively. The syntax and thompson configurations only
+/// apply when building from a pattern string.
+///
+/// This builder always constructs a *single* lazy DFA. As such, this builder
+/// can only be used to construct regexes that either detect the presence
+/// of a match or find the end location of a match. A single DFA cannot
+/// produce both the start and end of a match. For that information, use a
+/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured
+/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason
+/// to use a DFA directly is if the end location of a match is enough for your
+/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one,
+/// since a second reverse DFA is needed to find the start of a match.
+///
+/// # Example
+///
+/// This example shows how to build a lazy DFA that uses a tiny cache capacity
+/// and completely disables Unicode. That is:
+///
+/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
+/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
+/// (instead of any UTF-8 encoding of a Unicode scalar value except for
+/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
+/// * The pattern itself is permitted to match invalid UTF-8. For example,
+/// things like `[^a]` that match any byte except for `a` are permitted.
+/// * Unanchored patterns can search through invalid UTF-8. That is, for
+/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
+/// `(?s:.)*?`.
+///
+/// ```
+/// use regex_automata::{
+/// hybrid::dfa::DFA,
+/// nfa::thompson,
+/// HalfMatch, SyntaxConfig,
+/// };
+///
+/// let dfa = DFA::builder()
+/// .configure(DFA::config().cache_capacity(5_000))
+/// .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo[^b]ar.*")?;
+/// let mut cache = dfa.create_cache();
+///
+/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
+/// let expected = Some(HalfMatch::must(0, 10));
+/// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ thompson: thompson::Builder,
+}
+
+impl Builder {
+ /// Create a new lazy DFA builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ config: Config::default(),
+ thompson: thompson::Builder::new(),
+ }
+ }
+
+ /// Build a lazy DFA from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a lazy DFA from the given patterns.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<DFA, BuildError> {
+ let nfa =
+ self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
+ self.build_from_nfa(Arc::new(nfa))
+ }
+
+ /// Build a DFA from the given NFA.
+ ///
+ /// Note that this requires an `Arc<thompson::NFA>` instead of a
+ /// `&thompson::NFA` because the lazy DFA builds itself from the NFA at
+ /// search time. This means that the lazy DFA must hold on to its source
+ /// NFA for the entirety of its lifetime. An `Arc` is used so that callers
+ /// aren't forced to clone the NFA if it is needed elsewhere.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a lazy DFA if you already have an NFA
+ /// in hand.
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+ ///
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // This shows how to set non-default options for building an NFA.
+ /// let nfa = thompson::Builder::new()
+ /// .configure(thompson::Config::new().shrink(false))
+ /// .build(r"[0-9]+")?;
+ /// let dfa = DFA::builder().build_from_nfa(Arc::new(nfa))?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_nfa(
+ &self,
+ nfa: Arc<thompson::NFA>,
+ ) -> Result<DFA, BuildError> {
+ let quitset = self.config.quit_set_from_nfa(&nfa)?;
+ let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
+ // Check that we can fit at least a few states into our cache,
+ // otherwise it's pretty senseless to use the lazy DFA. This does have
+ // a possible failure mode though. This assumes the maximum size of a
+ // state in powerset space (so, the total number of NFA states), which
+ // may never actually materialize, and could be quite a bit larger
+ // than the actual biggest state. If this turns out to be a problem,
+ // we could expose a knob that disables this check. But if so, we have
+ // to be careful not to panic in other areas of the code (the cache
+ // clearing and init code) that tend to assume some minimum useful
+ // cache capacity.
+ let min_cache = minimum_cache_capacity(
+ &nfa,
+ &classes,
+ self.config.get_starts_for_each_pattern(),
+ );
+ let mut cache_capacity = self.config.get_cache_capacity();
+ if cache_capacity < min_cache {
+ // When the caller has asked us to skip the cache capacity check,
+ // then we simply force the cache capacity to its minimum amount
+ // and mush on.
+ if self.config.get_skip_cache_capacity_check() {
+ trace!(
+ "given capacity ({}) is too small, \
+ since skip_cache_capacity_check is enabled, \
+ setting cache capacity to minimum ({})",
+ cache_capacity,
+ min_cache,
+ );
+ cache_capacity = min_cache;
+ } else {
+ return Err(BuildError::insufficient_cache_capacity(
+ min_cache,
+ cache_capacity,
+ ));
+ }
+ }
+ // We also need to check that we can fit at least some small number
+ // of states in our state ID space. This is unlikely to trigger in
+ // >=32-bit systems, but 16-bit systems have a pretty small state ID
+ // space since a number of bits are used up as sentinels.
+ if let Err(err) = minimum_lazy_state_id(&nfa, &classes) {
+ return Err(BuildError::insufficient_state_id_capacity(err));
+ }
+ let stride2 = classes.stride2();
+ Ok(DFA {
+ nfa,
+ stride2,
+ classes,
+ quitset,
+ anchored: self.config.get_anchored(),
+ match_kind: self.config.get_match_kind(),
+ starts_for_each_pattern: self.config.get_starts_for_each_pattern(),
+ cache_capacity,
+ minimum_cache_clear_count: self
+ .config
+ .get_minimum_cache_clear_count(),
+ })
+ }
+
+ /// Apply the given lazy DFA configuration options to this builder.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`SyntaxConfig`](crate::SyntaxConfig).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// These settings only apply when constructing a lazy DFA directly from a
+ /// pattern.
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::SyntaxConfig,
+ ) -> &mut Builder {
+ self.thompson.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like whether the DFA should match the regex
+ /// in reverse or if additional time should be spent shrinking the size of
+ /// the NFA.
+ ///
+ /// These settings only apply when constructing a DFA directly from a
+ /// pattern.
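+    ///
+    /// # Example
+    ///
+    /// A brief sketch (mirroring the builder example above; the pattern is
+    /// illustrative) of requesting a reverse NFA, which yields a lazy DFA
+    /// that matches the regex in reverse:
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson};
+    ///
+    /// let dfa = DFA::builder()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build(r"[0-9]+")?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```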
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.thompson.configure(config);
+ self
+ }
+}
+
+/// Based on the minimum number of states required for a useful lazy DFA cache,
+/// this returns the minimum lazy state ID that must be representable.
+///
+/// It's unlikely for this to impose constraints on 32-bit systems (or
+/// higher), but on 16-bit systems, the lazy state ID space is quite
+/// constrained and thus may be insufficient for bigger regexes.
+fn minimum_lazy_state_id(
+ nfa: &thompson::NFA,
+ classes: &ByteClasses,
+) -> Result<LazyStateID, LazyStateIDError> {
+ let stride = 1 << classes.stride2();
+ let min_state_index = MIN_STATES.checked_sub(1).unwrap();
+ LazyStateID::new(min_state_index * stride)
+}
+
+/// Based on the minimum number of states required for a useful lazy DFA cache,
+/// this returns a heuristic minimum number of bytes of heap space required.
+///
+/// This is a "heuristic" because the minimum it returns is likely bigger than
+/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses
+/// the maximum number of NFA states (all of them). This is likely bigger
+/// than what is required in practice. Computing the true minimum effectively
+/// requires determinization, which is probably too much work to do for a
+/// simple check like this.
+fn minimum_cache_capacity(
+ nfa: &thompson::NFA,
+ classes: &ByteClasses,
+ starts_for_each_pattern: bool,
+) -> usize {
+ const ID_SIZE: usize = size_of::<LazyStateID>();
+ let stride = 1 << classes.stride2();
+
+ let sparses = 2 * nfa.len() * NFAStateID::SIZE;
+ let trans = MIN_STATES * stride * ID_SIZE;
+
+ let mut starts = Start::count() * ID_SIZE;
+ if starts_for_each_pattern {
+ starts += (Start::count() * nfa.pattern_len()) * ID_SIZE;
+ }
+
+    // Every `State` has 3 bytes for flags, 4 bytes (max) for the number
+ // of patterns, followed by 32-bit encodings of patterns and then delta
+ // varint encodings of NFA state IDs. We use the worst case (which isn't
+ // technically possible) of 5 bytes for each NFA state ID.
+ //
+ // HOWEVER, three of the states needed by a lazy DFA are just the sentinel
+ // unknown, dead and quit states. Those states have a known size and it is
+ // small.
+ assert!(MIN_STATES >= 3, "minimum number of states has to be at least 3");
+ let dead_state_size = State::dead().memory_usage();
+ let max_state_size = 3 + 4 + (nfa.pattern_len() * 4) + (nfa.len() * 5);
+ let states = (3 * (size_of::<State>() + dead_state_size))
+ + ((MIN_STATES - 3) * (size_of::<State>() + max_state_size));
+ let states_to_sid = states + (MIN_STATES * ID_SIZE);
+ let stack = nfa.len() * NFAStateID::SIZE;
+ let scratch_state_builder = max_state_size;
+
+ trans
+ + starts
+ + states
+ + states_to_sid
+ + sparses
+ + stack
+ + scratch_state_builder
+}
diff --git a/vendor/regex-automata/src/hybrid/error.rs b/vendor/regex-automata/src/hybrid/error.rs
new file mode 100644
index 000000000..715da39bd
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/error.rs
@@ -0,0 +1,130 @@
+use crate::{hybrid::id::LazyStateIDError, nfa};
+
+/// An error that occurs when initial construction of a lazy DFA fails.
+///
+/// A build error can occur when insufficient cache capacity is configured or
+/// if something about the NFA is unsupported. (For example, if one attempts
+/// to build a lazy DFA without heuristic Unicode support but with an NFA that
+/// contains a Unicode word boundary.)
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
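+///
+/// # Example
+///
+/// A small sketch (the pattern is illustrative) of the Unicode word
+/// boundary failure mode mentioned above:
+///
+/// ```
+/// use regex_automata::hybrid::dfa::DFA;
+///
+/// // Unicode word boundaries are not supported by lazy DFAs by default...
+/// assert!(DFA::new(r"\bfoo\b").is_err());
+/// // ...but ASCII word boundaries are fine.
+/// assert!(DFA::new(r"(?-u:\b)foo(?-u:\b)").is_ok());
+/// ```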
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: BuildErrorKind,
+}
+
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+ NFA(nfa::thompson::Error),
+ InsufficientCacheCapacity { minimum: usize, given: usize },
+ InsufficientStateIDCapacity { err: LazyStateIDError },
+ Unsupported(&'static str),
+}
+
+impl BuildError {
+ fn kind(&self) -> &BuildErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn nfa(err: nfa::thompson::Error) -> BuildError {
+ BuildError { kind: BuildErrorKind::NFA(err) }
+ }
+
+ pub(crate) fn insufficient_cache_capacity(
+ minimum: usize,
+ given: usize,
+ ) -> BuildError {
+ BuildError {
+ kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given },
+ }
+ }
+
+ pub(crate) fn insufficient_state_id_capacity(
+ err: LazyStateIDError,
+ ) -> BuildError {
+ BuildError {
+ kind: BuildErrorKind::InsufficientStateIDCapacity { err },
+ }
+ }
+
+ pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
+ let msg = "cannot build lazy DFAs for regexes with Unicode word \
+ boundaries; switch to ASCII word boundaries, or \
+ heuristically enable Unicode word boundaries or use a \
+ different regex engine";
+ BuildError { kind: BuildErrorKind::Unsupported(msg) }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match self.kind() {
+ BuildErrorKind::NFA(ref err) => Some(err),
+ BuildErrorKind::InsufficientCacheCapacity { .. } => None,
+ // LazyStateIDError is an implementation detail, don't expose it.
+ BuildErrorKind::InsufficientStateIDCapacity { .. } => None,
+ BuildErrorKind::Unsupported(_) => None,
+ }
+ }
+}
+
+impl core::fmt::Display for BuildError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind() {
+ BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
+ BuildErrorKind::InsufficientCacheCapacity { minimum, given } => {
+ write!(
+ f,
+ "given cache capacity ({}) is smaller than \
+ minimum required ({})",
+ given, minimum,
+ )
+ }
+ BuildErrorKind::InsufficientStateIDCapacity { ref err } => {
+ err.fmt(f)
+ }
+ BuildErrorKind::Unsupported(ref msg) => {
+ write!(f, "unsupported regex feature for DFAs: {}", msg)
+ }
+ }
+ }
+}
+
+/// An error that occurs when cache usage has become inefficient.
+///
+/// One of the weaknesses of a lazy DFA is that it may need to clear its
+/// cache repeatedly if it's not big enough. If this happens too much, then it
+/// can slow searching down significantly. A mitigation to this is to use
+/// heuristics to detect whether the cache is being used efficiently or not.
+/// If not, then a lazy DFA can return a `CacheError`.
+///
+/// The default configuration of a lazy DFA in this crate is
+/// set such that a `CacheError` will never occur. Instead,
+/// callers must opt into this behavior with settings like
+/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count).
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
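+///
+/// # Example
+///
+/// A hedged sketch of opting into this behavior (the pattern and the count
+/// below are illustrative only):
+///
+/// ```
+/// use regex_automata::hybrid::dfa::DFA;
+///
+/// // Permit at most 3 cache clearings before searches give up with a
+/// // `CacheError` (surfaced as `MatchError::GaveUp` by search routines).
+/// let dfa = DFA::builder()
+///     .configure(DFA::config().minimum_cache_clear_count(Some(3)))
+///     .build(r"[a-z]+")?;
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```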
+#[derive(Clone, Debug)]
+pub struct CacheError(());
+
+impl CacheError {
+ pub(crate) fn too_many_cache_clears() -> CacheError {
+ CacheError(())
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for CacheError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ None
+ }
+}
+
+impl core::fmt::Display for CacheError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(f, "lazy DFA cache has been cleared too many times")
+ }
+}
diff --git a/vendor/regex-automata/src/hybrid/id.rs b/vendor/regex-automata/src/hybrid/id.rs
new file mode 100644
index 000000000..a6fcde52e
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/id.rs
@@ -0,0 +1,415 @@
+/// A state identifier especially tailored for lazy DFAs.
+///
+/// A lazy state ID logically represents a pointer to a DFA state. In practice,
+/// by limiting the number of DFA states it can address, it reserves some
+/// bits of its representation to encode some additional information. That
+/// additional information is called a "tag." That tag is used to record
+/// whether the state it points to is an unknown, dead, quit, start or match
+/// state.
+///
+/// When implementing a low level search routine with a lazy DFA, it is
+/// necessary to query the type of the current state to know what to do:
+///
+/// * **Unknown** - The state has not yet been computed. The
+/// parameters used to get this state ID must be re-passed to
+/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never return an
+/// unknown state ID.
+/// * **Dead** - A dead state only has transitions to itself. It indicates that
+/// the search cannot do anything else and should stop with whatever result it
+/// has.
+/// * **Quit** - A quit state indicates that the automaton could not answer
+/// whether a match exists or not. Correct search implementations must return a
+/// [`MatchError::Quit`](crate::MatchError::Quit).
+/// * **Start** - A start state indicates that the automaton will begin
+/// searching at a starting state. Branching on this isn't required for
+/// correctness, but a common optimization is to use this to more quickly look
+/// for a prefix.
+/// * **Match** - A match state indicates that a match has been found.
+/// Depending on the semantics of your search implementation, it may either
+/// continue until the end of the haystack or a dead state, or it might quit
+/// and return the match immediately.
+///
+/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
+/// can be used to determine if a tag exists at all. This is useful to avoid
+/// branching on all of the above types for every byte searched.
+///
+/// # Example
+///
+/// This example shows how `LazyStateID` can be used to implement a correct
+/// search routine with minimal branching. In particular, this search routine
+/// implements "leftmost" matching, which means that it doesn't immediately
+/// stop once a match is found. Instead, it continues until it reaches a dead
+/// state.
+///
+/// Notice also how a correct search implementation deals with
+/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
+/// the lazy DFA routines. When a `CacheError` occurs, it returns
+/// [`MatchError::GaveUp`](crate::MatchError::GaveUp).
+///
+/// ```
+/// use regex_automata::{
+/// hybrid::dfa::{Cache, DFA},
+/// HalfMatch, MatchError, PatternID,
+/// };
+///
+/// fn find_leftmost_first(
+/// dfa: &DFA,
+/// cache: &mut Cache,
+/// haystack: &[u8],
+/// ) -> Result<Option<HalfMatch>, MatchError> {
+/// // The start state is determined by inspecting the position and the
+/// // initial bytes of the haystack. Note that start states can never
+/// // be match states (since DFAs in this crate delay matches by 1
+/// // byte), so we don't need to check if the start state is a match.
+/// let mut sid = dfa.start_state_forward(
+/// cache, None, haystack, 0, haystack.len(),
+/// ).map_err(|_| MatchError::GaveUp { offset: 0 })?;
+/// let mut last_match = None;
+/// // Walk all the bytes in the haystack. We can quit early if we see
+/// // a dead or a quit state. The former means the automaton will
+/// // never transition to any other state. The latter means that the
+/// // automaton entered a condition in which its search failed.
+/// for (i, &b) in haystack.iter().enumerate() {
+/// sid = dfa
+/// .next_state(cache, sid, b)
+/// .map_err(|_| MatchError::GaveUp { offset: i })?;
+/// if sid.is_tagged() {
+/// if sid.is_match() {
+/// last_match = Some(HalfMatch::new(
+/// dfa.match_pattern(cache, sid, 0),
+/// i,
+/// ));
+/// } else if sid.is_dead() {
+/// return Ok(last_match);
+/// } else if sid.is_quit() {
+/// // It is possible to enter into a quit state after
+/// // observing a match has occurred. In that case, we
+/// // should return the match instead of an error.
+/// if last_match.is_some() {
+/// return Ok(last_match);
+/// }
+/// return Err(MatchError::Quit { byte: b, offset: i });
+/// }
+/// // Implementors may also want to check for start states and
+/// // handle them differently for performance reasons. But it is
+/// // not necessary for correctness.
+/// }
+/// }
+/// // Matches are always delayed by 1 byte, so we must explicitly walk
+/// // the special "EOI" transition at the end of the search.
+/// sid = dfa
+/// .next_eoi_state(cache, sid)
+/// .map_err(|_| MatchError::GaveUp { offset: haystack.len() })?;
+/// if sid.is_match() {
+/// last_match = Some(HalfMatch::new(
+/// dfa.match_pattern(cache, sid, 0),
+/// haystack.len(),
+/// ));
+/// }
+/// Ok(last_match)
+/// }
+///
+/// // We use a greedy '+' operator to show how the search doesn't just stop
+/// // once a match is detected. It continues extending the match. Using
+/// // '[a-z]+?' would also work as expected and stop the search early.
+/// // Greediness is built into the automaton.
+/// let dfa = DFA::new(r"[a-z]+")?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 10);
+///
+/// // Here's another example that tests our handling of the special
+/// // EOI transition. This will fail to find a match if we don't call
+/// // 'next_eoi_state' at the end of the search since the match isn't found
+/// // until the final byte in the haystack.
+/// let dfa = DFA::new(r"[0-9]{4}")?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 15);
+///
+/// // And note that our search implementation above automatically works
+/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
+/// // the appropriate pattern ID for us.
+/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 1);
+/// assert_eq!(mat.offset(), 3);
+/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 7);
+/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 1);
+/// assert_eq!(mat.offset(), 5);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(
+ Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct LazyStateID(u32);
+
+impl LazyStateID {
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ const MAX_BIT: usize = 31;
+
+ #[cfg(target_pointer_width = "16")]
+ const MAX_BIT: usize = 15;
+
+ const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
+ const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
+ const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
+ const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
+ const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
+ const MAX: usize = LazyStateID::MASK_MATCH - 1;
+
+ /// Create a new lazy state ID.
+ ///
+ /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
+ /// an error.
+ #[inline]
+ pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
+ if id > LazyStateID::MAX {
+ return Err(LazyStateIDError { attempted: id as u64 });
+ }
+ Ok(LazyStateID::new_unchecked(id))
+ }
+
+ /// Create a new lazy state ID without checking whether the given value
+ /// exceeds [`LazyStateID::MAX`].
+ ///
+ /// While this is unchecked, providing an incorrect value must never
+ /// sacrifice memory safety.
+ #[inline]
+ const fn new_unchecked(id: usize) -> LazyStateID {
+ LazyStateID(id as u32)
+ }
+
+ /// Return this lazy state ID as its raw value if and only if it is not
+ /// tagged (and thus not an unknown, dead, quit, start or match state ID).
+ #[inline]
+ pub(crate) fn as_usize(&self) -> Option<usize> {
+ if self.is_tagged() {
+ None
+ } else {
+ Some(self.as_usize_unchecked())
+ }
+ }
+
+ /// Return this lazy state ID as an untagged `usize`.
+ ///
+ /// If this lazy state ID is tagged, then the usize returned is the state
+ /// ID without the tag. If the ID was not tagged, then the usize returned
+ /// is equivalent to the state ID.
+ #[inline]
+ pub(crate) fn as_usize_untagged(&self) -> usize {
+ self.as_usize_unchecked() & LazyStateID::MAX
+ }
+
+ /// Return this lazy state ID as its raw internal `usize` value, which may
+    /// be tagged (and thus greater than `LazyStateID::MAX`).
+ #[inline]
+ pub(crate) const fn as_usize_unchecked(&self) -> usize {
+ self.0 as usize
+ }
+
+ #[inline]
+ pub(crate) const fn to_unknown(&self) -> LazyStateID {
+ LazyStateID::new_unchecked(
+ self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
+ )
+ }
+
+ #[inline]
+ pub(crate) const fn to_dead(&self) -> LazyStateID {
+ LazyStateID::new_unchecked(
+ self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
+ )
+ }
+
+ #[inline]
+ pub(crate) const fn to_quit(&self) -> LazyStateID {
+ LazyStateID::new_unchecked(
+ self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
+ )
+ }
+
+ /// Return this lazy state ID as a state ID that is tagged as a start
+ /// state.
+ #[inline]
+ pub(crate) const fn to_start(&self) -> LazyStateID {
+ LazyStateID::new_unchecked(
+ self.as_usize_unchecked() | LazyStateID::MASK_START,
+ )
+ }
+
+ /// Return this lazy state ID as a lazy state ID that is tagged as a match
+ /// state.
+ #[inline]
+ pub(crate) const fn to_match(&self) -> LazyStateID {
+ LazyStateID::new_unchecked(
+ self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
+ )
+ }
+
+ /// Return true if and only if this lazy state ID is tagged.
+ ///
+ /// When a lazy state ID is tagged, then one can conclude that it is one
+ /// of a match, start, dead, quit or unknown state.
+ #[inline]
+ pub const fn is_tagged(&self) -> bool {
+ self.as_usize_unchecked() > LazyStateID::MAX
+ }
+
+ /// Return true if and only if this represents a lazy state ID that is
+ /// "unknown." That is, the state has not yet been created. When a caller
+ /// sees this state ID, it generally means that a state has to be computed
+ /// in order to proceed.
+ #[inline]
+ pub const fn is_unknown(&self) -> bool {
+ self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
+ }
+
+ /// Return true if and only if this represents a dead state. A dead state
+ /// is a state that can never transition to any other state except the
+ /// dead state. When a dead state is seen, it generally indicates that a
+ /// search should stop.
+ #[inline]
+ pub const fn is_dead(&self) -> bool {
+ self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
+ }
+
+ /// Return true if and only if this represents a quit state. A quit state
+ /// is a state that is representationally equivalent to a dead state,
+ /// except it indicates the automaton has reached a point at which it can
+ /// no longer determine whether a match exists or not. In general, this
+ /// indicates an error during search and the caller must either pass this
+ /// error up or use a different search technique.
+ #[inline]
+ pub const fn is_quit(&self) -> bool {
+ self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
+ }
+
+ /// Return true if and only if this lazy state ID has been tagged as a
+ /// start state.
+ #[inline]
+ pub const fn is_start(&self) -> bool {
+ self.as_usize_unchecked() & LazyStateID::MASK_START > 0
+ }
+
+ /// Return true if and only if this lazy state ID has been tagged as a
+ /// match state.
+ #[inline]
+ pub const fn is_match(&self) -> bool {
+ self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
+ }
+}
+
+/// This error occurs when a lazy state ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum lazy state ID
+/// value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub(crate) struct LazyStateIDError {
+ attempted: u64,
+}
+
+impl LazyStateIDError {
+    /// Returns the value that failed to construct a lazy state ID.
+ pub(crate) fn attempted(&self) -> u64 {
+ self.attempted
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for LazyStateIDError {}
+
+impl core::fmt::Display for LazyStateIDError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create LazyStateID from {:?}, which exceeds {:?}",
+ self.attempted(),
+ LazyStateID::MAX,
+ )
+ }
+}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides no introspection capabilities. The only thing a caller
+/// can do is construct it and pass it around to permit search routines to use
+/// it to track state.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may result in incorrect results.
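+///
+/// # Example
+///
+/// A minimal sketch of the intended usage pattern (see the overlapping
+/// search examples on [`regex::Regex`](crate::hybrid::regex::Regex) for a
+/// complete search):
+///
+/// ```
+/// use regex_automata::{
+///     hybrid::{dfa, regex::Regex, OverlappingState},
+///     MatchKind,
+/// };
+///
+/// let re = Regex::builder()
+///     .dfa(dfa::Config::new().match_kind(MatchKind::All))
+///     .build(r"\w+")?;
+/// let mut cache = re.create_cache();
+/// // Each new overlapping search gets a fresh state, which is then
+/// // threaded through every successive call to `find_overlapping`.
+/// let mut state = OverlappingState::start();
+/// let _ = re.find_overlapping(&mut cache, b"@foo", &mut state);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```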
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+ /// The state ID of the state at which the search was in when the call
+ /// terminated. When this is a match state, `last_match` must be set to a
+ /// non-None value.
+ ///
+ /// A `None` value indicates the start state of the corresponding
+ /// automaton. We cannot use the actual ID, since any one automaton may
+ /// have many start states, and which one is in use depends on several
+ /// search-time factors.
+ id: Option<LazyStateID>,
+ /// Information associated with a match when `id` corresponds to a match
+ /// state.
+ last_match: Option<StateMatch>,
+}
+
+/// Internal state about the last match that occurred. This records both the
+/// offset of the match and the match index.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) struct StateMatch {
+ /// The index into the matching patterns for the current match state.
+ pub(crate) match_index: usize,
+ /// The offset in the haystack at which the match occurred. This is used
+ /// when reporting multiple matches at the same offset. That is, when
+ /// an overlapping search runs, the first thing it checks is whether it's
+ /// already in a match state, and if so, whether there are more patterns
+ /// to report as matches in that state. If so, it increments `match_index`
+ /// and returns the pattern and this offset. Once `match_index` exceeds the
+ /// number of matching patterns in the current state, the search continues.
+ pub(crate) offset: usize,
+}
+
+impl OverlappingState {
+ /// Create a new overlapping state that begins at the start state of any
+ /// automaton.
+ pub fn start() -> OverlappingState {
+ OverlappingState { id: None, last_match: None }
+ }
+
+ pub(crate) fn id(&self) -> Option<LazyStateID> {
+ self.id
+ }
+
+ pub(crate) fn set_id(&mut self, id: LazyStateID) {
+ self.id = Some(id);
+ }
+
+ pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
+ self.last_match.as_mut()
+ }
+
+ pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
+ self.last_match = Some(last_match);
+ }
+}
diff --git a/vendor/regex-automata/src/hybrid/mod.rs b/vendor/regex-automata/src/hybrid/mod.rs
new file mode 100644
index 000000000..4c8ca7ebe
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/mod.rs
@@ -0,0 +1,179 @@
+/*!
+A module for building and searching with lazy deterministic finite automata
+(DFAs).
+
+As with other modules in this crate, lazy DFAs support a rich regex syntax with
+Unicode features. The key feature of a lazy DFA is that it builds itself
+incrementally during search, and never uses more than a configured capacity of
+memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache"
+in which the actual DFA's transition table is stored.
+
+If you're looking for fully compiled DFAs, then please see the top-level
+[`dfa` module](crate::dfa).
+
+# Overview
+
+This section gives a brief overview of the primary types in this module:
+
+* A [`regex::Regex`] provides a way to search for matches of a regular
+expression using lazy DFAs. This includes iterating over matches with both the
+start and end positions of each match.
+* A [`dfa::DFA`] provides direct low level access to a lazy DFA.
+
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::{hybrid::regex::Regex, MultiMatch};
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let mut cache = re.create_cache();
+
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> =
+ re.find_leftmost_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(0, 0, 10),
+ MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: searching with regex sets
+
+The lazy DFAs in this module all fully support searching with multiple regexes
+simultaneously. You can use this support with standard leftmost-first style
+searching to find non-overlapping matches:
+
+```
+use regex_automata::{hybrid::regex::Regex, MultiMatch};
+
+let re = Regex::new_many(&[r"\w+", r"\S+"])?;
+let mut cache = re.create_cache();
+
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> =
+ re.find_leftmost_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(1, 0, 4),
+ MultiMatch::must(0, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Or use overlapping style searches to find all possible occurrences:
+
+```
+use regex_automata::{hybrid::{dfa, regex::Regex}, MatchKind, MultiMatch};
+
+// N.B. For overlapping searches, we need the underlying lazy DFA to report all
+// possible matches.
+let re = Regex::builder()
+ .dfa(dfa::Config::new().match_kind(MatchKind::All))
+ .build_many(&[r"\w{3}", r"\S{3}"])?;
+let mut cache = re.create_cache();
+
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> =
+ re.find_overlapping_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(1, 0, 3),
+ MultiMatch::must(0, 1, 4),
+ MultiMatch::must(1, 1, 4),
+ MultiMatch::must(0, 5, 8),
+ MultiMatch::must(1, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# When should I use this?
+
+Generally speaking, if you can abide the use of mutable state during search,
+and you don't need things like capturing groups or Unicode word boundary
+support in non-ASCII text, then a lazy DFA is likely a robust choice with
+respect to both search speed and memory usage. Note however that its speed
+may be worse than a general purpose regex engine if you don't select a good
+[prefilter](crate::util::prefilter).
+
+If you know ahead of time that your pattern would result in a very large DFA
+if it was fully compiled, it may be better to use an NFA simulation instead
+of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA
+to something that is big enough to hold the state machine (likely through
+experimentation). The issue here is that if the cache is too small, then it
+could wind up being reset too frequently and this might decrease searching
+speed significantly.
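+
+For example, a brief sketch of raising the cache capacity (the figure and
+pattern are illustrative):
+
+```
+use regex_automata::hybrid::dfa::DFA;
+
+// Give the lazy DFA up to 10 MiB for its transition table cache.
+let dfa = DFA::builder()
+    .configure(DFA::config().cache_capacity(10 * (1 << 20)))
+    .build(r"\w{20}")?;
+# Ok::<(), Box<dyn std::error::Error>>(())
+```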
+
+# Differences with fully compiled DFAs
+
+A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a
+[`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities
+(and similarly for their underlying DFAs), but they achieve them through
+different means. The main difference is that a hybrid or "lazy" regex builds
+its DFA lazily during search, whereas a fully compiled regex builds its
+DFA at construction time. While building a DFA at search time might sound
+slow, it tends to work out in practice: most bytes seen during a search
+reuse pre-built parts of the DFA, and thus it can be almost as fast as a fully
+compiled DFA. The main downside is that searching requires mutable space to
+store the DFA, and, in the worst case, a search can result in a new state being
+created for each byte seen, which would make searching quite a bit slower.
+
+A fully compiled DFA never has to worry about searches being slower once
+it's built. (Aside from, say, the transition table being so large that it
+is subject to harsh CPU cache effects.) However, of course, building a full
+DFA can be quite time consuming and memory hungry, particularly since it's
+so easy to build large DFAs when Unicode mode is enabled.
+
+A lazy DFA strikes a nice balance _in practice_, particularly in the
+presence of Unicode mode, by only building what is needed. It avoids the
+worst case exponential time complexity of DFA compilation by guaranteeing
+that it builds at most one state per byte searched. While the worst
+case here can lead to a very high constant, it will never be exponential.
+
+# Syntax
+
+This module supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
+
+There are two things that are not supported by the lazy DFAs in this module:
+
+* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
+of them) can only find the offsets of an entire match, but cannot resolve
+the offsets of each capturing group. This is because DFAs do not have the
+expressive power necessary.
+* Unicode word boundaries. These present particularly difficult challenges for
+DFA construction and would result in an explosion in the number of states.
+One can enable [`dfa::Config::unicode_word_boundary`] though, which provides
+heuristic support for Unicode word boundaries that only works on ASCII text.
+Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
+on any input.
+
+There are no plans to lift either of these limitations.
+
+Note that these restrictions are identical to the restrictions on fully
+compiled DFAs.
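+
+For instance, a brief sketch of the word boundary restriction in practice
+(the pattern is illustrative):
+
+```
+use regex_automata::hybrid::regex::Regex;
+
+// A Unicode-aware word boundary fails to build by default...
+assert!(Regex::new(r"\bfoo\b").is_err());
+// ...while an ASCII word boundary works on any input.
+assert!(Regex::new(r"(?-u:\b)foo(?-u:\b)").is_ok());
+```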
+
+# Support for `alloc`-only
+
+This crate comes with `alloc` and `std` features that are enabled by default.
+One can disable the `std` feature and still use the full API of a lazy DFA.
+(You should use `std` when possible, since it permits providing implementations
+of the `std::error::Error` trait, and enables some minor internal
+optimizations.)
+
+This module does require at least the `alloc` feature though. It is not
+available in any capacity without `alloc`.
+*/
+
+pub use self::{
+ error::{BuildError, CacheError},
+ id::{LazyStateID, OverlappingState},
+};
+
+pub mod dfa;
+mod error;
+mod id;
+pub mod regex;
+mod search;
diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs
new file mode 100644
index 000000000..7cc6b9064
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/regex.rs
@@ -0,0 +1,2124 @@
+/*!
+A lazy DFA backed `Regex`.
+
+This module provides a [`Regex`] backed by lazy DFAs. A `Regex` implements convenience
+routines you might have come to expect, such as finding a match and iterating
+over all non-overlapping matches. This `Regex` type is limited in its
+capabilities to what a lazy DFA can provide. Therefore, APIs involving
+capturing groups, for example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, whereas the other is a "reverse" DFA that
+finds the start offset of a match.
+
+See the [parent module](crate::hybrid) for examples.
+*/
+
+use core::borrow::Borrow;
+
+use alloc::boxed::Box;
+
+use crate::{
+ hybrid::{
+ dfa::{self, DFA},
+ error::BuildError,
+ OverlappingState,
+ },
+ nfa::thompson,
+ util::{
+ matchtypes::{MatchError, MatchKind, MultiMatch},
+ prefilter::{self, Prefilter},
+ },
+};
+
+/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
+/// for searching.
+///
+/// A regular expression is composed of two lazy DFAs, a "forward" DFA and a
+/// "reverse" DFA. The forward DFA is responsible for detecting the end of
+/// a match while the reverse DFA is responsible for detecting the start
+/// of a match. Thus, in order to find the bounds of any given match, a
+/// forward search must first be run followed by a reverse search. A match
+/// found by the forward DFA guarantees that the reverse DFA will also find
+/// a match.
+///
+/// A `Regex` can also have a prefilter set via the
+/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is
+/// enabled.
+///
+/// # Earliest vs Leftmost vs Overlapping
+///
+/// The search routines exposed on a `Regex` reflect three different ways
+/// of searching:
+///
+/// * "earliest" means to stop as soon as a match has been detected.
+/// * "leftmost" means to continue matching until the underlying
+/// automaton cannot advance. This reflects "standard" searching you
+/// might be used to in other regex engines. e.g., This permits
+/// non-greedy and greedy searching to work as you would expect.
+/// * "overlapping" means to find all possible matches, even if they
+/// overlap.
+///
+/// Generally speaking, when doing an overlapping search, you'll want to
+/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using
+/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
+/// likely to lead to odd behavior since `LeftmostFirst` specifically omits
+/// some matches that can never be reported due to its semantics.
+///
+/// The following example shows the differences between how these different
+/// types of searches impact looking for matches of `[a-z]+` in the
+/// haystack `abc`.
+///
+/// ```
+/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch};
+///
+/// let pattern = r"[a-z]+";
+/// let haystack = "abc".as_bytes();
+///
+/// // With leftmost-first semantics, we test "earliest" and "leftmost".
+/// let re = regex::Builder::new()
+/// .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst))
+/// .build(pattern)?;
+/// let mut cache = re.create_cache();
+///
+/// // "earliest" searching isn't impacted by greediness
+/// let mut it = re.find_earliest_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// // "leftmost" searching supports greediness (and non-greediness)
+/// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// // For overlapping, we want "all" match kind semantics.
+/// let re = regex::Builder::new()
+/// .dfa(dfa::Config::new().match_kind(MatchKind::All))
+/// .build(pattern)?;
+/// let mut cache = re.create_cache();
+///
+/// // In the overlapping search, we find all three possible matches
+/// // starting at the beginning of the haystack.
+/// let mut it = re.find_overlapping_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Fallibility
+///
+/// In non-default configurations, the lazy DFAs generated in this module may
+/// return an error during a search. (Currently, the only way this happens is
+/// if quit bytes are added, Unicode word boundaries are heuristically enabled,
+/// or if the cache is configured to "give up" on a search if it has been
+/// cleared too many times. All of these are turned off by default, which means
+/// a search can never fail in the default configuration.) For convenience,
+/// the main search routines, like [`find_leftmost`](Regex::find_leftmost),
+/// will panic if an error occurs. However, if you need to use DFAs which may
+/// produce an error at search time, then there are fallible equivalents of
+/// all search routines. For example, for `find_leftmost`, its fallible analog
+/// is [`try_find_leftmost`](Regex::try_find_leftmost). The routines prefixed
+/// with `try_` return `Result<Option<MultiMatch>, MatchError>`, whereas the
+/// infallible routines simply return `Option<MultiMatch>`.
+///
+/// # Example
+///
+/// This example shows how to cause a search to terminate if it sees a
+/// `\n` byte, and handle the error returned. This could be useful if, for
+/// example, you wanted to prevent a user supplied pattern from matching
+/// across a line boundary.
+///
+/// ```
+/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError};
+///
+/// let re = Regex::builder()
+/// .dfa(dfa::Config::new().quit(b'\n', true))
+/// .build(r"foo\p{any}+bar")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = "foo\nbar".as_bytes();
+/// // Normally this would produce a match, since \p{any} contains '\n'.
+/// // But since we instructed the automaton to enter a quit state if a
+/// // '\n' is observed, this produces a match error instead.
+/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err();
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Debug)]
+pub struct Regex {
+ /// An optional prefilter that is passed down to the lazy DFA search
+ /// routines when present. By default, no prefilter is set.
+ pre: Option<Box<dyn Prefilter>>,
+ /// The forward lazy DFA. This can only find the end of a match.
+ forward: DFA,
+ /// The reverse lazy DFA. This can only find the start of a match.
+ ///
+ /// This is built with 'all' match semantics (instead of leftmost-first)
+ /// so that it always finds the longest possible match (which corresponds
+ /// to the leftmost starting position). It is also compiled as an anchored
+ /// matcher and has 'starts_for_each_pattern' enabled. Including starting
+ /// states for each pattern is necessary to ensure that we only look for
+ /// matches of a pattern that matched in the forward direction. Otherwise,
+ /// we might wind up finding the "leftmost" starting position of a totally
+ /// different pattern!
+ reverse: DFA,
+ /// Whether iterators on this type should advance by one codepoint or one
+ /// byte when an empty match is seen.
+ utf8: bool,
+}
+
+/// Convenience routines for regex and cache construction.
+impl Regex {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 3, 14)),
+ /// re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<Regex, BuildError> {
+ Regex::builder().build(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "regex set."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ ///
+ /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let mut it = re.find_leftmost_iter(
+ /// &mut cache,
+ /// b"abc 1 foo 4567 0 quux",
+ /// );
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
+ Regex::builder().build_many(patterns)
+ }
+
+ /// Return a default configuration for a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a regex.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to disable UTF-8 mode for `Regex` iteration.
+ /// When UTF-8 mode is disabled, the position immediately following an
+ /// empty match is where the next search begins, instead of the next
+ /// position of a UTF-8 encoded codepoint.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8(false))
+ /// .build(r"")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "a☃z".as_bytes();
+ /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::regex::Regex,
+ /// nfa::thompson,
+ /// MultiMatch, SyntaxConfig,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8(false))
+ /// .syntax(SyntaxConfig::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(MultiMatch::must(0, 1, 9));
+ /// let got = re.find_leftmost(&mut cache, haystack);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Create a new cache for this `Regex`.
+ ///
+ /// The cache returned should only be used for searches for this
+ /// `Regex`. If you want to reuse the cache for another `Regex`, then
+ /// you must call [`Cache::reset`] with that `Regex` (or, equivalently,
+ /// [`Regex::reset_cache`]).
+ pub fn create_cache(&self) -> Cache {
+ Cache::new(self)
+ }
+
+    /// Reset the given cache such that it can be used for searching with
+    /// this `Regex` (and only this `Regex`).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different `Regex`.
+ ///
+ /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+ /// `Regex` has been configured to "give up" after it has cleared the cache
+ /// a certain number of times.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different `Regex`.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re1 = Regex::new(r"\w")?;
+ /// let re2 = Regex::new(r"\W")?;
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 2)),
+ /// re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the Regex we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// re2.reset_cache(&mut cache);
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 3)),
+ /// re2.find_leftmost(&mut cache, "☃".as_bytes()),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset_cache(&self, cache: &mut Cache) {
+ self.forward().reset_cache(&mut cache.forward);
+ self.reverse().reset_cache(&mut cache.reverse);
+ }
+}
+
+/// Standard infallible search routines for finding and iterating over matches.
+impl Regex {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_is_match`](Regex::try_is_match).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::hybrid::regex::Regex;
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar"));
+ /// assert_eq!(false, re.is_match(&mut cache, b"foobar"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool {
+ self.try_is_match(cache, haystack).unwrap()
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_earliest`](Regex::try_find_earliest).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ ///
+ /// // Normally, the leftmost first match would greedily consume as many
+ /// // decimal digits as it could. But a match is detected as soon as one
+ /// // digit is seen.
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 4)),
+ /// re.find_earliest(&mut cache, b"foo12345"),
+ /// );
+ ///
+ /// // Normally, the end of the leftmost first match here would be 3,
+ /// // but the "earliest" match semantics detect a match earlier.
+ /// let re = Regex::new("abc|a")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 1)),
+ /// re.find_earliest(&mut cache, b"abc"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_earliest(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ ) -> Option<MultiMatch> {
+ self.try_find_earliest(cache, haystack).unwrap()
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_leftmost`](Regex::try_find_leftmost).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ ///
+ /// // Greediness is applied appropriately when compared to find_earliest.
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 3, 11)),
+ /// re.find_leftmost(&mut cache, b"zzzfoo12345zzz"),
+ /// );
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the default leftmost-first match semantics demand that we find the
+ /// // earliest match that prefers earlier parts of the pattern over latter
+ /// // parts.
+ /// let re = Regex::new("abc|a")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 3)),
+ /// re.find_leftmost(&mut cache, b"abc"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_leftmost(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ ) -> Option<MultiMatch> {
+ self.try_find_leftmost(cache, haystack).unwrap()
+ }
+
+ /// Search for the first overlapping match in `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_overlapping`](Regex::try_find_overlapping).
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run an overlapping search with multiple
+ /// regexes.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, regex::Regex, OverlappingState},
+ /// MatchKind,
+ /// MultiMatch,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ /// .dfa(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(MultiMatch::must(1, 0, 4));
+ /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+ /// // pattern begins its match before the first, is therefore an earlier
+ /// // match and is thus reported first.
+ /// let expected = Some(MultiMatch::must(0, 1, 4));
+ /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_overlapping(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ state: &mut OverlappingState,
+ ) -> Option<MultiMatch> {
+ self.try_find_overlapping(cache, haystack, state).unwrap()
+ }
+
+ /// Returns an iterator over all non-overlapping "earliest" matches.
+ ///
+ /// Match positions are reported as soon as a match is known to occur, even
+ /// if the standard leftmost match would be longer.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run an "earliest" iterator.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::new("[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let haystack = "123".as_bytes();
+ ///
+ /// // Normally, a standard leftmost iterator would return a single
+ /// // match, but since "earliest" detects matches earlier, we get
+ /// // three matches.
+ /// let mut it = re.find_earliest_iter(&mut cache, haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_earliest_iter<'r, 'c, 't>(
+ &'r self,
+ cache: &'c mut Cache,
+ haystack: &'t [u8],
+ ) -> FindEarliestMatches<'r, 'c, 't> {
+ FindEarliestMatches::new(self, cache, haystack)
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
+ ///
+ /// This corresponds to the "standard" regex search iterator.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let text = b"foo1 foo12 foo123";
+ /// let matches: Vec<MultiMatch> = re
+ /// .find_leftmost_iter(&mut cache, text)
+ /// .collect();
+ /// assert_eq!(matches, vec![
+ /// MultiMatch::must(0, 0, 4),
+ /// MultiMatch::must(0, 5, 10),
+ /// MultiMatch::must(0, 11, 17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_leftmost_iter<'r, 'c, 't>(
+ &'r self,
+ cache: &'c mut Cache,
+ haystack: &'t [u8],
+ ) -> FindLeftmostMatches<'r, 'c, 't> {
+ FindLeftmostMatches::new(self, cache, haystack)
+ }
+
+ /// Returns an iterator over all overlapping matches in the given haystack.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// The iterator takes care of handling the overlapping state that must be
+ /// threaded through every search.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run an overlapping search with multiple
+ /// regexes.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, regex::Regex},
+ /// MatchKind,
+ /// MultiMatch,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ /// .dfa(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let mut cache = re.create_cache();
+ /// let haystack = "@foo".as_bytes();
+ ///
+ /// let mut it = re.find_overlapping_iter(&mut cache, haystack);
+ /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_overlapping_iter<'r, 'c, 't>(
+ &'r self,
+ cache: &'c mut Cache,
+ haystack: &'t [u8],
+ ) -> FindOverlappingMatches<'r, 'c, 't> {
+ FindOverlappingMatches::new(self, cache, haystack)
+ }
+}
+
+/// Lower level infallible search routines that permit controlling where
+/// the search starts and ends in a particular sequence. This is useful for
+/// executing searches that need to take surrounding context into account. This
+/// is required for correctly implementing iteration because of look-around
+/// operators (`^`, `$`, `\b`).
+impl Regex {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short-circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_is_match_at`](Regex::try_is_match_at).
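+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of how an "at" search takes surrounding context into
+ /// account, unlike searching a subslice. (The pattern uses ASCII word
+ /// boundaries, which lazy DFAs support by default.)
+ ///
+ /// ```
+ /// use regex_automata::hybrid::regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?-u:\b)foo(?-u:\b)")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // Searching the subslice in isolation finds a match...
+ /// assert!(re.is_match(&mut cache, b"foo"));
+ /// // ... but an "at" search sees the surrounding word characters, so
+ /// // the word boundaries do not match.
+ /// assert!(!re.is_match_at(&mut cache, b"barfoobar", 3, 6));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```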
+ pub fn is_match_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> bool {
+ self.try_is_match_at(cache, haystack, start, end).unwrap()
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
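+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of an "earliest" search over an explicit range of the
+ /// haystack:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::new("[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // An "earliest" search reports a match as soon as one is known, so
+ /// // only the first digit is included here.
+ /// let haystack = b"foo123";
+ /// let expected = Some(MultiMatch::must(0, 3, 4));
+ /// let got = re.find_earliest_at(&mut cache, haystack, 0, haystack.len());
+ /// assert_eq!(expected, got);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```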
+ pub fn find_earliest_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Option<MultiMatch> {
+ self.try_find_earliest_at(cache, haystack, start, end).unwrap()
+ }
+
+ /// Returns the same as `find_leftmost`, but starts the search at the given
+ /// offset.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches within the
+ /// same haystack, which cannot be done correctly by simply providing a
+ /// subslice of `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
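+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of why an "at" search differs from slicing: `^` only
+ /// matches at the start of the full haystack, not at the start of the
+ /// searched range.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::new(r"^bar")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = b"foobar";
+ /// // Searching the subslice directly reports a match...
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 3)),
+ /// re.find_leftmost(&mut cache, &haystack[3..]),
+ /// );
+ /// // ... but the "at" search knows that offset 3 is not the start of
+ /// // the haystack, so `^` cannot match there.
+ /// assert_eq!(None, re.find_leftmost_at(&mut cache, haystack, 3, 6));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```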
+ pub fn find_leftmost_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Option<MultiMatch> {
+ self.try_find_leftmost_at(cache, haystack, start, end).unwrap()
+ }
+
+ /// Search for the first overlapping match within a given range of
+ /// `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying lazy DFAs return an error, then this routine panics.
+ /// This only occurs in non-default configurations where quit bytes are
+ /// used, Unicode word boundaries are heuristically enabled or limits are
+ /// set on the number of times the lazy DFA's cache may be cleared.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
+ pub fn find_overlapping_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Option<MultiMatch> {
+ self.try_find_overlapping_at(cache, haystack, start, end, state)
+ .unwrap()
+ }
+}
+
+/// Fallible search routines. These may return an error when the underlying
+/// lazy DFAs have been configured in a way that permits them to fail during a
+/// search.
+///
+/// Errors during search only occur when the lazy DFA has been explicitly
+/// configured to do so, usually by specifying one or more "quit" bytes or by
+/// heuristically enabling Unicode word boundaries.
+///
+/// Errors will never be returned using the default configuration. So these
+/// fallible routines are only needed for particular configurations.
+impl Regex {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short-circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`is_match`](Regex::is_match).
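+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of a configuration that can fail at search time, using
+ /// a quit byte:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, regex::Regex},
+ /// MatchError,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ /// .dfa(DFA::config().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // The search gives up as soon as it sees the quit byte.
+ /// let expected = MatchError::Quit { byte: b'\n', offset: 3 };
+ /// let got = re.try_is_match(&mut cache, b"foo\nbar").unwrap_err();
+ /// assert_eq!(expected, got);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```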
+ pub fn try_is_match(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ ) -> Result<bool, MatchError> {
+ self.try_is_match_at(cache, haystack, 0, haystack.len())
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_earliest`](Regex::find_earliest).
+ pub fn try_find_earliest(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_earliest_at(cache, haystack, 0, haystack.len())
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_leftmost`](Regex::find_leftmost).
+ pub fn try_find_leftmost(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_leftmost_at(cache, haystack, 0, haystack.len())
+ }
+
+ /// Search for the first overlapping match in `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_overlapping`](Regex::find_overlapping).
+ pub fn try_find_overlapping(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ state: &mut OverlappingState,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state)
+ }
+
+ /// Returns an iterator over all non-overlapping "earliest" matches.
+ ///
+ /// Match positions are reported as soon as a match is known to occur, even
+ /// if the standard leftmost match would be longer.
+ ///
+ /// # Errors
+ ///
+ /// This iterator only yields errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_earliest_iter`](Regex::find_earliest_iter).
+ pub fn try_find_earliest_iter<'r, 'c, 't>(
+ &'r self,
+ cache: &'c mut Cache,
+ haystack: &'t [u8],
+ ) -> TryFindEarliestMatches<'r, 'c, 't> {
+ TryFindEarliestMatches::new(self, cache, haystack)
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
+ ///
+ /// This corresponds to the "standard" regex search iterator.
+ ///
+ /// # Errors
+ ///
+ /// This iterator only yields errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
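+ ///
+ /// # Example
+ ///
+ /// A short sketch of collecting from a fallible iterator, where any error
+ /// aborts the collection:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MatchError, MultiMatch};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let text = b"foo1 foo12";
+ /// let matches: Result<Vec<MultiMatch>, MatchError> =
+ /// re.try_find_leftmost_iter(&mut cache, text).collect();
+ /// assert_eq!(matches, Ok(vec![
+ /// MultiMatch::must(0, 0, 4),
+ /// MultiMatch::must(0, 5, 10),
+ /// ]));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```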
+ pub fn try_find_leftmost_iter<'r, 'c, 't>(
+ &'r self,
+ cache: &'c mut Cache,
+ haystack: &'t [u8],
+ ) -> TryFindLeftmostMatches<'r, 'c, 't> {
+ TryFindLeftmostMatches::new(self, cache, haystack)
+ }
+
+ /// Returns an iterator over all overlapping matches in the given haystack.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// The iterator takes care of handling the overlapping state that must be
+ /// threaded through every search.
+ ///
+ /// # Errors
+ ///
+ /// This iterator only yields errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
+ pub fn try_find_overlapping_iter<'r, 'c, 't>(
+ &'r self,
+ cache: &'c mut Cache,
+ haystack: &'t [u8],
+ ) -> TryFindOverlappingMatches<'r, 'c, 't> {
+ TryFindOverlappingMatches::new(self, cache, haystack)
+ }
+}
+
+/// Lower level fallible search routines that permit controlling where the
+/// search starts and ends in a particular sequence.
+impl Regex {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short-circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`is_match_at`](Regex::is_match_at).
+ pub fn try_is_match_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<bool, MatchError> {
+ self.forward()
+ .find_leftmost_fwd_at(
+ &mut cache.forward,
+ self.scanner().as_mut(),
+ None,
+ haystack,
+ start,
+ end,
+ )
+ .map(|x| x.is_some())
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_earliest_at`](Regex::find_earliest_at).
+ pub fn try_find_earliest_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_earliest_at_imp(
+ self.scanner().as_mut(),
+ cache,
+ haystack,
+ start,
+ end,
+ )
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_leftmost_at`](Regex::find_leftmost_at).
+ pub fn try_find_leftmost_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_leftmost_at_imp(
+ self.scanner().as_mut(),
+ cache,
+ haystack,
+ start,
+ end,
+ )
+ }
+
+ /// Search for the first overlapping match within a given range of
+ /// `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&haystack[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_overlapping_at`](Regex::find_overlapping_at).
+ pub fn try_find_overlapping_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_overlapping_at_imp(
+ self.scanner().as_mut(),
+ cache,
+ haystack,
+ start,
+ end,
+ state,
+ )
+ }
+}
+
+impl Regex {
+ #[inline(always)]
+ fn try_find_earliest_at_imp(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ let (fdfa, rdfa) = (self.forward(), self.reverse());
+ let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+ let end = match fdfa
+ .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)?
+ {
+ None => return Ok(None),
+ Some(end) => end,
+ };
+ // N.B. The only time we need to tell the reverse searcher the pattern
+ // to match is in the overlapping case, since it's ambiguous. In the
+ // earliest case, I have tentatively convinced myself that it isn't
+ // necessary and the reverse search will always find the same pattern
+ // to match as the forward search. But I lack a rigorous proof. Why not
+ // just provide the pattern anyway? Well, if it is needed, then leaving
+ // it out gives us a chance to find a witness.
+ let start = rdfa
+ .find_earliest_rev_at(rcache, None, haystack, start, end.offset())?
+ .expect("reverse search must match if forward search does");
+ assert_eq!(
+ start.pattern(),
+ end.pattern(),
+ "forward and reverse search must match same pattern",
+ );
+ assert!(start.offset() <= end.offset());
+ Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ }
+
+ #[inline(always)]
+ fn try_find_leftmost_at_imp(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ let (fdfa, rdfa) = (self.forward(), self.reverse());
+ let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+ let end = match fdfa
+ .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)?
+ {
+ None => return Ok(None),
+ Some(end) => end,
+ };
+ // N.B. The only time we need to tell the reverse searcher the pattern
+ // to match is in the overlapping case, since it's ambiguous. In the
+ // leftmost case, I have tentatively convinced myself that it isn't
+ // necessary and the reverse search will always find the same pattern
+ // to match as the forward search. But I lack a rigorous proof. Why not
+ // just provide the pattern anyway? Well, if it is needed, then leaving
+ // it out gives us a chance to find a witness.
+ let start = rdfa
+ .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())?
+ .expect("reverse search must match if forward search does");
+ assert_eq!(
+ start.pattern(),
+ end.pattern(),
+ "forward and reverse search must match same pattern",
+ );
+ assert!(start.offset() <= end.offset());
+ Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ }
+
+ #[inline(always)]
+ fn try_find_overlapping_at_imp(
+ &self,
+ pre: Option<&mut prefilter::Scanner>,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ let (fdfa, rdfa) = (self.forward(), self.reverse());
+ let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+ let end = match fdfa.find_overlapping_fwd_at(
+ fcache, pre, None, haystack, start, end, state,
+ )? {
+ None => return Ok(None),
+ Some(end) => end,
+ };
+ // Unlike the leftmost cases, the reverse overlapping search may match
+ // a different pattern than the forward search. (Tests fail if `None`
+ // is used instead of `Some(end.pattern())` below.) Thus, we must
+ // run our reverse search using the pattern that matched in the forward
+ // direction.
+ let start = rdfa
+ .find_leftmost_rev_at(
+ rcache,
+ Some(end.pattern()),
+ haystack,
+ 0,
+ end.offset(),
+ )?
+ .expect("reverse search must match if forward search does");
+ assert_eq!(
+ start.pattern(),
+ end.pattern(),
+ "forward and reverse search must match same pattern",
+ );
+ assert!(start.offset() <= end.offset());
+ Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ }
+}
+
+/// Non-search APIs for querying information about the regex and setting a
+/// prefilter.
+impl Regex {
+ /// Return the underlying lazy DFA responsible for forward matching.
+ ///
+ /// This is useful for accessing the underlying lazy DFA and using it
+ /// directly if the situation calls for it.
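+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of using the forward lazy DFA directly to find only
+ /// the end offset of a match:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, HalfMatch};
+ ///
+ /// let re = Regex::new("[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let (fcache, _) = cache.as_parts_mut();
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = re.forward().find_leftmost_fwd(fcache, b"abc123")?;
+ /// assert_eq!(expected, got);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```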
+ pub fn forward(&self) -> &DFA {
+ &self.forward
+ }
+
+ /// Return the underlying lazy DFA responsible for reverse matching.
+ ///
+ /// This is useful for accessing the underlying lazy DFA and using it
+ /// directly if the situation calls for it.
+ pub fn reverse(&self) -> &DFA {
+ &self.reverse
+ }
+
+ /// Returns the total number of patterns matched by this regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ ///
+ /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
+ /// assert_eq!(3, re.pattern_count());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_count(&self) -> usize {
+ assert_eq!(
+ self.forward().pattern_count(),
+ self.reverse().pattern_count()
+ );
+ self.forward().pattern_count()
+ }
+
+ /// Convenience function for returning this regex's prefilter as a trait
+ /// object.
+ ///
+ /// If this regex doesn't have a prefilter, then `None` is returned.
+ pub fn prefilter(&self) -> Option<&dyn Prefilter> {
+ self.pre.as_ref().map(|x| &**x)
+ }
+
+ /// Attach the given prefilter to this regex.
+ pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) {
+ self.pre = pre;
+ }
+
+ /// Convenience function for returning a prefilter scanner.
+ fn scanner(&self) -> Option<prefilter::Scanner> {
+ self.prefilter().map(prefilter::Scanner::new)
+ }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches can be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>);
+
+impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> {
+ fn new(
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ text: &'t [u8],
+ ) -> FindEarliestMatches<'r, 'c, 't> {
+ FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text))
+ }
+}
+
+impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> {
+ type Item = MultiMatch;
+
+ fn next(&mut self) -> Option<MultiMatch> {
+ next_unwrap(self.0.next())
+ }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches can be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>);
+
+impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
+ fn new(
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ text: &'t [u8],
+ ) -> FindLeftmostMatches<'r, 'c, 't> {
+ FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text))
+ }
+}
+
+impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
+ type Item = MultiMatch;
+
+ fn next(&mut self) -> Option<MultiMatch> {
+ next_unwrap(self.0.next())
+ }
+}
+
+/// An iterator over all overlapping matches for a particular infallible
+/// search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches can be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindOverlappingMatches<'r, 'c, 't>(
+ TryFindOverlappingMatches<'r, 'c, 't>,
+);
+
+impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> {
+ fn new(
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ text: &'t [u8],
+ ) -> FindOverlappingMatches<'r, 'c, 't> {
+ FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text))
+ }
+}
+
+impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> {
+ type Item = MultiMatch;
+
+ fn next(&mut self) -> Option<MultiMatch> {
+ next_unwrap(self.0.next())
+ }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches can be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindEarliestMatches<'r, 'c, 't> {
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ scanner: Option<prefilter::Scanner<'r>>,
+ text: &'t [u8],
+ last_end: usize,
+ last_match: Option<usize>,
+}
+
+impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> {
+ fn new(
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ text: &'t [u8],
+ ) -> TryFindEarliestMatches<'r, 'c, 't> {
+ let scanner = re.scanner();
+ TryFindEarliestMatches {
+ re,
+ cache,
+ scanner,
+ text,
+ last_end: 0,
+ last_match: None,
+ }
+ }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> {
+ type Item = Result<MultiMatch, MatchError>;
+
+ fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+ if self.last_end > self.text.len() {
+ return None;
+ }
+ let result = self.re.try_find_earliest_at_imp(
+ self.scanner.as_mut(),
+ self.cache,
+ self.text,
+ self.last_end,
+ self.text.len(),
+ );
+ let m = match result {
+ Err(err) => return Some(Err(err)),
+ Ok(None) => return None,
+ Ok(Some(m)) => m,
+ };
+ if m.is_empty() {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ self.last_end = if self.re.utf8 {
+ crate::util::next_utf8(self.text, m.end())
+ } else {
+ m.end() + 1
+ };
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match.
+ if Some(m.end()) == self.last_match {
+ return self.next();
+ }
+ } else {
+ self.last_end = m.end();
+ }
+ self.last_match = Some(m.end());
+ Some(Ok(m))
+ }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches can be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindLeftmostMatches<'r, 'c, 't> {
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ scanner: Option<prefilter::Scanner<'r>>,
+ text: &'t [u8],
+ last_end: usize,
+ last_match: Option<usize>,
+}
+
+impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> {
+ fn new(
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ text: &'t [u8],
+ ) -> TryFindLeftmostMatches<'r, 'c, 't> {
+ let scanner = re.scanner();
+ TryFindLeftmostMatches {
+ re,
+ cache,
+ scanner,
+ text,
+ last_end: 0,
+ last_match: None,
+ }
+ }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> {
+ type Item = Result<MultiMatch, MatchError>;
+
+ fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+ if self.last_end > self.text.len() {
+ return None;
+ }
+ let result = self.re.try_find_leftmost_at_imp(
+ self.scanner.as_mut(),
+ self.cache,
+ self.text,
+ self.last_end,
+ self.text.len(),
+ );
+ let m = match result {
+ Err(err) => return Some(Err(err)),
+ Ok(None) => return None,
+ Ok(Some(m)) => m,
+ };
+ if m.is_empty() {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ self.last_end = if self.re.utf8 {
+ crate::util::next_utf8(self.text, m.end())
+ } else {
+ m.end() + 1
+ };
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match.
+ if Some(m.end()) == self.last_match {
+ return self.next();
+ }
+ } else {
+ self.last_end = m.end();
+ }
+ self.last_match = Some(m.end());
+ Some(Ok(m))
+ }
+}
+
+/// An iterator over all overlapping matches for a particular fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches can be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindOverlappingMatches<'r, 'c, 't> {
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ scanner: Option<prefilter::Scanner<'r>>,
+ text: &'t [u8],
+ last_end: usize,
+ state: OverlappingState,
+}
+
+impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> {
+ fn new(
+ re: &'r Regex,
+ cache: &'c mut Cache,
+ text: &'t [u8],
+ ) -> TryFindOverlappingMatches<'r, 'c, 't> {
+ let scanner = re.scanner();
+ TryFindOverlappingMatches {
+ re,
+ cache,
+ scanner,
+ text,
+ last_end: 0,
+ state: OverlappingState::start(),
+ }
+ }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> {
+ type Item = Result<MultiMatch, MatchError>;
+
+ fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+ if self.last_end > self.text.len() {
+ return None;
+ }
+ let result = self.re.try_find_overlapping_at_imp(
+ self.scanner.as_mut(),
+ self.cache,
+ self.text,
+ self.last_end,
+ self.text.len(),
+ &mut self.state,
+ );
+ let m = match result {
+ Err(err) => return Some(Err(err)),
+ Ok(None) => return None,
+ Ok(Some(m)) => m,
+ };
+ // Unlike the non-overlapping case, we're OK with empty matches at this
+ // level. In particular, the overlapping search algorithm is itself
+ // responsible for ensuring that progress is always made.
+ self.last_end = m.end();
+ Some(Ok(m))
+ }
+}
+
+/// A cache represents a partially computed forward and reverse DFA.
+///
+/// A cache is the key component that differentiates a classical DFA from a
+/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
+/// complete transition table that can handle all possible inputs, a hybrid
+/// NFA/DFA starts with an empty transition table and builds only the parts
+/// required during search. The parts that are built are stored in a cache. For
+/// this reason, a cache is a required parameter for nearly every operation on
+/// a [`Regex`].
+///
+/// Caches can be created from their corresponding `Regex` via
+/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
+/// that created it, or the `Regex` that was most recently used to reset it
+/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
+/// panics or incorrect results.
+#[derive(Debug, Clone)]
+pub struct Cache {
+ forward: dfa::Cache,
+ reverse: dfa::Cache,
+}
+
+impl Cache {
+ /// Create a new cache for the given `Regex`.
+ ///
+ /// The cache returned should only be used for searches for the given
+ /// `Regex`. If you want to reuse the cache for another `Regex`, then you
+ /// must call [`Cache::reset`] with that `Regex`.
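+ ///
+ /// # Example
+ ///
+ /// A short sketch; this is equivalent to [`Regex::create_cache`]:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::regex::{Cache, Regex};
+ ///
+ /// let re = Regex::new(r"yes|no")?;
+ /// let mut cache = Cache::new(&re);
+ /// assert!(re.is_match(&mut cache, b"yes"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```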
+ pub fn new(re: &Regex) -> Cache {
+ let forward = dfa::Cache::new(re.forward());
+ let reverse = dfa::Cache::new(re.reverse());
+ Cache { forward, reverse }
+ }
+
+ /// Reset this cache such that it can be used for searching with the given
+ /// `Regex` (and only that `Regex`).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different `Regex`.
+ ///
+ /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+ /// `Regex` has been configured to "give up" after it has cleared the cache
+ /// a certain number of times.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different `Regex`.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re1 = Regex::new(r"\w")?;
+ /// let re2 = Regex::new(r"\W")?;
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 2)),
+ /// re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the Regex we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// cache.reset(&re2);
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 0, 3)),
+ /// re2.find_leftmost(&mut cache, "☃".as_bytes()),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset(&mut self, re: &Regex) {
+ self.forward.reset(re.forward());
+ self.reverse.reset(re.reverse());
+ }
+
+ /// Returns the heap memory usage, in bytes, as a sum of the forward and
+ /// reverse lazy DFA caches.
+ ///
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
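+ ///
+ /// # Example
+ ///
+ /// A small sketch; the exact number depends on how much of the lazy DFAs
+ /// has been built by prior searches:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::regex::Regex;
+ ///
+ /// let re = Regex::new(r"\w+")?;
+ /// let mut cache = re.create_cache();
+ /// assert!(re.is_match(&mut cache, b"hello"));
+ /// // Searching lazily builds out transition tables, so the cache now
+ /// // uses a non-zero amount of heap memory.
+ /// assert!(cache.memory_usage() > 0);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```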
+ pub fn memory_usage(&self) -> usize {
+ self.forward.memory_usage() + self.reverse.memory_usage()
+ }
+
+ /// Return references to the forward and reverse caches, respectively.
+ pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
+ (&self.forward, &self.reverse)
+ }
+
+ /// Return mutable references to the forward and reverse caches,
+ /// respectively.
+ pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
+ (&mut self.forward, &mut self.reverse)
+ }
+}
+
+/// The configuration used for compiling a hybrid NFA/DFA regex.
+///
+/// A regex configuration is a simple data object that is typically used with
+/// [`Builder::configure`].
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+ utf8: Option<bool>,
+}
+
+impl Config {
+ /// Return a new default regex compiler configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Whether to enable UTF-8 mode or not.
+ ///
+ /// When UTF-8 mode is enabled (the default) and an empty match is seen,
+ /// the iterators on [`Regex`] will always start the next search at the
+ /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
+ /// mode is disabled, such searches are begun at the next byte offset.
+ ///
+ /// If this mode is enabled and invalid UTF-8 is given to search, then
+ /// behavior is unspecified.
+ ///
+ /// Generally speaking, one should enable this when
+ /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
+ /// and
+ /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+ /// are enabled, and disable it otherwise.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates the differences between when this option is
+ /// enabled and disabled. The differences only arise when the regex can
+ /// return matches of length zero.
+ ///
+ /// In this first snippet, we show the results when UTF-8 mode is disabled.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8(false))
+ /// .build(r"")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "a☃z".as_bytes();
+ /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And in this snippet, we execute the same search on the same haystack,
+ /// but with UTF-8 mode enabled. Notice that byte offsets that would
+ /// otherwise split the encoding of `☃` are not returned.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .configure(Regex::config().utf8(true))
+ /// .build(r"")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "a☃z".as_bytes();
+ /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+ /// assert_eq!(None, it.next());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn utf8(mut self, yes: bool) -> Config {
+ self.utf8 = Some(yes);
+ self
+ }
+
+ /// Returns true if and only if this configuration has UTF-8 mode enabled.
+ ///
+ /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
+ /// [`Regex`] will always start the next search at the next UTF-8 encoded
+ /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
+ /// next byte offset.
+ pub fn get_utf8(&self) -> bool {
+ self.utf8.unwrap_or(true)
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(self, o: Config) -> Config {
+ Config { utf8: o.utf8.or(self.utf8) }
+ }
+}
+
+/// A builder for a regex based on a hybrid NFA/DFA.
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction, the lazy DFA construction and finally the regex searching
+/// itself. This builder is different from a general-purpose regex builder
+/// in that it permits fine-grained configuration of the construction process.
+/// The trade-off for this is complexity, and the possibility of setting a
+/// configuration that might not make sense. For example, there are three
+/// different UTF-8 modes:
+///
+/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
+/// pattern itself can contain sub-expressions that match invalid UTF-8.
+/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+/// controls whether the implicit unanchored prefix added to the NFA can
+/// match through invalid UTF-8 or not.
+/// * [`Config::utf8`] controls how the regex iterators themselves advance
+/// the starting position of the next search when a match with zero length is
+/// found.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// Internally, building a regex requires building two hybrid NFA/DFAs,
+/// where one is responsible for finding the end of a match and the other is
+/// responsible for finding the start of a match. If you only need to detect
+/// whether something matched, or only the end of a match, then you should use
+/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper
+/// than building two of them.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
+/// the regex itself. This is generally what you want for matching on
+/// arbitrary bytes.
+///
+/// ```
+/// use regex_automata::{
+/// hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
+/// };
+///
+/// let re = Regex::builder()
+/// .configure(Regex::config().utf8(false))
+/// .syntax(SyntaxConfig::new().utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo(?-u:[^b])ar.*")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(MultiMatch::must(0, 1, 9));
+/// let got = re.find_leftmost(&mut cache, haystack);
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this. Notice also that the
+/// // search was unanchored and skipped over invalid UTF-8.
+/// // Disabling UTF-8 on the Thompson NFA permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on Config, since that
+/// // only impacts regexes that can produce matches of
+/// // length 0.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ dfa: dfa::Builder,
+}
+
+impl Builder {
+ /// Create a new regex builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder { config: Config::default(), dfa: DFA::builder() }
+ }
+
+ /// Build a regex from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a regex from the given patterns.
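+ ///
+ /// # Example
+ ///
+ /// A short sketch; patterns are assigned ascending pattern IDs in the
+ /// order given:
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ ///
+ /// let re = Regex::builder().build_many(&[r"[0-9]+", r"[a-z]+"])?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // "abc" is matched by the second pattern, which has pattern ID 1.
+ /// assert_eq!(
+ /// Some(MultiMatch::must(1, 0, 3)),
+ /// re.find_leftmost(&mut cache, b"abc"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```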
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
+ let forward = self.dfa.build_many(patterns)?;
+ let reverse = self
+ .dfa
+ .clone()
+ .configure(
+ DFA::config()
+ .anchored(true)
+ .match_kind(MatchKind::All)
+ .starts_for_each_pattern(true),
+ )
+ .thompson(thompson::Config::new().reverse(true))
+ .build_many(patterns)?;
+ Ok(self.build_from_dfas(forward, reverse))
+ }
+
+ /// Build a regex from its component forward and reverse hybrid NFA/DFAs.
+ fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
+ // The corresponding method on DFA-backed regexes is exposed, but it's
+ // not clear that exposing this one is useful, since lazy DFAs can't be
+ // serialized and there is only one type of them.
+ let utf8 = self.config.get_utf8();
+ Regex { pre: None, forward, reverse, utf8 }
+ }
+
+ /// Apply the given regex configuration options to this builder.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`SyntaxConfig`](crate::SyntaxConfig).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::SyntaxConfig,
+ ) -> &mut Builder {
+ self.dfa.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](thompson::Config).
+ ///
+ /// This permits setting things like whether additional time should be
+ /// spent shrinking the size of the NFA.
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.dfa.thompson(config);
+ self
+ }
+
+ /// Set the lazy DFA compilation configuration for this builder using
+ /// [`dfa::Config`](dfa::Config).
+ ///
+ /// This permits setting things like whether Unicode word boundaries should
+ /// be heuristically supported, or settings that control the cache's behavior.
+ pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {
+ self.dfa.configure(config);
+ self
+ }
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
+
+#[inline(always)]
+fn next_unwrap(
+ item: Option<Result<MultiMatch, MatchError>>,
+) -> Option<MultiMatch> {
+ match item {
+ None => None,
+ Some(Ok(m)) => Some(m),
+ Some(Err(err)) => panic!(
+ "unexpected regex search error: {}\n\
+ to handle search errors, use try_ methods",
+ err,
+ ),
+ }
+}
diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs
new file mode 100644
index 000000000..92760cee2
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/search.rs
@@ -0,0 +1,663 @@
+use crate::{
+ hybrid::{
+ dfa::{Cache, DFA},
+ id::{LazyStateID, OverlappingState, StateMatch},
+ },
+ nfa::thompson,
+ util::{
+ id::PatternID,
+ matchtypes::{HalfMatch, MatchError},
+ prefilter, MATCH_OFFSET,
+ },
+};
+
+#[inline(never)]
+pub(crate) fn find_earliest_fwd(
+ pre: Option<&mut prefilter::Scanner>,
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ // Searching with a pattern ID is always anchored, so we should never use
+ // a prefilter.
+ if pre.is_some() && pattern_id.is_none() {
+ find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end)
+ } else {
+ find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end)
+ }
+}
+
+#[inline(never)]
+pub(crate) fn find_leftmost_fwd(
+ pre: Option<&mut prefilter::Scanner>,
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ // Searching with a pattern ID is always anchored, so we should never use
+ // a prefilter.
+ if pre.is_some() && pattern_id.is_none() {
+ find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end)
+ } else {
+ find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end)
+ }
+}
+
+#[inline(always)]
+fn find_fwd(
+ mut pre: Option<&mut prefilter::Scanner>,
+ earliest: bool,
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= haystack.len());
+ assert!(end <= haystack.len());
+
+ // Why do this? This lets 'bytes[at]' work without bounds checks below.
+ // It seems the assert on 'end <= haystack.len()' above is otherwise
+ // not enough. Why not just make 'bytes' scoped this way anyway? Well,
+ // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
+ // for resolving look-ahead.
+ let bytes = &haystack[..end];
+
+ let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?;
+ let mut last_match = None;
+ let mut at = start;
+ if let Some(ref mut pre) = pre {
+ // If a prefilter doesn't report false positives, then we don't need to
+ // touch the DFA at all. However, since all matches include the pattern
+ // ID, and the prefilter infrastructure doesn't report pattern IDs, we
+ // limit this optimization to cases where there is exactly one pattern.
+ // In that case, any match must be the 0th pattern.
+ if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
+ return Ok(pre.next_candidate(bytes, at).into_option().map(
+ |offset| HalfMatch { pattern: PatternID::ZERO, offset },
+ ));
+ } else if pre.is_effective(at) {
+ match pre.next_candidate(bytes, at).into_option() {
+ None => return Ok(None),
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ while at < end {
+ if sid.is_tagged() {
+ sid = dfa
+ .next_state(cache, sid, bytes[at])
+ .map_err(|_| gave_up(at))?;
+ at += 1;
+ } else {
+ // SAFETY: There are two safety invariants we need to uphold
+ // here in the loop below: that 'sid' is a valid state ID for
+ // this DFA, and that 'at' is a valid index into 'bytes'. For
+ // the former, we rely on the invariant that next_state* and
+ // start_state_forward always returns a valid state ID (given a
+ // valid state ID in the former case), and that we are only at this
+ // place in the code if 'sid' is untagged. Moreover, every call to
+ // next_state_untagged_unchecked below is guarded by a check that
+ // sid is untagged. For the latter safety invariant, we always
+ // guard unchecked access with a check that 'at' is less than
+ // 'end', where 'end == bytes.len()'.
+ //
+ // For justification: this gives us a ~10% improvement in search time.
+ // This was used for a benchmark:
+ //
+ // regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
+ //
+ // With bounds checked: ~881.4ms. Without: ~775ms. For input, I
+ // used OpenSubtitles2018.raw.sample.medium.en.
+ let mut prev_sid = sid;
+ while at < end {
+ prev_sid = sid;
+ sid = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ at += 1;
+ if sid.is_tagged() {
+ break;
+ }
+ // SAFETY: we make four unguarded accesses to 'bytes[at]'
+ // below, and each are safe because we know that 'at + 4' is
+ // in bounds. Moreover, while we don't check whether 'sid' is
+ // untagged directly, we know it is because of the check above.
+ // And the unrolled loop below quits when the next state is not
+ // equal to the previous state.
+ //
+ // PERF: For justification of eliminating bounds checks,
+ // see above. For justification of the unrolling, we use
+ // two tests: the one above with regex '(?m)^.+$', and also
+ // '(?m)^.{40}$'. The former is kinda the best case for
+ // unrolling, and gives a 1.67x boost primarily because the DFA
+ // spends most of its time munching through the input in the
+ // same state. But the latter pattern rarely spends time in the
+ // same state through subsequent transitions, so unrolling is
+ // pretty much always ineffective in that it craps out on the
+ // first 'sid != next' check below. However, without unrolling,
+ // search is only 1.03 times faster than with unrolling on the
+ // latter pattern, which we deem to be an acceptable loss in
+ // favor of optimizing the more common case of having a "hot"
+ // state somewhere in the DFA.
+ while at + 4 < end {
+ let next = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ if sid != next {
+ break;
+ }
+ at += 1;
+ let next = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ if sid != next {
+ break;
+ }
+ at += 1;
+ let next = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ if sid != next {
+ break;
+ }
+ at += 1;
+ let next = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ if sid != next {
+ break;
+ }
+ at += 1;
+ }
+ }
+ if sid.is_unknown() {
+ sid = dfa
+ .next_state(cache, prev_sid, bytes[at - 1])
+ .map_err(|_| gave_up(at - 1))?;
+ }
+ }
+ if sid.is_tagged() {
+ if sid.is_start() {
+ if let Some(ref mut pre) = pre {
+ if pre.is_effective(at) {
+ match pre.next_candidate(bytes, at).into_option() {
+ None => return Ok(None),
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ } else if sid.is_match() {
+ last_match = Some(HalfMatch {
+ pattern: dfa.match_pattern(cache, sid, 0),
+ offset: at - MATCH_OFFSET,
+ });
+ if earliest {
+ return Ok(last_match);
+ }
+ } else if sid.is_dead() {
+ return Ok(last_match);
+ } else if sid.is_quit() {
+ if last_match.is_some() {
+ return Ok(last_match);
+ }
+ let offset = at - 1;
+ return Err(MatchError::Quit { byte: bytes[offset], offset });
+ } else {
+ debug_assert!(sid.is_unknown());
+ unreachable!("sid being unknown is a bug");
+ }
+ }
+ }
+ // We are careful to use 'haystack' here, which contains the full context
+ // that we might want to inspect.
+ Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
+}
+
+#[inline(never)]
+pub(crate) fn find_earliest_rev(
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ find_rev(true, dfa, cache, pattern_id, bytes, start, end)
+}
+
+#[inline(never)]
+pub(crate) fn find_leftmost_rev(
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ find_rev(false, dfa, cache, pattern_id, bytes, start, end)
+}
+
+#[inline(always)]
+fn find_rev(
+ earliest: bool,
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= haystack.len());
+ assert!(end <= haystack.len());
+
+ // Why do this? This lets 'bytes[at]' work without bounds checks below.
+ // It seems the assert on 'end <= haystack.len()' above is otherwise
+ // not enough. Why not just make 'bytes' scoped this way anyway? Well,
+ // 'eoi_rev' (below) might actually want to try to access the byte just
+ // before 'start' for resolving look-behind.
+ let bytes = &haystack[start..];
+
+ let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
+ let mut last_match = None;
+ let mut at = end - start;
+ while at > 0 {
+ if sid.is_tagged() {
+ at -= 1;
+ sid = dfa
+ .next_state(cache, sid, bytes[at])
+ .map_err(|_| gave_up(at))?;
+ } else {
+ // SAFETY: See comments in 'find_fwd' for both a safety argument
+ // and a justification from a performance perspective as to 1) why
+ // we elide bounds checks and 2) why we do a specialized version of
+ // unrolling below.
+ let mut prev_sid = sid;
+ while at > 0 && !sid.is_tagged() {
+ prev_sid = sid;
+ at -= 1;
+ while at > 3 {
+ let next = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ if sid != next {
+ break;
+ }
+ at -= 1;
+ let next = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ if sid != next {
+ break;
+ }
+ at -= 1;
+ let next = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ if sid != next {
+ break;
+ }
+ at -= 1;
+ let next = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ if sid != next {
+ break;
+ }
+ at -= 1;
+ }
+ sid = unsafe {
+ dfa.next_state_untagged_unchecked(
+ cache,
+ sid,
+ *bytes.get_unchecked(at),
+ )
+ };
+ }
+ if sid.is_unknown() {
+ sid = dfa
+ .next_state(cache, prev_sid, bytes[at])
+ .map_err(|_| gave_up(at))?;
+ }
+ }
+ if sid.is_tagged() {
+ if sid.is_start() {
+ continue;
+ } else if sid.is_match() {
+ last_match = Some(HalfMatch {
+ pattern: dfa.match_pattern(cache, sid, 0),
+ offset: start + at + MATCH_OFFSET,
+ });
+ if earliest {
+ return Ok(last_match);
+ }
+ } else if sid.is_dead() {
+ return Ok(last_match);
+ } else {
+ debug_assert!(sid.is_quit());
+ if last_match.is_some() {
+ return Ok(last_match);
+ }
+ return Err(MatchError::Quit { byte: bytes[at], offset: at });
+ }
+ }
+ }
+ Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match))
+}
+
+#[inline(never)]
+pub(crate) fn find_overlapping_fwd(
+ pre: Option<&mut prefilter::Scanner>,
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+ // Searching with a pattern ID is always anchored, so we should only ever
+ // use a prefilter when no pattern ID is given.
+ if pre.is_some() && pattern_id.is_none() {
+ find_overlapping_fwd_imp(
+ pre,
+ dfa,
+ cache,
+ pattern_id,
+ bytes,
+ start,
+ end,
+ caller_state,
+ )
+ } else {
+ find_overlapping_fwd_imp(
+ None,
+ dfa,
+ cache,
+ pattern_id,
+ bytes,
+ start,
+ end,
+ caller_state,
+ )
+ }
+}
+
+#[inline(always)]
+fn find_overlapping_fwd_imp(
+ mut pre: Option<&mut prefilter::Scanner>,
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ mut start: usize,
+ end: usize,
+ caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+ assert!(start <= end);
+ assert!(start <= bytes.len());
+ assert!(end <= bytes.len());
+
+ let mut sid = match caller_state.id() {
+ None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?,
+ Some(sid) => {
+ if let Some(last) = caller_state.last_match() {
+ let match_count = dfa.match_count(cache, sid);
+ if last.match_index < match_count {
+ let m = HalfMatch {
+ pattern: dfa.match_pattern(
+ cache,
+ sid,
+ last.match_index,
+ ),
+ offset: last.offset,
+ };
+ last.match_index += 1;
+ return Ok(Some(m));
+ }
+ }
+
+ // This is a subtle but critical detail. If the caller provides a
+ // non-None state ID, then it must be the case that the state ID
+ // corresponds to one set by this function. The state ID therefore
+ // corresponds to a match state, a dead state or some other state.
+ // However, "some other" state _only_ occurs when the input has
+ // been exhausted because the only way to stop before then is to
+ // see a match or a dead/quit state.
+ //
+ // If the input is exhausted or if it's a dead state, then
+ // incrementing the starting position has no relevance on
+ // correctness, since the loop below will either not execute
+ // at all or will immediately stop due to being in a dead state.
+ // (Once in a dead state it is impossible to leave it.)
+ //
+ // Therefore, the only case we need to consider is when
+ // caller_state is a match state. In this case, since our machines
+ // support the ability to delay a match by a certain number of
+ // bytes (to support look-around), it follows that we actually
+ // consumed that many additional bytes on our previous search. When
+ // the caller resumes their search to find subsequent matches, they
+ // will use the ending location from the previous match as the next
+ // starting point, which is `match_offset` bytes PRIOR to where
+ // we scanned to on the previous search. Therefore, we need to
+ // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
+ //
+ // Incidentally, since MATCH_OFFSET is non-zero, this also makes
+ // dealing with empty matches convenient. Namely, callers needn't
+ // special case them when implementing an iterator. Instead, this
+ // ensures that forward progress is always made.
+ start += MATCH_OFFSET;
+ sid
+ }
+ };
+
+ let mut at = start;
+ while at < end {
+ let byte = bytes[at];
+ sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
+ at += 1;
+ if sid.is_tagged() {
+ caller_state.set_id(sid);
+ if sid.is_start() {
+ if let Some(ref mut pre) = pre {
+ if pre.is_effective(at) {
+ match pre.next_candidate(bytes, at).into_option() {
+ None => return Ok(None),
+ Some(i) => {
+ at = i;
+ }
+ }
+ }
+ }
+ } else if sid.is_match() {
+ let offset = at - MATCH_OFFSET;
+ caller_state
+ .set_last_match(StateMatch { match_index: 1, offset });
+ return Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(cache, sid, 0),
+ offset,
+ }));
+ } else if sid.is_dead() {
+ return Ok(None);
+ } else {
+ debug_assert!(sid.is_quit());
+ return Err(MatchError::Quit { byte, offset: at - 1 });
+ }
+ }
+ }
+
+ let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
+ caller_state.set_id(sid);
+ if let Ok(Some(ref last_match)) = result {
+ caller_state.set_last_match(StateMatch {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ match_index: 1,
+ offset: last_match.offset(),
+ });
+ }
+ result
+}
+
+#[inline(always)]
+fn init_fwd(
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<LazyStateID, MatchError> {
+ let sid = dfa
+ .start_state_forward(cache, pattern_id, bytes, start, end)
+ .map_err(|_| gave_up(start))?;
+ // Start states can never be match states, since all matches are delayed
+ // by 1 byte.
+ assert!(!sid.is_match());
+ Ok(sid)
+}
+
+#[inline(always)]
+fn init_rev(
+ dfa: &DFA,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+) -> Result<LazyStateID, MatchError> {
+ let sid = dfa
+ .start_state_reverse(cache, pattern_id, bytes, start, end)
+ .map_err(|_| gave_up(end))?;
+ // Start states can never be match states, since all matches are delayed
+ // by 1 byte.
+ assert!(!sid.is_match());
+ Ok(sid)
+}
+
+#[inline(always)]
+fn eoi_fwd(
+ dfa: &DFA,
+ cache: &mut Cache,
+ bytes: &[u8],
+ end: usize,
+ sid: &mut LazyStateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+ match bytes.get(end) {
+ Some(&b) => {
+ *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
+ if sid.is_match() {
+ Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(cache, *sid, 0),
+ offset: end,
+ }))
+ } else {
+ Ok(None)
+ }
+ }
+ None => {
+ *sid = dfa
+ .next_eoi_state(cache, *sid)
+ .map_err(|_| gave_up(bytes.len()))?;
+ if sid.is_match() {
+ Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(cache, *sid, 0),
+ offset: bytes.len(),
+ }))
+ } else {
+ Ok(None)
+ }
+ }
+ }
+}
+
+#[inline(always)]
+fn eoi_rev(
+ dfa: &DFA,
+ cache: &mut Cache,
+ bytes: &[u8],
+ start: usize,
+ state: LazyStateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+ if start > 0 {
+ let sid = dfa
+ .next_state(cache, state, bytes[start - 1])
+ .map_err(|_| gave_up(start))?;
+ if sid.is_match() {
+ Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(cache, sid, 0),
+ offset: start,
+ }))
+ } else {
+ Ok(None)
+ }
+ } else {
+ let sid =
+ dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?;
+ if sid.is_match() {
+ Ok(Some(HalfMatch {
+ pattern: dfa.match_pattern(cache, sid, 0),
+ offset: 0,
+ }))
+ } else {
+ Ok(None)
+ }
+ }
+}
+
+/// A convenience routine for constructing a "gave up" match error.
+#[inline(always)]
+fn gave_up(offset: usize) -> MatchError {
+ MatchError::GaveUp { offset }
+}
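
These `pub(crate)` routines are reached through the lazy DFA's public search
methods. A minimal sketch of driving `find_leftmost_fwd` that way, assuming
the `hybrid::dfa::DFA` API from this release:

```rust
use regex_automata::hybrid::dfa::DFA;

fn main() {
    // Build a lazy DFA; its states are determinized on demand into `cache`
    // during the search, which is what the routines above transition through.
    let dfa = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
    let mut cache = dfa.create_cache();

    // A forward scan alone only knows where a match ends, so the result is
    // a HalfMatch carrying the pattern ID and the end offset.
    let m = dfa
        .find_leftmost_fwd(&mut cache, b"launched on 2018-12-24")
        .expect("search should not give up")
        .unwrap();
    assert_eq!(m.offset(), 22);
}
```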
diff --git a/vendor/regex-automata/src/lib.rs b/vendor/regex-automata/src/lib.rs
index 7894eccea..d9d7ada48 100644
--- a/vendor/regex-automata/src/lib.rs
+++ b/vendor/regex-automata/src/lib.rs
@@ -1,360 +1,47 @@
/*!
-A low level regular expression library that uses deterministic finite automata.
-It supports a rich syntax with Unicode support, has extensive options for
-configuring the best space vs time trade off for your use case and provides
-support for cheap deserialization of automata for use in `no_std` environments.
-
-# Overview
-
-This section gives a brief overview of the primary types in this crate:
-
-* A [`Regex`](struct.Regex.html) provides a way to search for matches of a
- regular expression. This includes iterating over matches with both the start
- and end positions of each match.
-* A [`RegexBuilder`](struct.RegexBuilder.html) provides a way to configure many
- compilation options for a regex.
-* A [`DenseDFA`](enum.DenseDFA.html) provides low level access to a DFA that
- uses a dense representation (uses lots of space, but fast searching).
-* A [`SparseDFA`](enum.SparseDFA.html) provides the same API as a `DenseDFA`,
- but uses a sparse representation (uses less space, but slower matching).
-* A [`DFA`](trait.DFA.html) trait that defines an interface that all DFAs must
- implement.
-* Both dense DFAs and sparse DFAs support
- [serialization to raw bytes](enum.DenseDFA.html#method.to_bytes_little_endian)
- and
- [cheap deserialization](enum.DenseDFA.html#method.from_bytes).
-
-# Example: basic regex searching
-
-This example shows how to compile a regex using the default configuration
-and then use it to find matches in a byte string:
-
-```
-use regex_automata::Regex;
-
-let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
-let text = b"2018-12-24 2016-10-08";
-let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
-assert_eq!(matches, vec![(0, 10), (11, 21)]);
-```
-
-# Example: use sparse DFAs
-
-By default, compiling a regex will use dense DFAs internally. This uses more
-memory, but executes searches more quickly. If you can abide slower searches
-(somewhere around 3-5x), then sparse DFAs might make more sense since they can
-use significantly less space.
-
-Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
-`Regex::new`:
-
-```
-use regex_automata::Regex;
-
-# fn example() -> Result<(), regex_automata::Error> {
-let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
-let text = b"2018-12-24 2016-10-08";
-let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
-assert_eq!(matches, vec![(0, 10), (11, 21)]);
-# Ok(()) }; example().unwrap()
-```
-
-If you already have dense DFAs for some reason, they can be converted to sparse
-DFAs and used to build a new `Regex`. For example:
-
-```
-use regex_automata::Regex;
-
-# fn example() -> Result<(), regex_automata::Error> {
-let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
-let sparse_re = Regex::from_dfas(
- dense_re.forward().to_sparse()?,
- dense_re.reverse().to_sparse()?,
-);
-let text = b"2018-12-24 2016-10-08";
-let matches: Vec<(usize, usize)> = sparse_re.find_iter(text).collect();
-assert_eq!(matches, vec![(0, 10), (11, 21)]);
-# Ok(()) }; example().unwrap()
-```
-
-# Example: deserialize a DFA
-
-This shows how to first serialize a DFA into raw bytes, and then deserialize
-those raw bytes back into a DFA. While this particular example is a bit
-contrived, this same technique can be used in your program to deserialize a
-DFA at start up time or by memory mapping a file. In particular,
-deserialization is guaranteed to be cheap because it will always be a constant
-time operation.
-
-```
-use regex_automata::{DenseDFA, Regex};
-
-# fn example() -> Result<(), regex_automata::Error> {
-let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
-// serialize both the forward and reverse DFAs, see note below
-let fwd_bytes = re1.forward().to_u16()?.to_bytes_native_endian()?;
-let rev_bytes = re1.reverse().to_u16()?.to_bytes_native_endian()?;
-// now deserialize both---we need to specify the correct type!
-let fwd: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&fwd_bytes) };
-let rev: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&rev_bytes) };
-// finally, reconstruct our regex
-let re2 = Regex::from_dfas(fwd, rev);
-
-// we can use it like normal
-let text = b"2018-12-24 2016-10-08";
-let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
-assert_eq!(matches, vec![(0, 10), (11, 21)]);
-# Ok(()) }; example().unwrap()
-```
-
-There are a few points worth noting here:
-
-* We need to extract the raw DFAs used by the regex and serialize those. You
- can build the DFAs manually yourself using
- [`dense::Builder`](dense/struct.Builder.html), but using the DFAs from a
- `Regex` guarantees that the DFAs are built correctly.
-* We specifically convert the dense DFA to a representation that uses `u16`
- for its state identifiers using
- [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). While this isn't
- strictly necessary, if we skipped this step, then the serialized bytes would
- use `usize` for state identifiers, which does not have a fixed size. Using
- `u16` ensures that we can deserialize this DFA even on platforms with a
- smaller pointer size. If our DFA is too big for `u16` state identifiers, then
- one can use `u32` or `u64`.
-* To convert the DFA to raw bytes, we use the `to_bytes_native_endian`
- method. In practice, you'll want to use either
- [`DenseDFA::to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
- or
- [`DenseDFA::to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian),
- depending on which platform you're deserializing your DFA from. If you intend
- to deserialize on either platform, then you'll need to serialize both and
- deserialize the right one depending on your target's endianness.
-* Deserializing a DFA requires the use of `unsafe` because the raw bytes must
- be *trusted*. In particular, while some degree of sanity checks are
- performed, nothing guarantees the integrity of the DFA's transition table
- since deserialization is a constant time operation. Since searching with a
- DFA must be able to follow transitions blindly for performance reasons,
- giving incorrect bytes to the deserialization API can result in memory
- unsafety.
-
-The same process can be achieved with sparse DFAs as well:
-
-```
-use regex_automata::{SparseDFA, Regex};
-
-# fn example() -> Result<(), regex_automata::Error> {
-let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
-// serialize both
-let fwd_bytes = re1.forward().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
-let rev_bytes = re1.reverse().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
-// now deserialize both---we need to specify the correct type!
-let fwd: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&fwd_bytes) };
-let rev: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&rev_bytes) };
-// finally, reconstruct our regex
-let re2 = Regex::from_dfas(fwd, rev);
-
-// we can use it like normal
-let text = b"2018-12-24 2016-10-08";
-let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
-assert_eq!(matches, vec![(0, 10), (11, 21)]);
-# Ok(()) }; example().unwrap()
-```
-
-Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
-Conversely, dense DFAs must be aligned to the same alignment as their
-state identifier representation.
-
-# Support for `no_std`
-
-This crate comes with a `std` feature that is enabled by default. When the
-`std` feature is enabled, the API of this crate will include the facilities
-necessary for compiling, serializing, deserializing and searching with regular
-expressions. When the `std` feature is disabled, the API of this crate will
-shrink such that it only includes the facilities necessary for deserializing
-and searching with regular expressions.
-
-The intended workflow for `no_std` environments is thus as follows:
-
-* Write a program with the `std` feature that compiles and serializes a
- regular expression. Serialization should only happen after first converting
- the DFAs to use a fixed size state identifier instead of the default `usize`.
- You may also need to serialize both little and big endian versions of each
- DFA. (So that's 4 DFAs in total for each regex.)
-* In your `no_std` environment, follow the examples above for deserializing
- your previously serialized DFAs into regexes. You can then search with them
- as you would any regex.
-
-Deserialization can happen anywhere. For example, with bytes embedded into a
-binary or with a file memory mapped at runtime.
-
-Note that the
-[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
-tool will do the first step for you with its `dfa` or `regex` sub-commands.
-
-# Syntax
-
-This crate supports the same syntax as the `regex` crate, since they share the
-same parser. You can find an exhaustive list of supported syntax in the
-[documentation for the `regex` crate](https://docs.rs/regex/1.1/regex/#syntax).
-
-Currently, there are a couple limitations. In general, this crate does not
-support zero-width assertions, although they may be added in the future. This
-includes:
-
-* Anchors such as `^`, `$`, `\A` and `\z`.
-* Word boundary assertions such as `\b` and `\B`.
-
-It is possible to run a search that is anchored at the beginning of the input.
-To do that, set the
-[`RegexBuilder::anchored`](struct.RegexBuilder.html#method.anchored)
-option when building a regex. By default, all searches are unanchored.
-
-# Differences with the regex crate
-
-The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
-general purpose regular expression engine. It aims to automatically balance low
-compile times, fast search times and low memory usage, while also providing
-a convenient API for users. In contrast, this crate provides a lower level
-regular expression interface that is a bit less convenient while providing more
-explicit control over memory usage and search times.
-
-Here are some specific negative differences:
-
-* **Compilation can take an exponential amount of time and space** in the size
- of the regex pattern. While most patterns do not exhibit worst case
- exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
- build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
- not be compiled with this library. (In the future, the API may expose an
- option to return an error if the DFA gets too big.)
-* This crate does not support sub-match extraction, which can be achieved with
- the regex crate's "captures" API. This may be added in the future, but is
- unlikely.
-* While the regex crate doesn't necessarily sport fast compilation times, the
- regexes in this crate are almost universally slow to compile, especially when
- they contain large Unicode character classes. For example, on my system,
- compiling `\w{3}` with byte classes enabled takes just over 1 second and
- almost 5MB of memory! (Compiling a sparse regex takes about the same time
- but only uses about 500KB of memory.) Conversely, compiling the same regex
- without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
- less than 5KB of memory. For this reason, you should only use Unicode
- character classes if you absolutely need them!
-* This crate does not support regex sets.
-* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
- `\B`.
-* As a lower level crate, this library does not do literal optimizations. In
- exchange, you get predictable performance regardless of input. The
- philosophy here is that literal optimizations should be applied at a higher
- level, although there is no easy support for this in the ecosystem yet.
-* There is no `&str` API like in the regex crate. In this crate, all APIs
- operate on `&[u8]`. By default, match indices are guaranteed to fall on
- UTF-8 boundaries, unless
- [`RegexBuilder::allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
- is enabled.
-
-With some of the downsides out of the way, here are some positive differences:
-
-* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
- deserialized. Deserialization always takes constant time since searching can
- be performed directly on the raw serialized bytes of a DFA.
-* This crate was specifically designed so that the searching phase of a DFA has
- minimal runtime requirements, and can therefore be used in `no_std`
- environments. While `no_std` environments cannot compile regexes, they can
- deserialize pre-compiled regexes.
-* Since this crate builds DFAs ahead of time, it will generally out-perform
- the `regex` crate on equivalent tasks. The performance difference is likely
- not large. However, because of a complex set of optimizations in the regex
- crate (like literal optimizations), an accurate performance comparison may be
- difficult to do.
-* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
- performance a small amount, but uses much less storage space. Potentially
- even less than what the regex crate uses.
-* This crate exposes DFAs directly, such as
- [`DenseDFA`](enum.DenseDFA.html) and [`SparseDFA`](enum.SparseDFA.html),
- which enables one to do less work in some cases. For example, if you only
- need the end of a match and not the start of a match, then you can use a DFA
- directly without building a `Regex`, which always requires a second DFA to
- find the start of a match.
-* Aside from choosing between dense and sparse DFAs, there are several options
- for configuring the space usage vs search time trade off. These include
- things like choosing a smaller state identifier representation, to
- premultiplying state identifiers and splitting a DFA's alphabet into
- equivalence classes. Finally, DFA minimization is also provided, but can
- increase compilation times dramatically.
+This crate provides an "expert" API for executing regular expressions using
+finite automata.
+
+**WARNING**: This `0.2` release of `regex-automata` was published
+before it was ready in order to unblock work elsewhere that needed
+some of the new APIs in this release. At the time of writing, it is
+strongly preferred that you continue using the
+[`regex-automata 0.1`](https://docs.rs/regex-automata/0.1/regex_automata/)
+release. Since this release represents an unfinished state, please do not
+create issues for this release unless it's for a critical bug.
*/
-#![deny(missing_docs)]
+#![allow(warnings)]
+// #![deny(missing_docs)]
#![cfg_attr(not(feature = "std"), no_std)]
-#[cfg(feature = "std")]
-extern crate core;
-
-#[cfg(all(test, feature = "transducer"))]
-extern crate bstr;
-#[cfg(feature = "transducer")]
-extern crate fst;
-#[cfg(feature = "std")]
-extern crate regex_syntax;
-
-pub use dense::DenseDFA;
-pub use dfa::DFA;
-#[cfg(feature = "std")]
-pub use error::{Error, ErrorKind};
-pub use regex::Regex;
-#[cfg(feature = "std")]
-pub use regex::RegexBuilder;
-pub use sparse::SparseDFA;
-pub use state_id::StateID;
-
-mod byteorder;
-mod classes;
-#[path = "dense.rs"]
-mod dense_imp;
-#[cfg(feature = "std")]
-mod determinize;
-mod dfa;
-#[cfg(feature = "std")]
-mod error;
-#[cfg(feature = "std")]
-mod minimize;
-#[cfg(feature = "std")]
+#[cfg(not(any(
+ target_pointer_width = "16",
+ target_pointer_width = "32",
+ target_pointer_width = "64"
+)))]
+compile_error!("regex-automata currently not supported on non-{16,32,64}");
+
+#[cfg(feature = "alloc")]
+extern crate alloc;
+
+#[doc(inline)]
+pub use crate::util::id::PatternID;
+#[cfg(feature = "alloc")]
+pub use crate::util::syntax::SyntaxConfig;
+pub use crate::util::{
+ bytes::{DeserializeError, SerializeError},
+ matchtypes::{HalfMatch, Match, MatchError, MatchKind, MultiMatch},
+};
+
+#[macro_use]
+mod macros;
+
+pub mod dfa;
+#[cfg(feature = "alloc")]
+pub mod hybrid;
#[doc(hidden)]
+#[cfg(feature = "alloc")]
pub mod nfa;
-mod regex;
-#[path = "sparse.rs"]
-mod sparse_imp;
-#[cfg(feature = "std")]
-mod sparse_set;
-mod state_id;
-#[cfg(feature = "transducer")]
-mod transducer;
-
-/// Types and routines specific to dense DFAs.
-///
-/// This module is the home of [`DenseDFA`](enum.DenseDFA.html) and each of its
-/// corresponding variant DFA types, such as [`Standard`](struct.Standard.html)
-/// and [`ByteClass`](struct.ByteClass.html).
-///
-/// This module also contains a [builder](struct.Builder.html) for
-/// configuring the construction of a dense DFA.
-pub mod dense {
- pub use dense_imp::*;
-}
-
-/// Types and routines specific to sparse DFAs.
-///
-/// This module is the home of [`SparseDFA`](enum.SparseDFA.html) and each of
-/// its corresponding variant DFA types, such as
-/// [`Standard`](struct.Standard.html) and
-/// [`ByteClass`](struct.ByteClass.html).
-///
-/// Unlike the [`dense`](../dense/index.html) module, this module does not
-/// contain a builder specific for sparse DFAs. Instead, the intended way to
-/// build a sparse DFA is either by using a default configuration with its
-/// [constructor](enum.SparseDFA.html#method.new),
-/// or by first
-/// [configuring the construction of a dense DFA](../dense/struct.Builder.html)
-/// and then calling
-/// [`DenseDFA::to_sparse`](../enum.DenseDFA.html#method.to_sparse).
-pub mod sparse {
- pub use sparse_imp::*;
-}
+#[doc(hidden)]
+pub mod util;
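
Under the new module layout, the closest analogue of the old top-level
`Regex` is `dfa::regex::Regex`. A minimal sketch mirroring the date example
from the old docs above, assuming that API:

```rust
use regex_automata::dfa::regex::Regex;

fn main() {
    let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
    let text = b"2018-12-24 2016-10-08";
    // Matches are now MultiMatch values carrying a pattern ID, rather than
    // plain (start, end) tuples.
    let matches: Vec<(usize, usize)> = re
        .find_leftmost_iter(text)
        .map(|m| (m.start(), m.end()))
        .collect();
    assert_eq!(matches, vec![(0, 10), (11, 21)]);
}
```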
diff --git a/vendor/regex-automata/src/macros.rs b/vendor/regex-automata/src/macros.rs
new file mode 100644
index 000000000..649ba17c5
--- /dev/null
+++ b/vendor/regex-automata/src/macros.rs
@@ -0,0 +1,30 @@
+/// A simple macro for defining bitfield accessors/mutators.
+#[cfg(feature = "alloc")]
+macro_rules! define_bool {
+ ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
+ fn $is_fn_name(&self) -> bool {
+ self.bools & (0b1 << $bit) > 0
+ }
+
+ fn $set_fn_name(&mut self, yes: bool) {
+ if yes {
+ self.bools |= 1 << $bit;
+ } else {
+ self.bools &= !(1 << $bit);
+ }
+ }
+ };
+}
+
+macro_rules! log {
+ ($($tt:tt)*) => {
+ #[cfg(feature = "logging")]
+ {
+ $($tt)*
+ }
+ }
+}
+
+macro_rules! trace {
+ ($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
+}
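
For reference, here is what a single `define_bool!` invocation expands to,
hand-expanded on a hypothetical standalone type (not part of the crate):

```rust
// Equivalent of `define_bool!(0, is_anchored, set_anchored)` on a
// hypothetical config type: bit 0 of `bools` backs one boolean knob.
struct Flags {
    bools: u16,
}

impl Flags {
    fn is_anchored(&self) -> bool {
        self.bools & (0b1 << 0) > 0
    }

    fn set_anchored(&mut self, yes: bool) {
        if yes {
            self.bools |= 1 << 0;
        } else {
            self.bools &= !(1 << 0);
        }
    }
}

fn main() {
    let mut flags = Flags { bools: 0 };
    assert!(!flags.is_anchored());
    flags.set_anchored(true);
    assert!(flags.is_anchored());
}
```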
diff --git a/vendor/regex-automata/src/nfa/compiler.rs b/vendor/regex-automata/src/nfa/compiler.rs
deleted file mode 100644
index d9b3945b3..000000000
--- a/vendor/regex-automata/src/nfa/compiler.rs
+++ /dev/null
@@ -1,1193 +0,0 @@
-// This module provides an NFA compiler using Thompson's construction
-// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
-// graph as output. The NFA graph is structured in a way that permits it to be
-// executed by a virtual machine and also used to efficiently build a DFA.
-//
-// The compiler deals with a slightly expanded set of NFA states that notably
-// includes an empty node that has exactly one epsilon transition to the next
-// state. In other words, it's a "goto" instruction if one views Thompson's NFA
-// as a set of bytecode instructions. These goto instructions are removed in
-// a subsequent phase before returning the NFA to the caller. The purpose of
-// these empty nodes is that they make the construction algorithm substantially
-// simpler to implement. We remove them before returning to the caller because
-// they can represent substantial overhead when traversing the NFA graph
-// (either while searching using the NFA directly or while building a DFA).
-//
-// In the future, it would be nice to provide a Glushkov compiler as well,
-// as it would work well as a bit-parallel NFA for smaller regexes. But
-// the Thompson construction is one I'm more familiar with and seems more
-// straight-forward to deal with when it comes to large Unicode character
-// classes.
-//
-// Internally, the compiler uses interior mutability to improve composition
-// in the face of the borrow checker. In particular, we'd really like to be
-// able to write things like this:
-//
-// self.c_concat(exprs.iter().map(|e| self.c(e)))
-//
-// Which elegantly uses iterators to build up a sequence of compiled regex
-// sub-expressions and then hands it off to the concatenating compiler
-// routine. Without interior mutability, the borrow checker won't let us
-// borrow `self` mutably both inside and outside the closure at the same
-// time.
-
-use std::cell::RefCell;
-use std::mem;
-
-use regex_syntax::hir::{self, Hir, HirKind};
-use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
-
-use classes::ByteClassSet;
-use error::{Error, Result};
-use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
-use nfa::range_trie::RangeTrie;
-use nfa::{State, StateID, Transition, NFA};
-
-/// Config knobs for the NFA compiler. See the builder's methods for more
-/// docs on each one.
-#[derive(Clone, Copy, Debug)]
-struct Config {
- anchored: bool,
- allow_invalid_utf8: bool,
- reverse: bool,
- shrink: bool,
-}
-
-impl Default for Config {
- fn default() -> Config {
- Config {
- anchored: false,
- allow_invalid_utf8: false,
- reverse: false,
- shrink: true,
- }
- }
-}
-
-/// A builder for compiling an NFA.
-#[derive(Clone, Debug)]
-pub struct Builder {
- config: Config,
-}
-
-impl Builder {
- /// Create a new NFA builder with its default configuration.
- pub fn new() -> Builder {
- Builder { config: Config::default() }
- }
-
- /// Compile the given high level intermediate representation of a regular
- /// expression into an NFA.
- ///
- /// If there was a problem building the NFA, then an error is returned.
- /// For example, if the regex uses unsupported features (such as zero-width
- /// assertions), then an error is returned.
- pub fn build(&self, expr: &Hir) -> Result<NFA> {
- let mut nfa = NFA::always_match();
- self.build_with(&mut Compiler::new(), &mut nfa, expr)?;
- Ok(nfa)
- }
-
- /// Compile the given high level intermediate representation of a regular
- /// expression into the NFA given using the given compiler. Callers may
- /// prefer this over `build` if they would like to reuse allocations while
- /// compiling many regular expressions.
- ///
- /// On success, the given NFA is completely overwritten with the NFA
- /// produced by the compiler.
- ///
- /// If there was a problem building the NFA, then an error is returned. For
- /// example, if the regex uses unsupported features (such as zero-width
- /// assertions), then an error is returned. When an error is returned,
- /// the contents of `nfa` are unspecified and should not be relied upon.
- /// However, it can still be reused in subsequent calls to this method.
- pub fn build_with(
- &self,
- compiler: &mut Compiler,
- nfa: &mut NFA,
- expr: &Hir,
- ) -> Result<()> {
- compiler.clear();
- compiler.configure(self.config);
- compiler.compile(nfa, expr)
- }
-
- /// Set whether matching must be anchored at the beginning of the input.
- ///
- /// When enabled, a match must begin at the start of the input. When
- /// disabled, the NFA will act as if the pattern started with a `.*?`,
- /// which enables a match to appear anywhere.
- ///
- /// By default this is disabled.
- pub fn anchored(&mut self, yes: bool) -> &mut Builder {
- self.config.anchored = yes;
- self
- }
-
- /// When enabled, the builder will permit the construction of an NFA that
- /// may match invalid UTF-8.
- ///
- /// When disabled (the default), the builder is guaranteed to produce a
- /// regex that will only ever match valid UTF-8 (otherwise, the builder
- /// will return an error).
- pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
- self.config.allow_invalid_utf8 = yes;
- self
- }
-
- /// Reverse the NFA.
- ///
- /// A NFA reversal is performed by reversing all of the concatenated
- /// sub-expressions in the original pattern, recursively. The resulting
- /// NFA can be used to match the pattern starting from the end of a string
- /// instead of the beginning of a string.
- ///
- /// Reversing the NFA is useful for building a reverse DFA, which is most
- /// useful for finding the start of a match.
- pub fn reverse(&mut self, yes: bool) -> &mut Builder {
- self.config.reverse = yes;
- self
- }
-
- /// Apply best effort heuristics to shrink the NFA at the expense of more
- /// time/memory.
- ///
- /// This is enabled by default. Generally speaking, if one is using an NFA
- /// to compile a DFA, then the extra time used to shrink the NFA will be
- /// more than made up for during DFA construction (potentially by a lot).
- /// In other words, enabling this can substantially decrease the overall
- /// amount of time it takes to build a DFA.
- ///
- /// The only reason to disable this is if you want to compile an NFA and start
- /// using it as quickly as possible without needing to build a DFA.
- pub fn shrink(&mut self, yes: bool) -> &mut Builder {
- self.config.shrink = yes;
- self
- }
-}
-
-/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
-/// construction. Namely, this compiler permits epsilon transitions between
-/// states.
-///
-/// Users of this crate cannot use a compiler directly. Instead, all one can
-/// do is create one and use it via the
-/// [`Builder::build_with`](struct.Builder.html#method.build_with)
-/// method. This permits callers to reuse compilers in order to amortize
-/// allocations.
-#[derive(Clone, Debug)]
-pub struct Compiler {
- /// The set of compiled NFA states. Once a state is compiled, it is
- /// assigned a state ID equivalent to its index in this list. Subsequent
- /// compilation can modify previous states by adding new transitions.
- states: RefCell<Vec<CState>>,
- /// The configuration from the builder.
- config: Config,
- /// State used for compiling character classes to UTF-8 byte automata.
- /// State is not retained between character class compilations. This just
- /// serves to amortize allocation to the extent possible.
- utf8_state: RefCell<Utf8State>,
- /// State used for arranging character classes in reverse into a trie.
- trie_state: RefCell<RangeTrie>,
- /// State used for caching common suffixes when compiling reverse UTF-8
- /// automata (for Unicode character classes).
- utf8_suffix: RefCell<Utf8SuffixMap>,
- /// A map used to re-map state IDs when translating the compiler's internal
- /// NFA state representation to the external NFA representation.
- remap: RefCell<Vec<StateID>>,
- /// A set of compiler internal state IDs that correspond to states that are
- /// exclusively epsilon transitions, i.e., goto instructions, combined with
- /// the state that they point to. This is used to record said states while
- /// transforming the compiler's internal NFA representation to the external
- /// form.
- empties: RefCell<Vec<(StateID, StateID)>>,
-}
-
-/// A compiler intermediate state representation for an NFA that is only used
-/// during compilation. Once compilation is done, `CState`s are converted to
-/// `State`s, which have a much simpler representation.
-#[derive(Clone, Debug, Eq, PartialEq)]
-enum CState {
- /// An empty state whose only purpose is to forward the automaton to
- /// another state via an epsilon transition. These are useful during
- /// compilation but are otherwise removed at the end.
- Empty { next: StateID },
- /// A state that only transitions to `next` if the current input byte is
- /// in the range `[start, end]` (inclusive on both ends).
- Range { range: Transition },
- /// A state with possibly many transitions, represented in a sparse
- /// fashion. Transitions are ordered lexicographically by input range.
- /// As such, this may only be used when every transition has equal
- /// priority. (In practice, this is only used for encoding large UTF-8
- /// automata.)
- Sparse { ranges: Vec<Transition> },
- /// An alternation such that there exists an epsilon transition to all
- /// states in `alternates`, where matches found via earlier transitions
- /// are preferred over later transitions.
- Union { alternates: Vec<StateID> },
- /// An alternation such that there exists an epsilon transition to all
- /// states in `alternates`, where matches found via later transitions
- /// are preferred over earlier transitions.
- ///
- /// This "reverse" state exists for convenience during compilation that
- /// permits easy construction of non-greedy combinations of NFA states.
- /// At the end of compilation, Union and UnionReverse states are merged
- /// into one Union type of state, where the latter has its epsilon
- /// transitions reversed to reflect the priority inversion.
- UnionReverse { alternates: Vec<StateID> },
- /// A match state. There is exactly one such occurrence of this state in
- /// an NFA.
- Match,
-}
-
-/// A value that represents the result of compiling a sub-expression of a
-/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
-/// has an initial state at `start` and a final state at `end`.
-#[derive(Clone, Copy, Debug)]
-pub struct ThompsonRef {
- start: StateID,
- end: StateID,
-}
-
-impl Compiler {
- /// Create a new compiler.
- pub fn new() -> Compiler {
- Compiler {
- states: RefCell::new(vec![]),
- config: Config::default(),
- utf8_state: RefCell::new(Utf8State::new()),
- trie_state: RefCell::new(RangeTrie::new()),
- utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
- remap: RefCell::new(vec![]),
- empties: RefCell::new(vec![]),
- }
- }
-
- /// Clear any memory used by this compiler such that it is ready to compile
- /// a new regex.
- ///
- /// It is preferable to reuse a compiler if possible in order to reuse
- /// allocations.
- fn clear(&self) {
- self.states.borrow_mut().clear();
- // We don't need to clear anything else since they are cleared on
- // their own and only when they are used.
- }
-
- /// Configure this compiler from the builder's knobs.
- ///
- /// The compiler is always reconfigured by the builder before using it to
- /// build an NFA.
- fn configure(&mut self, config: Config) {
- self.config = config;
- }
-
- /// Convert the current intermediate NFA to its final compiled form.
- fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
- nfa.anchored = self.config.anchored;
-
- let mut start = self.add_empty();
- if !nfa.anchored {
- let compiled = if self.config.allow_invalid_utf8 {
- self.c_unanchored_prefix_invalid_utf8()?
- } else {
- self.c_unanchored_prefix_valid_utf8()?
- };
- self.patch(start, compiled.start);
- start = compiled.end;
- }
- let compiled = self.c(&expr)?;
- let match_id = self.add_match();
- self.patch(start, compiled.start);
- self.patch(compiled.end, match_id);
- self.finish(nfa);
- Ok(())
- }
-
- /// Finishes the compilation process and populates the provided NFA with
- /// the final graph.
- fn finish(&self, nfa: &mut NFA) {
- let mut bstates = self.states.borrow_mut();
- let mut remap = self.remap.borrow_mut();
- remap.resize(bstates.len(), 0);
- let mut empties = self.empties.borrow_mut();
- empties.clear();
-
- // We don't reuse allocations here because this is what we're
- // returning.
- nfa.states.clear();
- let mut byteset = ByteClassSet::new();
-
- // The idea here is to convert our intermediate states to their final
- // form. The only real complexity here is the process of converting
- // transitions, which are expressed in terms of state IDs. The new
- // set of states will be smaller because of partial epsilon removal,
- // so the state IDs will not be the same.
- for (id, bstate) in bstates.iter_mut().enumerate() {
- match *bstate {
- CState::Empty { next } => {
- // Since we're removing empty states, we need to handle
- // them later since we don't yet know which new state this
- // empty state will be mapped to.
- empties.push((id, next));
- }
- CState::Range { ref range } => {
- remap[id] = nfa.states.len();
- byteset.set_range(range.start, range.end);
- nfa.states.push(State::Range { range: range.clone() });
- }
- CState::Sparse { ref mut ranges } => {
- remap[id] = nfa.states.len();
-
- let ranges = mem::replace(ranges, vec![]);
- for r in &ranges {
- byteset.set_range(r.start, r.end);
- }
- nfa.states.push(State::Sparse {
- ranges: ranges.into_boxed_slice(),
- });
- }
- CState::Union { ref mut alternates } => {
- remap[id] = nfa.states.len();
-
- let alternates = mem::replace(alternates, vec![]);
- nfa.states.push(State::Union {
- alternates: alternates.into_boxed_slice(),
- });
- }
- CState::UnionReverse { ref mut alternates } => {
- remap[id] = nfa.states.len();
-
- let mut alternates = mem::replace(alternates, vec![]);
- alternates.reverse();
- nfa.states.push(State::Union {
- alternates: alternates.into_boxed_slice(),
- });
- }
- CState::Match => {
- remap[id] = nfa.states.len();
- nfa.states.push(State::Match);
- }
- }
- }
- for &(empty_id, mut empty_next) in empties.iter() {
- // empty states can point to other empty states, forming a chain.
- // So we must follow the chain until the end, which must end at
- // a non-empty state, and therefore, a state that is correctly
- // remapped. We are guaranteed to terminate because our compiler
- // never builds a loop among empty states.
- while let CState::Empty { next } = bstates[empty_next] {
- empty_next = next;
- }
- remap[empty_id] = remap[empty_next];
- }
- for state in &mut nfa.states {
- state.remap(&remap);
- }
- // The compiler always begins the NFA at the first state.
- nfa.start = remap[0];
- nfa.byte_classes = byteset.byte_classes();
- }
-
- fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
- match *expr.kind() {
- HirKind::Empty => {
- let id = self.add_empty();
- Ok(ThompsonRef { start: id, end: id })
- }
- HirKind::Literal(hir::Literal::Unicode(ch)) => {
- let mut buf = [0; 4];
- let it = ch
- .encode_utf8(&mut buf)
- .as_bytes()
- .iter()
- .map(|&b| Ok(self.c_range(b, b)));
- self.c_concat(it)
- }
- HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
- HirKind::Class(hir::Class::Bytes(ref cls)) => {
- self.c_byte_class(cls)
- }
- HirKind::Class(hir::Class::Unicode(ref cls)) => {
- self.c_unicode_class(cls)
- }
- HirKind::Repetition(ref rep) => self.c_repetition(rep),
- HirKind::Group(ref group) => self.c(&*group.hir),
- HirKind::Concat(ref exprs) => {
- self.c_concat(exprs.iter().map(|e| self.c(e)))
- }
- HirKind::Alternation(ref exprs) => {
- self.c_alternation(exprs.iter().map(|e| self.c(e)))
- }
- HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
- HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
- }
- }
-
- fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
- where
- I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
- {
- let first =
- if self.config.reverse { it.next_back() } else { it.next() };
- let ThompsonRef { start, mut end } = match first {
- Some(result) => result?,
- None => return Ok(self.c_empty()),
- };
- loop {
- let next =
- if self.config.reverse { it.next_back() } else { it.next() };
- let compiled = match next {
- Some(result) => result?,
- None => break,
- };
- self.patch(end, compiled.start);
- end = compiled.end;
- }
- Ok(ThompsonRef { start, end })
- }
-
- fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
- where
- I: Iterator<Item = Result<ThompsonRef>>,
- {
- let first = it.next().expect("alternations must be non-empty")?;
- let second = match it.next() {
- None => return Ok(first),
- Some(result) => result?,
- };
-
- let union = self.add_union();
- let end = self.add_empty();
- self.patch(union, first.start);
- self.patch(first.end, end);
- self.patch(union, second.start);
- self.patch(second.end, end);
- for result in it {
- let compiled = result?;
- self.patch(union, compiled.start);
- self.patch(compiled.end, end);
- }
- Ok(ThompsonRef { start: union, end })
- }
-
- fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
- match rep.kind {
- hir::RepetitionKind::ZeroOrOne => {
- self.c_zero_or_one(&rep.hir, rep.greedy)
- }
- hir::RepetitionKind::ZeroOrMore => {
- self.c_at_least(&rep.hir, rep.greedy, 0)
- }
- hir::RepetitionKind::OneOrMore => {
- self.c_at_least(&rep.hir, rep.greedy, 1)
- }
- hir::RepetitionKind::Range(ref rng) => match *rng {
- hir::RepetitionRange::Exactly(count) => {
- self.c_exactly(&rep.hir, count)
- }
- hir::RepetitionRange::AtLeast(m) => {
- self.c_at_least(&rep.hir, rep.greedy, m)
- }
- hir::RepetitionRange::Bounded(min, max) => {
- self.c_bounded(&rep.hir, rep.greedy, min, max)
- }
- },
- }
- }
-
- fn c_bounded(
- &self,
- expr: &Hir,
- greedy: bool,
- min: u32,
- max: u32,
- ) -> Result<ThompsonRef> {
- let prefix = self.c_exactly(expr, min)?;
- if min == max {
- return Ok(prefix);
- }
-
- // It is tempting here to compile the rest here as a concatenation
- // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
- // were `aaa?a?a?`. The problem here is that it leads to this program:
- //
- // >000000: 61 => 01
- // 000001: 61 => 02
- // 000002: alt(03, 04)
- // 000003: 61 => 04
- // 000004: alt(05, 06)
- // 000005: 61 => 06
- // 000006: alt(07, 08)
- // 000007: 61 => 08
- // 000008: MATCH
- //
- // And effectively, once you hit state 2, the epsilon closure will
- // include states 3, 4, 5, 6, 7 and 8, which is quite a bit. It is
- // better to instead compile it like so:
- //
- // >000000: 61 => 01
- // 000001: 61 => 02
- // 000002: alt(03, 08)
- // 000003: 61 => 04
- // 000004: alt(05, 08)
- // 000005: 61 => 06
- // 000006: alt(07, 08)
- // 000007: 61 => 08
- // 000008: MATCH
- //
- // So that the epsilon closure of state 2 is now just 3 and 8.
- let empty = self.add_empty();
- let mut prev_end = prefix.end;
- for _ in min..max {
- let union = if greedy {
- self.add_union()
- } else {
- self.add_reverse_union()
- };
- let compiled = self.c(expr)?;
- self.patch(prev_end, union);
- self.patch(union, compiled.start);
- self.patch(union, empty);
- prev_end = compiled.end;
- }
- self.patch(prev_end, empty);
- Ok(ThompsonRef { start: prefix.start, end: empty })
- }
-
- fn c_at_least(
- &self,
- expr: &Hir,
- greedy: bool,
- n: u32,
- ) -> Result<ThompsonRef> {
- if n == 0 {
- let union = if greedy {
- self.add_union()
- } else {
- self.add_reverse_union()
- };
- let compiled = self.c(expr)?;
- self.patch(union, compiled.start);
- self.patch(compiled.end, union);
- Ok(ThompsonRef { start: union, end: union })
- } else if n == 1 {
- let compiled = self.c(expr)?;
- let union = if greedy {
- self.add_union()
- } else {
- self.add_reverse_union()
- };
- self.patch(compiled.end, union);
- self.patch(union, compiled.start);
- Ok(ThompsonRef { start: compiled.start, end: union })
- } else {
- let prefix = self.c_exactly(expr, n - 1)?;
- let last = self.c(expr)?;
- let union = if greedy {
- self.add_union()
- } else {
- self.add_reverse_union()
- };
- self.patch(prefix.end, last.start);
- self.patch(last.end, union);
- self.patch(union, last.start);
- Ok(ThompsonRef { start: prefix.start, end: union })
- }
- }
-
- fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
- let union =
- if greedy { self.add_union() } else { self.add_reverse_union() };
- let compiled = self.c(expr)?;
- let empty = self.add_empty();
- self.patch(union, compiled.start);
- self.patch(union, empty);
- self.patch(compiled.end, empty);
- Ok(ThompsonRef { start: union, end: empty })
- }
-
- fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
- let it = (0..n).map(|_| self.c(expr));
- self.c_concat(it)
- }
-
- fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
- let end = self.add_empty();
- let mut trans = Vec::with_capacity(cls.ranges().len());
- for r in cls.iter() {
- trans.push(Transition {
- start: r.start(),
- end: r.end(),
- next: end,
- });
- }
- Ok(ThompsonRef { start: self.add_sparse(trans), end })
- }
-
- fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
- // If all we have are ASCII ranges wrapped in a Unicode package, then
- // there is zero reason to bring out the big guns. We can fit all ASCII
- // ranges within a single sparse transition.
- if cls.is_all_ascii() {
- let end = self.add_empty();
- let mut trans = Vec::with_capacity(cls.ranges().len());
- for r in cls.iter() {
- assert!(r.start() <= '\x7F');
- assert!(r.end() <= '\x7F');
- trans.push(Transition {
- start: r.start() as u8,
- end: r.end() as u8,
- next: end,
- });
- }
- Ok(ThompsonRef { start: self.add_sparse(trans), end })
- } else if self.config.reverse {
- if !self.config.shrink {
- // When we don't want to spend the extra time shrinking, we
- // compile the UTF-8 automaton in reverse using something like
- // the "naive" approach, but will attempt to re-use common
- // suffixes.
- self.c_unicode_class_reverse_with_suffix(cls)
- } else {
- // When we want to shrink our NFA for reverse UTF-8 automata,
- // we cannot feed UTF-8 sequences directly to the UTF-8
- // compiler, since the UTF-8 compiler requires all sequences
- // to be lexicographically sorted. Instead, we organize our
- // sequences into a range trie, which can then output our
- // sequences in the correct order. Unfortunately, building the
- // range trie is fairly expensive (but not nearly as expensive
- // as building a DFA). Hence the reason why the 'shrink' option
- // exists, so that this path can be toggled off.
- let mut trie = self.trie_state.borrow_mut();
- trie.clear();
-
- for rng in cls.iter() {
- for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
- seq.reverse();
- trie.insert(seq.as_slice());
- }
- }
- let mut utf8_state = self.utf8_state.borrow_mut();
- let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
- trie.iter(|seq| {
- utf8c.add(&seq);
- });
- Ok(utf8c.finish())
- }
- } else {
- // In the forward direction, we always shrink our UTF-8 automata
- // because we can stream it right into the UTF-8 compiler. There
- // is almost no downside (in either memory or time) to using this
- // approach.
- let mut utf8_state = self.utf8_state.borrow_mut();
- let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
- for rng in cls.iter() {
- for seq in Utf8Sequences::new(rng.start(), rng.end()) {
- utf8c.add(seq.as_slice());
- }
- }
- Ok(utf8c.finish())
- }
-
- // For reference, the code below is the "naive" version of compiling a
- // UTF-8 automaton. It is deliciously simple (and works for both the
- // forward and reverse cases), but will unfortunately produce very
- // large NFAs. When compiling a forward automaton, the size difference
- // can sometimes be an order of magnitude. For example, the '\w' regex
- // will generate about ~3000 NFA states using the naive approach below,
- // but only 283 states when using the approach above. This is because
- // the approach above actually compiles a *minimal* (or near minimal,
- // because of the bounded hashmap) UTF-8 automaton.
- //
- // The code below is kept as a reference point in order to make it
- // easier to understand the higher level goal here.
- /*
- let it = cls
- .iter()
- .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
- .map(|seq| {
- let it = seq
- .as_slice()
- .iter()
- .map(|rng| Ok(self.c_range(rng.start, rng.end)));
- self.c_concat(it)
- });
- self.c_alternation(it);
- */
- }
-
- fn c_unicode_class_reverse_with_suffix(
- &self,
- cls: &hir::ClassUnicode,
- ) -> Result<ThompsonRef> {
- // N.B. It would likely be better to cache common *prefixes* in the
- // reverse direction, but it's not quite clear how to do that. The
- // advantage of caching suffixes is that it does give us a win, and
- // has a very small additional overhead.
- let mut cache = self.utf8_suffix.borrow_mut();
- cache.clear();
-
- let union = self.add_union();
- let alt_end = self.add_empty();
- for urng in cls.iter() {
- for seq in Utf8Sequences::new(urng.start(), urng.end()) {
- let mut end = alt_end;
- for brng in seq.as_slice() {
- let key = Utf8SuffixKey {
- from: end,
- start: brng.start,
- end: brng.end,
- };
- let hash = cache.hash(&key);
- if let Some(id) = cache.get(&key, hash) {
- end = id;
- continue;
- }
-
- let compiled = self.c_range(brng.start, brng.end);
- self.patch(compiled.end, end);
- end = compiled.start;
- cache.set(key, hash, end);
- }
- self.patch(union, end);
- }
- }
- Ok(ThompsonRef { start: union, end: alt_end })
- }
-
- fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
- let id = self.add_range(start, end);
- ThompsonRef { start: id, end: id }
- }
-
- fn c_empty(&self) -> ThompsonRef {
- let id = self.add_empty();
- ThompsonRef { start: id, end: id }
- }
-
- fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- greedy: false,
- hir: Box::new(Hir::any(false)),
- }))
- }
-
- fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- greedy: false,
- hir: Box::new(Hir::any(true)),
- }))
- }
-
- fn patch(&self, from: StateID, to: StateID) {
- match self.states.borrow_mut()[from] {
- CState::Empty { ref mut next } => {
- *next = to;
- }
- CState::Range { ref mut range } => {
- range.next = to;
- }
- CState::Sparse { .. } => {
- panic!("cannot patch from a sparse NFA state")
- }
- CState::Union { ref mut alternates } => {
- alternates.push(to);
- }
- CState::UnionReverse { ref mut alternates } => {
- alternates.push(to);
- }
- CState::Match => {}
- }
- }
-
- fn add_empty(&self) -> StateID {
- let id = self.states.borrow().len();
- self.states.borrow_mut().push(CState::Empty { next: 0 });
- id
- }
-
- fn add_range(&self, start: u8, end: u8) -> StateID {
- let id = self.states.borrow().len();
- let trans = Transition { start, end, next: 0 };
- let state = CState::Range { range: trans };
- self.states.borrow_mut().push(state);
- id
- }
-
- fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
- if ranges.len() == 1 {
- let id = self.states.borrow().len();
- let state = CState::Range { range: ranges[0] };
- self.states.borrow_mut().push(state);
- return id;
- }
- let id = self.states.borrow().len();
- let state = CState::Sparse { ranges };
- self.states.borrow_mut().push(state);
- id
- }
-
- fn add_union(&self) -> StateID {
- let id = self.states.borrow().len();
- let state = CState::Union { alternates: vec![] };
- self.states.borrow_mut().push(state);
- id
- }
-
- fn add_reverse_union(&self) -> StateID {
- let id = self.states.borrow().len();
- let state = CState::UnionReverse { alternates: vec![] };
- self.states.borrow_mut().push(state);
- id
- }
-
- fn add_match(&self) -> StateID {
- let id = self.states.borrow().len();
- self.states.borrow_mut().push(CState::Match);
- id
- }
-}
-
-#[derive(Debug)]
-struct Utf8Compiler<'a> {
- nfac: &'a Compiler,
- state: &'a mut Utf8State,
- target: StateID,
-}
-
-#[derive(Clone, Debug)]
-struct Utf8State {
- compiled: Utf8BoundedMap,
- uncompiled: Vec<Utf8Node>,
-}
-
-#[derive(Clone, Debug)]
-struct Utf8Node {
- trans: Vec<Transition>,
- last: Option<Utf8LastTransition>,
-}
-
-#[derive(Clone, Debug)]
-struct Utf8LastTransition {
- start: u8,
- end: u8,
-}
-
-impl Utf8State {
- fn new() -> Utf8State {
- Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] }
- }
-
- fn clear(&mut self) {
- self.compiled.clear();
- self.uncompiled.clear();
- }
-}
-
-impl<'a> Utf8Compiler<'a> {
- fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
- let target = nfac.add_empty();
- state.clear();
- let mut utf8c = Utf8Compiler { nfac, state, target };
- utf8c.add_empty();
- utf8c
- }
-
- fn finish(&mut self) -> ThompsonRef {
- self.compile_from(0);
- let node = self.pop_root();
- let start = self.compile(node);
- ThompsonRef { start, end: self.target }
- }
-
- fn add(&mut self, ranges: &[Utf8Range]) {
- let prefix_len = ranges
- .iter()
- .zip(&self.state.uncompiled)
- .take_while(|&(range, node)| {
- node.last.as_ref().map_or(false, |t| {
- (t.start, t.end) == (range.start, range.end)
- })
- })
- .count();
- assert!(prefix_len < ranges.len());
- self.compile_from(prefix_len);
- self.add_suffix(&ranges[prefix_len..]);
- }
-
- fn compile_from(&mut self, from: usize) {
- let mut next = self.target;
- while from + 1 < self.state.uncompiled.len() {
- let node = self.pop_freeze(next);
- next = self.compile(node);
- }
- self.top_last_freeze(next);
- }
-
- fn compile(&mut self, node: Vec<Transition>) -> StateID {
- let hash = self.state.compiled.hash(&node);
- if let Some(id) = self.state.compiled.get(&node, hash) {
- return id;
- }
- let id = self.nfac.add_sparse(node.clone());
- self.state.compiled.set(node, hash, id);
- id
- }
-
- fn add_suffix(&mut self, ranges: &[Utf8Range]) {
- assert!(!ranges.is_empty());
- let last = self
- .state
- .uncompiled
- .len()
- .checked_sub(1)
- .expect("non-empty nodes");
- assert!(self.state.uncompiled[last].last.is_none());
- self.state.uncompiled[last].last = Some(Utf8LastTransition {
- start: ranges[0].start,
- end: ranges[0].end,
- });
- for r in &ranges[1..] {
- self.state.uncompiled.push(Utf8Node {
- trans: vec![],
- last: Some(Utf8LastTransition { start: r.start, end: r.end }),
- });
- }
- }
-
- fn add_empty(&mut self) {
- self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
- }
-
- fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
- let mut uncompiled = self.state.uncompiled.pop().unwrap();
- uncompiled.set_last_transition(next);
- uncompiled.trans
- }
-
- fn pop_root(&mut self) -> Vec<Transition> {
- assert_eq!(self.state.uncompiled.len(), 1);
- assert!(self.state.uncompiled[0].last.is_none());
- self.state.uncompiled.pop().expect("non-empty nodes").trans
- }
-
- fn top_last_freeze(&mut self, next: StateID) {
- let last = self
- .state
- .uncompiled
- .len()
- .checked_sub(1)
- .expect("non-empty nodes");
- self.state.uncompiled[last].set_last_transition(next);
- }
-}
-
-impl Utf8Node {
- fn set_last_transition(&mut self, next: StateID) {
- if let Some(last) = self.last.take() {
- self.trans.push(Transition {
- start: last.start,
- end: last.end,
- next,
- });
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use regex_syntax::hir::Hir;
- use regex_syntax::ParserBuilder;
-
- use super::{Builder, State, StateID, Transition, NFA};
-
- fn parse(pattern: &str) -> Hir {
- ParserBuilder::new().build().parse(pattern).unwrap()
- }
-
- fn build(pattern: &str) -> NFA {
- Builder::new().anchored(true).build(&parse(pattern)).unwrap()
- }
-
- fn s_byte(byte: u8, next: StateID) -> State {
- let trans = Transition { start: byte, end: byte, next };
- State::Range { range: trans }
- }
-
- fn s_range(start: u8, end: u8, next: StateID) -> State {
- let trans = Transition { start, end, next };
- State::Range { range: trans }
- }
-
- fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
- let ranges = ranges
- .iter()
- .map(|&(start, end, next)| Transition { start, end, next })
- .collect();
- State::Sparse { ranges }
- }
-
- fn s_union(alts: &[StateID]) -> State {
- State::Union { alternates: alts.to_vec().into_boxed_slice() }
- }
-
- fn s_match() -> State {
- State::Match
- }
-
- #[test]
- fn errors() {
- // unsupported anchors
- assert!(Builder::new().build(&parse(r"^")).is_err());
- assert!(Builder::new().build(&parse(r"$")).is_err());
- assert!(Builder::new().build(&parse(r"\A")).is_err());
- assert!(Builder::new().build(&parse(r"\z")).is_err());
-
- // unsupported word boundaries
- assert!(Builder::new().build(&parse(r"\b")).is_err());
- assert!(Builder::new().build(&parse(r"\B")).is_err());
- assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
- }
-
- // Test that building an unanchored NFA has an appropriate `.*?` prefix.
- #[test]
- fn compile_unanchored_prefix() {
- // When the machine can only match valid UTF-8.
- let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
- // There should be many states since the `.` in `.*?` matches any
- // Unicode scalar value.
- assert_eq!(11, nfa.len());
- assert_eq!(nfa.states[10], s_match());
- assert_eq!(nfa.states[9], s_byte(b'a', 10));
-
- // When the machine can match invalid UTF-8.
- let nfa = Builder::new()
- .anchored(false)
- .allow_invalid_utf8(true)
- .build(&parse(r"a"))
- .unwrap();
- assert_eq!(
- nfa.states,
- &[
- s_union(&[2, 1]),
- s_range(0, 255, 0),
- s_byte(b'a', 3),
- s_match(),
- ]
- );
- }
-
- #[test]
- fn compile_empty() {
- assert_eq!(build("").states, &[s_match(),]);
- }
-
- #[test]
- fn compile_literal() {
- assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
- assert_eq!(
- build("ab").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
- );
- assert_eq!(
- build("☃").states,
- &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
- );
-
- // Check that non-UTF-8 literals work.
- let hir = ParserBuilder::new()
- .allow_invalid_utf8(true)
- .build()
- .parse(r"(?-u)\xFF")
- .unwrap();
- let nfa = Builder::new()
- .anchored(true)
- .allow_invalid_utf8(true)
- .build(&hir)
- .unwrap();
- assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
- }
-
- #[test]
- fn compile_class() {
- assert_eq!(
- build(r"[a-z]").states,
- &[s_range(b'a', b'z', 1), s_match(),]
- );
- assert_eq!(
- build(r"[x-za-c]").states,
- &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
- );
- assert_eq!(
- build(r"[\u03B1-\u03B4]").states,
- &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
- );
- assert_eq!(
- build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
- &[
- s_range(0xB1, 0xB4, 5),
- s_range(0x99, 0x9E, 5),
- s_byte(0xA4, 1),
- s_byte(0x9F, 2),
- s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
- s_match(),
- ]
- );
- assert_eq!(
- build(r"[a-z☃]").states,
- &[
- s_byte(0x83, 3),
- s_byte(0x98, 0),
- s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
- s_match(),
- ]
- );
- }
-
- #[test]
- fn compile_repetition() {
- assert_eq!(
- build(r"a?").states,
- &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
- );
- assert_eq!(
- build(r"a??").states,
- &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
- );
- }
-
- #[test]
- fn compile_group() {
- assert_eq!(
- build(r"ab+").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
- );
- assert_eq!(
- build(r"(ab)").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
- );
- assert_eq!(
- build(r"(ab)+").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
- );
- }
-
- #[test]
- fn compile_alternation() {
- assert_eq!(
- build(r"a|b").states,
- &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
- );
- assert_eq!(
- build(r"|b").states,
- &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
- );
- assert_eq!(
- build(r"a|").states,
- &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
- );
- }
-}
diff --git a/vendor/regex-automata/src/nfa/mod.rs b/vendor/regex-automata/src/nfa/mod.rs
index 02d0501de..61ce5ef47 100644
--- a/vendor/regex-automata/src/nfa/mod.rs
+++ b/vendor/regex-automata/src/nfa/mod.rs
@@ -1,252 +1 @@
-use std::fmt;
-
-use classes::ByteClasses;
-pub use nfa::compiler::Builder;
-
-mod compiler;
-mod map;
-mod range_trie;
-
-/// The representation for an NFA state identifier.
-pub type StateID = usize;
-
-/// A final compiled NFA.
-///
-/// The states of the NFA are indexed by state IDs, which are how transitions
-/// are expressed.
-#[derive(Clone)]
-pub struct NFA {
- /// Whether this NFA can only match at the beginning of input or not.
- ///
- /// When true, a match should only be reported if it begins at the 0th
- /// index of the haystack.
- anchored: bool,
- /// The starting state of this NFA.
- start: StateID,
- /// The state list. This list is guaranteed to be indexable by the starting
- /// state ID, and it is also guaranteed to contain exactly one `Match`
- /// state.
- states: Vec<State>,
- /// A mapping from any byte value to its corresponding equivalence class
- /// identifier. Two bytes in the same equivalence class cannot discriminate
- /// between a match or a non-match. This map can be used to shrink the
- /// total size of a DFA's transition table with a small match-time cost.
- ///
- /// Note that the NFA's transitions are *not* defined in terms of these
- /// equivalence classes. The NFA's transitions are defined on the original
- /// byte values. For the most part, this is because they wouldn't really
- /// help the NFA much since the NFA already uses a sparse representation
- /// to represent transitions. Byte classes are most effective in a dense
- /// representation.
- byte_classes: ByteClasses,
-}
-
-impl NFA {
- /// Returns an NFA that always matches at every position.
- pub fn always_match() -> NFA {
- NFA {
- anchored: false,
- start: 0,
- states: vec![State::Match],
- byte_classes: ByteClasses::empty(),
- }
- }
-
- /// Returns an NFA that never matches at any position.
- pub fn never_match() -> NFA {
- NFA {
- anchored: false,
- start: 0,
- states: vec![State::Fail],
- byte_classes: ByteClasses::empty(),
- }
- }
-
- /// Returns true if and only if this NFA is anchored.
- pub fn is_anchored(&self) -> bool {
- self.anchored
- }
-
- /// Return the number of states in this NFA.
- pub fn len(&self) -> usize {
- self.states.len()
- }
-
- /// Return the ID of the initial state of this NFA.
- pub fn start(&self) -> StateID {
- self.start
- }
-
- /// Return the NFA state corresponding to the given ID.
- pub fn state(&self, id: StateID) -> &State {
- &self.states[id]
- }
-
- /// Return the set of equivalence classes for this NFA. The slice returned
- /// always has length 256 and maps each possible byte value to its
- /// corresponding equivalence class ID (which is never more than 255).
- pub fn byte_classes(&self) -> &ByteClasses {
- &self.byte_classes
- }
-}
-
-impl fmt::Debug for NFA {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- for (i, state) in self.states.iter().enumerate() {
- let status = if i == self.start { '>' } else { ' ' };
- writeln!(f, "{}{:06}: {:?}", status, i, state)?;
- }
- Ok(())
- }
-}
-
-/// A state in a final compiled NFA.
-#[derive(Clone, Eq, PartialEq)]
-pub enum State {
- /// A state that transitions to `next` if and only if the current input
- /// byte is in the range `[start, end]` (inclusive).
- ///
- /// This is a special case of Sparse in that it encodes only one transition
- /// (and therefore avoids the allocation).
- Range { range: Transition },
- /// A state with possibly many transitions, represented in a sparse
- /// fashion. Transitions are ordered lexicographically by input range.
- /// As such, this may only be used when every transition has equal
- /// priority. (In practice, this is only used for encoding large UTF-8
- /// automata.)
- Sparse { ranges: Box<[Transition]> },
- /// An alternation such that there exists an epsilon transition to all
- /// states in `alternates`, where matches found via earlier transitions
- /// are preferred over later transitions.
- Union { alternates: Box<[StateID]> },
- /// A fail state. When encountered, the automaton is guaranteed to never
- /// reach a match state.
- Fail,
- /// A match state. There is exactly one such occurrence of this state in
- /// an NFA.
- Match,
-}
-
-/// A transition to another state, only if the given byte falls in the
-/// inclusive range specified.
-#[derive(Clone, Copy, Eq, Hash, PartialEq)]
-pub struct Transition {
- pub start: u8,
- pub end: u8,
- pub next: StateID,
-}
-
-impl State {
- /// Returns true if and only if this state contains one or more epsilon
- /// transitions.
- pub fn is_epsilon(&self) -> bool {
- match *self {
- State::Range { .. }
- | State::Sparse { .. }
- | State::Fail
- | State::Match => false,
- State::Union { .. } => true,
- }
- }
-
- /// Remap the transitions in this state using the given map. Namely, the
- /// given map should be indexed according to the transitions currently
- /// in this state.
- ///
- /// This is used during the final phase of the NFA compiler, which turns
- /// its intermediate NFA into the final NFA.
- fn remap(&mut self, remap: &[StateID]) {
- match *self {
- State::Range { ref mut range } => range.next = remap[range.next],
- State::Sparse { ref mut ranges } => {
- for r in ranges.iter_mut() {
- r.next = remap[r.next];
- }
- }
- State::Union { ref mut alternates } => {
- for alt in alternates.iter_mut() {
- *alt = remap[*alt];
- }
- }
- State::Fail => {}
- State::Match => {}
- }
- }
-}
-
-impl fmt::Debug for State {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- match *self {
- State::Range { ref range } => range.fmt(f),
- State::Sparse { ref ranges } => {
- let rs = ranges
- .iter()
- .map(|t| format!("{:?}", t))
- .collect::<Vec<String>>()
- .join(", ");
- write!(f, "sparse({})", rs)
- }
- State::Union { ref alternates } => {
- let alts = alternates
- .iter()
- .map(|id| format!("{}", id))
- .collect::<Vec<String>>()
- .join(", ");
- write!(f, "alt({})", alts)
- }
- State::Fail => write!(f, "FAIL"),
- State::Match => write!(f, "MATCH"),
- }
- }
-}
-
-impl fmt::Debug for Transition {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let Transition { start, end, next } = *self;
- if self.start == self.end {
- write!(f, "{} => {}", escape(start), next)
- } else {
- write!(f, "{}-{} => {}", escape(start), escape(end), next)
- }
- }
-}
-
-/// Return the given byte as its escaped string form.
-fn escape(b: u8) -> String {
- use std::ascii;
-
- String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use dense;
- use dfa::DFA;
-
- #[test]
- fn always_match() {
- let nfa = NFA::always_match();
- let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
-
- assert_eq!(Some(0), dfa.find_at(b"", 0));
- assert_eq!(Some(0), dfa.find_at(b"a", 0));
- assert_eq!(Some(1), dfa.find_at(b"a", 1));
- assert_eq!(Some(0), dfa.find_at(b"ab", 0));
- assert_eq!(Some(1), dfa.find_at(b"ab", 1));
- assert_eq!(Some(2), dfa.find_at(b"ab", 2));
- }
-
- #[test]
- fn never_match() {
- let nfa = NFA::never_match();
- let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
-
- assert_eq!(None, dfa.find_at(b"", 0));
- assert_eq!(None, dfa.find_at(b"a", 0));
- assert_eq!(None, dfa.find_at(b"a", 1));
- assert_eq!(None, dfa.find_at(b"ab", 0));
- assert_eq!(None, dfa.find_at(b"ab", 1));
- assert_eq!(None, dfa.find_at(b"ab", 2));
- }
-}
+pub mod thompson;
diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs
new file mode 100644
index 000000000..301194005
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs
@@ -0,0 +1,1713 @@
+/*
+This module provides an NFA compiler using Thompson's construction
+algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
+graph as output. The NFA graph is structured in a way that permits it to be
+executed by a virtual machine and also used to efficiently build a DFA.
+
+The compiler deals with a slightly expanded set of NFA states that notably
+includes an empty node that has exactly one epsilon transition to the next
+state. In other words, it's a "goto" instruction if one views Thompson's NFA
+as a set of bytecode instructions. These goto instructions are removed in
+a subsequent phase before returning the NFA to the caller. The purpose of
+these empty nodes is that they make the construction algorithm substantially
+simpler to implement. We remove them before returning to the caller because
+they can represent substantial overhead when traversing the NFA graph
+(either while searching using the NFA directly or while building a DFA).
+
+In the future, it would be nice to provide a Glushkov compiler as well,
+as it would work well as a bit-parallel NFA for smaller regexes. But
+the Thompson construction is one I'm more familiar with and seems more
+straightforward to deal with when it comes to large Unicode character
+classes.
+
+Internally, the compiler uses interior mutability to improve composition
+in the face of the borrow checker. In particular, we'd really like to be
+able to write things like this:
+
+ self.c_concat(exprs.iter().map(|e| self.c(e)))
+
+Which elegantly uses iterators to build up a sequence of compiled regex
+sub-expressions and then hands it off to the concatenating compiler
+routine. Without interior mutability, the borrow checker won't let us
+borrow `self` mutably both inside and outside the closure at the same
+time.
+*/
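+
+// As a minimal sketch of the interior mutability pattern described above
+// (illustrative only; the types below are hypothetical and not part of this
+// module):
+//
+//     use std::cell::RefCell;
+//
+//     struct Sketch {
+//         states: RefCell<Vec<&'static str>>,
+//     }
+//
+//     impl Sketch {
+//         // `&self`, not `&mut self`: mutation goes through the RefCell.
+//         fn add(&self, s: &'static str) -> usize {
+//             let mut states = self.states.borrow_mut();
+//             states.push(s);
+//             states.len() - 1
+//         }
+//
+//         // The closure below borrows `self` while `add` is also called on
+//         // `self`; with `&mut self` methods this would not borrow-check.
+//         fn add_all<I>(&self, it: I) -> Vec<usize>
+//         where
+//             I: Iterator<Item = &'static str>,
+//         {
+//             it.map(|s| self.add(s)).collect()
+//         }
+//     }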
+
+use core::{
+ borrow::Borrow,
+ cell::{Cell, RefCell},
+ mem,
+};
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use regex_syntax::{
+ hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary},
+ utf8::{Utf8Range, Utf8Sequences},
+ ParserBuilder,
+};
+
+use crate::{
+ nfa::thompson::{
+ error::Error,
+ map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
+ range_trie::RangeTrie,
+ Look, SparseTransitions, State, Transition, NFA,
+ },
+ util::{
+ alphabet::ByteClassSet,
+ id::{IteratorIDExt, PatternID, StateID},
+ },
+};
+
+/// The configuration used for compiling a Thompson NFA from a regex pattern.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+ reverse: Option<bool>,
+ utf8: Option<bool>,
+ nfa_size_limit: Option<Option<usize>>,
+ shrink: Option<bool>,
+ captures: Option<bool>,
+ #[cfg(test)]
+ unanchored_prefix: Option<bool>,
+}
+
+impl Config {
+ /// Return a new default Thompson NFA compiler configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Reverse the NFA.
+ ///
+ /// An NFA reversal is performed by reversing all of the concatenated
+ /// sub-expressions in the original pattern, recursively. The resulting
+ /// NFA can be used to match the pattern starting from the end of a string
+ /// instead of the beginning of a string.
+ ///
+ /// Reversing the NFA is useful for building a reverse DFA, which is most
+ /// useful for finding the start of a match after its ending position has
+ /// been found.
+ ///
+ /// This is disabled by default.
+ pub fn reverse(mut self, yes: bool) -> Config {
+ self.reverse = Some(yes);
+ self
+ }
+
+ /// Whether to enable UTF-8 mode or not.
+ ///
+ /// When UTF-8 mode is enabled (which is the default), unanchored searches
+ /// will only match through valid UTF-8. If invalid UTF-8 is seen, then
+ /// an unanchored search will stop at that point. This is equivalent to
+ /// putting a `(?s:.)*?` at the start of the regex.
+ ///
+ /// When UTF-8 mode is disabled, then unanchored searches will match
+ /// through any arbitrary byte. This is equivalent to putting a
+ /// `(?s-u:.)*?` at the start of the regex.
+ ///
+ /// Generally speaking, UTF-8 mode should only be used when you know you
+ /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used
+ /// on input that is not valid UTF-8, then the regex is not likely to work
+ /// as expected.
+ ///
+ /// This is enabled by default.
+ pub fn utf8(mut self, yes: bool) -> Config {
+ self.utf8 = Some(yes);
+ self
+ }
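+
+    // A quick sketch of toggling this knob, using the same builder API as
+    // the `nfa_size_limit` example below (illustrative only):
+    //
+    //     use regex_automata::nfa::thompson::NFA;
+    //
+    //     // Unanchored searches may now match through arbitrary bytes,
+    //     // i.e., a `(?s-u:.)*?` prefix.
+    //     let nfa = NFA::builder()
+    //         .configure(NFA::config().utf8(false))
+    //         .build(r"a")?;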
+
+ /// Sets an approximate size limit on the total heap used by the NFA being
+ /// compiled.
+ ///
+ /// This permits imposing constraints on the size of a compiled NFA. This
+ /// may be useful in contexts where the regex pattern is untrusted and one
+ /// wants to avoid using too much memory.
+ ///
+ /// This size limit does not apply to auxiliary heap used during
+ /// compilation that is not part of the built NFA.
+ ///
+ /// Note that this size limit is applied during compilation in order for
+ /// the limit to prevent too much heap from being used. However, the
+ /// implementation may use an intermediate NFA representation that is
+ /// slightly bigger than the final public form. Since the size
+ /// limit may be applied to an intermediate representation, there is not
+ /// necessarily a precise correspondence between the configured size limit
+ /// and the heap usage of the final NFA.
+ ///
+ /// There is no size limit by default.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates how Unicode mode can greatly increase the
+ /// size of the NFA.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// // 300KB isn't enough!
+ /// NFA::builder()
+ /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 400KB probably is.
+ /// let nfa = NFA::builder()
+ /// .configure(NFA::config().nfa_size_limit(Some(400_000)))
+ /// .build(r"\w{20}")?;
+ ///
+ /// assert_eq!(nfa.pattern_len(), 1);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
+ self.nfa_size_limit = Some(bytes);
+ self
+ }
+
+ /// Apply best effort heuristics to shrink the NFA at the expense of more
+ /// time/memory.
+ ///
+ /// Generally speaking, if one is using an NFA
+ /// to compile a DFA, then the extra time used to shrink the NFA will be
+ /// more than made up for during DFA construction (potentially by a lot).
+ /// In other words, enabling this can substantially decrease the overall
+ /// amount of time it takes to build a DFA.
+ ///
+ /// The only reason to disable this is if you want to compile an NFA and
+ /// start using it as quickly as possible without needing to build a DFA,
+ /// e.g., for an NFA simulation or for a lazy DFA.
+ ///
+ /// This is enabled by default.
+ pub fn shrink(mut self, yes: bool) -> Config {
+ self.shrink = Some(yes);
+ self
+ }
+
+ /// Whether to include 'Capture' states in the NFA.
+ ///
+ /// This can only be enabled when compiling a forward NFA. This is
+ /// always disabled---with no way to override it---when the `reverse`
+ /// configuration is enabled.
+ ///
+ /// This is enabled by default.
+ pub fn captures(mut self, yes: bool) -> Config {
+ self.captures = Some(yes);
+ self
+ }
+
+ /// Whether to compile an unanchored prefix into this NFA.
+ ///
+ /// This is enabled by default. It is made available for tests only to make
+ /// it easier to unit test the output of the compiler.
+ #[cfg(test)]
+ fn unanchored_prefix(mut self, yes: bool) -> Config {
+ self.unanchored_prefix = Some(yes);
+ self
+ }
+
+ pub fn get_reverse(&self) -> bool {
+ self.reverse.unwrap_or(false)
+ }
+
+ pub fn get_utf8(&self) -> bool {
+ self.utf8.unwrap_or(true)
+ }
+
+ pub fn get_nfa_size_limit(&self) -> Option<usize> {
+ self.nfa_size_limit.unwrap_or(None)
+ }
+
+ pub fn get_shrink(&self) -> bool {
+ self.shrink.unwrap_or(true)
+ }
+
+ pub fn get_captures(&self) -> bool {
+ !self.get_reverse() && self.captures.unwrap_or(true)
+ }
+
+ fn get_unanchored_prefix(&self) -> bool {
+ #[cfg(test)]
+ {
+ self.unanchored_prefix.unwrap_or(true)
+ }
+ #[cfg(not(test))]
+ {
+ true
+ }
+ }
+
+ pub(crate) fn overwrite(self, o: Config) -> Config {
+ Config {
+ reverse: o.reverse.or(self.reverse),
+ utf8: o.utf8.or(self.utf8),
+ nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
+ shrink: o.shrink.or(self.shrink),
+ captures: o.captures.or(self.captures),
+ #[cfg(test)]
+ unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
+ }
+ }
+}
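+
+// For example, under the `overwrite` semantics above, options set in `o`
+// take precedence and unset options fall back to `self` (a sketch):
+//
+//     let a = Config::new().utf8(false).shrink(true);
+//     let b = Config::new().shrink(false);
+//     let c = a.overwrite(b);
+//     assert!(!c.get_utf8());   // inherited from `a`
+//     assert!(!c.get_shrink()); // overridden by `b`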
+
+/// A builder for compiling an NFA.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ parser: ParserBuilder,
+}
+
+impl Builder {
+ /// Create a new NFA builder with its default configuration.
+ pub fn new() -> Builder {
+ Builder { config: Config::default(), parser: ParserBuilder::new() }
+ }
+
+ /// Compile the given regular expression into an NFA.
+ ///
+ /// If there was a problem parsing the regex, then that error is returned.
+ ///
+ /// Otherwise, if there was a problem building the NFA, then an error is
+ /// returned. The only error that can occur is if the compiled regex would
+ /// exceed the size limits configured on this builder.
+ pub fn build(&self, pattern: &str) -> Result<NFA, Error> {
+ self.build_many(&[pattern])
+ }
+
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<NFA, Error> {
+ let mut hirs = vec![];
+ for p in patterns {
+ hirs.push(
+ self.parser
+ .build()
+ .parse(p.as_ref())
+ .map_err(Error::syntax)?,
+ );
+ log!(log::trace!("parsed: {:?}", p.as_ref()));
+ }
+ self.build_many_from_hir(&hirs)
+ }
+
+ /// Compile the given high level intermediate representation of a regular
+ /// expression into an NFA.
+ ///
+ /// If there was a problem building the NFA, then an error is returned. The
+ /// only error that can occur is if the compiled regex would exceed the
+ /// size limits configured on this builder.
+ pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> {
+ self.build_from_hir_with(&mut Compiler::new(), expr)
+ }
+
+ pub fn build_many_from_hir<H: Borrow<Hir>>(
+ &self,
+ exprs: &[H],
+ ) -> Result<NFA, Error> {
+ self.build_many_from_hir_with(&mut Compiler::new(), exprs)
+ }
+
+ /// Compile the given high level intermediate representation of a regular
+ /// expression into an NFA using the given compiler. Callers may prefer
+ /// this over `build` if they would like to reuse the compiler's
+ /// allocations while compiling many regular expressions.
+ ///
+ /// On success, a freshly built NFA is returned.
+ ///
+ /// If there was a problem building the NFA, then an error is returned.
+ /// The only error that can occur is if the compiled regex would exceed
+ /// the size limits configured on this builder. When an error is returned,
+ /// the state of the given compiler is unspecified and should not be
+ /// relied upon. However, it can still be reused in subsequent calls to
+ /// this method.
+ fn build_from_hir_with(
+ &self,
+ compiler: &mut Compiler,
+ expr: &Hir,
+ ) -> Result<NFA, Error> {
+ self.build_many_from_hir_with(compiler, &[expr])
+ }
+
+ fn build_many_from_hir_with<H: Borrow<Hir>>(
+ &self,
+ compiler: &mut Compiler,
+ exprs: &[H],
+ ) -> Result<NFA, Error> {
+ compiler.configure(self.config);
+ compiler.compile(exprs)
+ }
+
+ /// Apply the given NFA configuration options to this builder.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`SyntaxConfig`](../../struct.SyntaxConfig.html).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// This syntax configuration generally only applies when an NFA is built
+ /// directly from a pattern string. If an NFA is built from an HIR, then
+ /// all syntax settings are ignored.
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::SyntaxConfig,
+ ) -> &mut Builder {
+ config.apply(&mut self.parser);
+ self
+ }
+}
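+
+// A minimal end-to-end sketch of this builder (assuming `Builder` and
+// `Config` are re-exported at `nfa::thompson`; illustrative only):
+//
+//     use regex_automata::nfa::thompson::{Builder, Config};
+//
+//     let nfa = Builder::new()
+//         .configure(Config::new().shrink(false).captures(false))
+//         .build(r"[a-z]+")?;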
+
+/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
+/// construction. Namely, this compiler permits epsilon transitions between
+/// states.
+#[derive(Clone, Debug)]
+pub struct Compiler {
+ /// The configuration from the builder.
+ config: Config,
+ /// The final NFA that is built.
+ ///
+ /// Parts of this NFA are constructed during compilation, but the actual
+ /// states aren't added until a final "finish" step. This is because the
+ /// states constructed during compilation have unconditional epsilon
+ /// transitions, which makes the logic of compilation much simpler. The
+ /// "finish" step removes these unconditional epsilon transitions and must
+ /// therefore remap all of the transition state IDs.
+ nfa: RefCell<NFA>,
+ /// The set of compiled NFA states. Once a state is compiled, it is
+ /// assigned a state ID equivalent to its index in this list. Subsequent
+ /// compilation can modify previous states by adding new transitions.
+ states: RefCell<Vec<CState>>,
+ /// State used for compiling character classes to UTF-8 byte automata.
+ /// State is not retained between character class compilations. This just
+ /// serves to amortize allocation to the extent possible.
+ utf8_state: RefCell<Utf8State>,
+ /// State used for arranging character classes in reverse into a trie.
+ trie_state: RefCell<RangeTrie>,
+ /// State used for caching common suffixes when compiling reverse UTF-8
+ /// automata (for Unicode character classes).
+ utf8_suffix: RefCell<Utf8SuffixMap>,
+ /// A map used to re-map state IDs when translating the compiler's internal
+ /// NFA state representation to the external NFA representation.
+ remap: RefCell<Vec<StateID>>,
+ /// A set of compiler internal state IDs that correspond to states that are
+ /// exclusively epsilon transitions, i.e., goto instructions, combined with
+ /// the state that they point to. This is used to record said states while
+ /// transforming the compiler's internal NFA representation to the external
+ /// form.
+ empties: RefCell<Vec<(StateID, StateID)>>,
+ /// The total memory used by each of the 'CState's in 'states'. This only
+ /// includes heap usage by each state, and not the size of the state
+ /// itself.
+ memory_cstates: Cell<usize>,
+}
+
+/// A compiler intermediate state representation for an NFA that is only used
+/// during compilation. Once compilation is done, `CState`s are converted
+/// to `State`s (defined in the parent module), which have a much simpler
+/// representation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum CState {
+ /// An empty state whose only purpose is to forward the automaton to
+ /// another state via an epsilon transition. These are useful during
+ /// compilation but are otherwise removed at the end.
+ Empty {
+ next: StateID,
+ },
+ /// An empty state that records a capture location.
+ ///
+ /// From the perspective of finite automata, this is precisely equivalent
+ /// to 'Empty', but serves the purpose of instructing NFA simulations to
+ /// record additional state when the finite state machine passes through
+ /// this epsilon transition.
+ ///
+ /// These transitions are treated as epsilon transitions with no additional
+ /// effects in DFAs.
+ ///
+ /// 'slot' in this context refers to the specific capture group offset that
+ /// is being recorded. Each capturing group has two slots corresponding to
+ /// the start and end of the matching portion of that group.
+ CaptureStart {
+ next: StateID,
+ capture_index: u32,
+ name: Option<Arc<str>>,
+ },
+ CaptureEnd {
+ next: StateID,
+ capture_index: u32,
+ },
+ /// A state that only transitions to `next` if the current input byte is
+ /// in the range `[start, end]` (inclusive on both ends).
+ Range {
+ range: Transition,
+ },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions are ordered lexicographically by input range.
+ /// As such, this may only be used when every transition has equal
+ /// priority. (In practice, this is only used for encoding large UTF-8
+ /// automata.) In contrast, a `Union` state has each alternate in order
+ /// of priority. Priority is used to implement greedy matching and also
+ /// alternations themselves, e.g., `abc|a` where `abc` has priority over
+ /// `a`.
+ ///
+ /// To clarify, it is possible to remove `Sparse` and represent all things
+ /// that `Sparse` is used for via `Union`. But this creates a more bloated
+ /// NFA with more epsilon transitions than is necessary in the special case
+ /// of character classes.
+ Sparse {
+ ranges: Vec<Transition>,
+ },
+ /// A conditional epsilon transition satisfied via some sort of
+ /// look-around.
+ Look {
+ look: Look,
+ next: StateID,
+ },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union {
+ alternates: Vec<StateID>,
+ },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via later transitions are
+ /// preferred over earlier transitions.
+ ///
+ /// This "reverse" state exists for convenience during compilation that
+ /// permits easy construction of non-greedy combinations of NFA states. At
+ /// the end of compilation, Union and UnionReverse states are merged into
+ /// one Union type of state, where the latter has its epsilon transitions
+ /// reversed to reflect the priority inversion.
+ ///
+ /// The "convenience" here arises from the fact that as new states are
+ /// added to the list of `alternates`, we would like that add operation
+ /// to be amortized constant time. But if we used a `Union`, we'd need to
+ /// prepend the state, which takes O(n) time. There are other approaches we
+ /// could use to solve this, but this seems simple enough.
+ UnionReverse {
+ alternates: Vec<StateID>,
+ },
+ /// A match state. There is at most one such occurrence of this state in
+ /// an NFA for each pattern compiled into the NFA. At time of writing, a
+ /// match state is always produced for every pattern given, but in theory,
+ /// if a pattern can never lead to a match, then the match state could be
+ /// omitted.
+ ///
+ /// `id` refers to the ID of the pattern itself, which corresponds to the
+ /// pattern's index (starting at 0). `start_id` refers to the anchored
+ /// NFA starting state corresponding to this pattern.
+ Match {
+ pattern_id: PatternID,
+ start_id: StateID,
+ },
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub struct ThompsonRef {
+ start: StateID,
+ end: StateID,
+}
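+
+// For intuition, the compiler composes these sub-graphs by patching. E.g.,
+// concatenating `x` and `y` (see `c_concat` below) patches `x.end` into
+// `y.start`:
+//
+//     x:  (x.start) --...--> (x.end)
+//     y:  (y.start) --...--> (y.end)
+//     xy: (x.start) --...--> (x.end) --ε--> (y.start) --...--> (y.end)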
+
+impl Compiler {
+ /// Create a new compiler.
+ pub fn new() -> Compiler {
+ Compiler {
+ config: Config::default(),
+ nfa: RefCell::new(NFA::empty()),
+ states: RefCell::new(vec![]),
+ utf8_state: RefCell::new(Utf8State::new()),
+ trie_state: RefCell::new(RangeTrie::new()),
+ utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+ remap: RefCell::new(vec![]),
+ empties: RefCell::new(vec![]),
+ memory_cstates: Cell::new(0),
+ }
+ }
+
+ /// Configure and prepare this compiler from the builder's knobs.
+ ///
+ /// The compiler must always be reconfigured by the builder before using
+ /// it to build an NFA. Namely, this also clears any latent state in the
+ /// compiler left over from previous compilations.
+ fn configure(&mut self, config: Config) {
+ self.config = config;
+ self.nfa.borrow_mut().clear();
+ self.states.borrow_mut().clear();
+ self.memory_cstates.set(0);
+ // We don't need to clear anything else since they are cleared on
+ // their own and only when they are used.
+ }
+
+ /// Convert the current intermediate NFA to its final compiled form.
+ fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> {
+ if exprs.is_empty() {
+ return Ok(NFA::never_match());
+ }
+ if exprs.len() > PatternID::LIMIT {
+ return Err(Error::too_many_patterns(exprs.len()));
+ }
+
+ // We always add an unanchored prefix unless we were specifically told
+ // not to (for tests only), or if we know that the regex is anchored
+ // for all matches. When an unanchored prefix is not added, then the
+ // NFA's anchored and unanchored start states are equivalent.
+ let all_anchored =
+ exprs.iter().all(|e| e.borrow().is_anchored_start());
+ let anchored = !self.config.get_unanchored_prefix() || all_anchored;
+ let unanchored_prefix = if anchored {
+ self.c_empty()?
+ } else {
+ if self.config.get_utf8() {
+ self.c_unanchored_prefix_valid_utf8()?
+ } else {
+ self.c_unanchored_prefix_invalid_utf8()?
+ }
+ };
+
+ let compiled = self.c_alternation(
+ exprs.iter().with_pattern_ids().map(|(pid, e)| {
+ let group_kind = hir::GroupKind::CaptureIndex(0);
+ let one = self.c_group(&group_kind, e.borrow())?;
+ let match_state_id = self.add_match(pid, one.start)?;
+ self.patch(one.end, match_state_id)?;
+ Ok(ThompsonRef { start: one.start, end: match_state_id })
+ }),
+ )?;
+ self.patch(unanchored_prefix.end, compiled.start)?;
+ self.finish(compiled.start, unanchored_prefix.start)?;
+ Ok(self.nfa.replace(NFA::empty()))
+ }
+
+ /// Finishes the compilation process and populates the NFA attached to this
+ /// compiler with the final graph.
+ fn finish(
+ &self,
+ start_anchored: StateID,
+ start_unanchored: StateID,
+ ) -> Result<(), Error> {
+ trace!(
+ "intermediate NFA compilation complete, \
+ intermediate NFA size: {} states, {} bytes on heap",
+ self.states.borrow().len(),
+ self.nfa_memory_usage(),
+ );
+ let mut nfa = self.nfa.borrow_mut();
+ let mut bstates = self.states.borrow_mut();
+ let mut remap = self.remap.borrow_mut();
+ let mut empties = self.empties.borrow_mut();
+ remap.resize(bstates.len(), StateID::ZERO);
+ empties.clear();
+
+ // The idea here is to convert our intermediate states to their final
+ // form. The only real complexity here is the process of converting
+ // transitions, which are expressed in terms of state IDs. The new
+ // set of states will be smaller because of partial epsilon removal,
+ // so the state IDs will not be the same.
+ for (sid, bstate) in bstates.iter_mut().with_state_ids() {
+ match *bstate {
+ CState::Empty { next } => {
+ // Since we're removing empty states, we need to handle
+ // them later since we don't yet know which new state this
+ // empty state will be mapped to.
+ empties.push((sid, next));
+ }
+ CState::CaptureStart { next, capture_index, ref name } => {
+ // We can't remove this empty state because of the side
+ // effect of capturing an offset for this capture slot.
+ remap[sid] = nfa.add_capture_start(
+ next,
+ capture_index,
+ name.clone(),
+ )?;
+ }
+ CState::CaptureEnd { next, capture_index } => {
+ // We can't remove this empty state because of the side
+ // effect of capturing an offset for this capture slot.
+ remap[sid] = nfa.add_capture_end(next, capture_index)?;
+ }
+ CState::Range { range } => {
+ remap[sid] = nfa.add_range(range)?;
+ }
+ CState::Sparse { ref mut ranges } => {
+ let ranges =
+ mem::replace(ranges, vec![]).into_boxed_slice();
+ remap[sid] =
+ nfa.add_sparse(SparseTransitions { ranges })?;
+ }
+ CState::Look { look, next } => {
+ remap[sid] = nfa.add_look(next, look)?;
+ }
+ CState::Union { ref mut alternates } => {
+ let alternates =
+ mem::replace(alternates, vec![]).into_boxed_slice();
+ remap[sid] = nfa.add_union(alternates)?;
+ }
+ CState::UnionReverse { ref mut alternates } => {
+ let mut alternates =
+ mem::replace(alternates, vec![]).into_boxed_slice();
+ alternates.reverse();
+ remap[sid] = nfa.add_union(alternates)?;
+ }
+ CState::Match { start_id, .. } => {
+ remap[sid] = nfa.add_match()?;
+ nfa.finish_pattern(start_id)?;
+ }
+ }
+ }
+ for &(empty_id, mut empty_next) in empties.iter() {
+ // empty states can point to other empty states, forming a chain.
+ // So we must follow the chain until the end, which must end at
+ // a non-empty state, and therefore, a state that is correctly
+ // remapped. We are guaranteed to terminate because our compiler
+ // never builds a loop among only empty states.
+ while let CState::Empty { next } = bstates[empty_next] {
+ empty_next = next;
+ }
+ remap[empty_id] = remap[empty_next];
+ }
+ nfa.set_start_anchored(start_anchored);
+ nfa.set_start_unanchored(start_unanchored);
+ nfa.remap(&remap);
+ trace!(
+ "final NFA (reverse? {:?}) compilation complete, \
+ final NFA size: {} states, {} bytes on heap",
+ self.config.get_reverse(),
+ nfa.states().len(),
+ nfa.memory_usage(),
+ );
+ Ok(())
+ }
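+
+    // To make the `empties` pass above concrete: if state 4 is
+    // `Empty { next: 7 }` and state 7 is `Empty { next: 2 }`, where state 2
+    // is a non-empty state remapped to some new ID `n`, then following the
+    // chain remaps both 4 and 7 to `n`.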
+
+ fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> {
+ match *expr.kind() {
+ HirKind::Empty => self.c_empty(),
+ HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch),
+ HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b),
+ HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c),
+ HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
+ HirKind::Anchor(ref anchor) => self.c_anchor(anchor),
+ HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb),
+ HirKind::Repetition(ref rep) => self.c_repetition(rep),
+ HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir),
+ HirKind::Concat(ref es) => {
+ self.c_concat(es.iter().map(|e| self.c(e)))
+ }
+ HirKind::Alternation(ref es) => {
+ self.c_alternation(es.iter().map(|e| self.c(e)))
+ }
+ }
+ }
+
+ fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+ where
+ I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>,
+ {
+ let first = if self.is_reverse() { it.next_back() } else { it.next() };
+ let ThompsonRef { start, mut end } = match first {
+ Some(result) => result?,
+ None => return self.c_empty(),
+ };
+ loop {
+ let next =
+ if self.is_reverse() { it.next_back() } else { it.next() };
+ let compiled = match next {
+ Some(result) => result?,
+ None => break,
+ };
+ self.patch(end, compiled.start)?;
+ end = compiled.end;
+ }
+ Ok(ThompsonRef { start, end })
+ }
+
+ fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+ where
+ I: Iterator<Item = Result<ThompsonRef, Error>>,
+ {
+ let first = it.next().expect("alternations must be non-empty")?;
+ let second = match it.next() {
+ None => return Ok(first),
+ Some(result) => result?,
+ };
+
+ let union = self.add_union()?;
+ let end = self.add_empty()?;
+ self.patch(union, first.start)?;
+ self.patch(first.end, end)?;
+ self.patch(union, second.start)?;
+ self.patch(second.end, end)?;
+ for result in it {
+ let compiled = result?;
+ self.patch(union, compiled.start)?;
+ self.patch(compiled.end, end)?;
+ }
+ Ok(ThompsonRef { start: union, end })
+ }
+
+ fn c_group(
+ &self,
+ kind: &hir::GroupKind,
+ expr: &Hir,
+ ) -> Result<ThompsonRef, Error> {
+ if !self.config.get_captures() {
+ return self.c(expr);
+ }
+ let (capi, name) = match *kind {
+ hir::GroupKind::NonCapturing => return self.c(expr),
+ hir::GroupKind::CaptureIndex(index) => (index, None),
+ hir::GroupKind::CaptureName { ref name, index } => {
+ (index, Some(Arc::from(&**name)))
+ }
+ };
+
+ let start = self.add_capture_start(capi, name)?;
+ let inner = self.c(expr)?;
+ let end = self.add_capture_end(capi)?;
+
+ self.patch(start, inner.start)?;
+ self.patch(inner.end, end)?;
+ Ok(ThompsonRef { start, end })
+ }
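+
+    // The sub-graph produced for a capturing group has the shape
+    //
+    //     (CaptureStart) --ε--> inner --ε--> (CaptureEnd)
+    //
+    // where the two capture states record the group's start and end offsets
+    // whenever an NFA simulation passes through them (see `CState` above).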
+
+ fn c_repetition(
+ &self,
+ rep: &hir::Repetition,
+ ) -> Result<ThompsonRef, Error> {
+ match rep.kind {
+ hir::RepetitionKind::ZeroOrOne => {
+ self.c_zero_or_one(&rep.hir, rep.greedy)
+ }
+ hir::RepetitionKind::ZeroOrMore => {
+ self.c_at_least(&rep.hir, rep.greedy, 0)
+ }
+ hir::RepetitionKind::OneOrMore => {
+ self.c_at_least(&rep.hir, rep.greedy, 1)
+ }
+ hir::RepetitionKind::Range(ref rng) => match *rng {
+ hir::RepetitionRange::Exactly(count) => {
+ self.c_exactly(&rep.hir, count)
+ }
+ hir::RepetitionRange::AtLeast(m) => {
+ self.c_at_least(&rep.hir, rep.greedy, m)
+ }
+ hir::RepetitionRange::Bounded(min, max) => {
+ self.c_bounded(&rep.hir, rep.greedy, min, max)
+ }
+ },
+ }
+ }
+
+ fn c_bounded(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ min: u32,
+ max: u32,
+ ) -> Result<ThompsonRef, Error> {
+ let prefix = self.c_exactly(expr, min)?;
+ if min == max {
+ return Ok(prefix);
+ }
+
+ // It is tempting here to compile the rest here as a concatenation
+ // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
+ // were `aaa?a?a?`. The problem here is that it leads to this program:
+ //
+ // >000000: 61 => 01
+ // 000001: 61 => 02
+ // 000002: union(03, 04)
+ // 000003: 61 => 04
+ // 000004: union(05, 06)
+ // 000005: 61 => 06
+ // 000006: union(07, 08)
+ // 000007: 61 => 08
+ // 000008: MATCH
+ //
+ // And effectively, once you hit state 2, the epsilon closure will
+ // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
+ // to instead compile it like so:
+ //
+ // >000000: 61 => 01
+ // 000001: 61 => 02
+ // 000002: union(03, 08)
+ // 000003: 61 => 04
+ // 000004: union(05, 08)
+ // 000005: 61 => 06
+ // 000006: union(07, 08)
+ // 000007: 61 => 08
+ // 000008: MATCH
+ //
+ // So that the epsilon closure of state 2 is now just 3 and 8.
+ let empty = self.add_empty()?;
+ let mut prev_end = prefix.end;
+ for _ in min..max {
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ }?;
+ let compiled = self.c(expr)?;
+ self.patch(prev_end, union)?;
+ self.patch(union, compiled.start)?;
+ self.patch(union, empty)?;
+ prev_end = compiled.end;
+ }
+ self.patch(prev_end, empty)?;
+ Ok(ThompsonRef { start: prefix.start, end: empty })
+ }
+
+ fn c_at_least(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ n: u32,
+ ) -> Result<ThompsonRef, Error> {
+ if n == 0 {
+ // When the expression cannot match the empty string, then we
+ // can get away with something much simpler: just one 'alt'
+ // instruction that optionally repeats itself. But if the expr
+ // can match the empty string... see below.
+ if !expr.is_match_empty() {
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ }?;
+ let compiled = self.c(expr)?;
+ self.patch(union, compiled.start)?;
+ self.patch(compiled.end, union)?;
+ return Ok(ThompsonRef { start: union, end: union });
+ }
+
+ // What's going on here? Shouldn't x* be simpler than this? It
+ // turns out that when implementing leftmost-first (Perl-like)
+ // match semantics, x* results in an incorrect preference order
+ // when computing the transitive closure of states if and only if
+ // 'x' can match the empty string. So instead, we compile x* as
+ // (x+)?, which preserves the correct preference order.
+ //
+ // See: https://github.com/rust-lang/regex/issues/779
+ let compiled = self.c(expr)?;
+ let plus = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ }?;
+ self.patch(compiled.end, plus)?;
+ self.patch(plus, compiled.start)?;
+
+ let question = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ }?;
+ let empty = self.add_empty()?;
+ self.patch(question, compiled.start)?;
+ self.patch(question, empty)?;
+ self.patch(plus, empty)?;
+ Ok(ThompsonRef { start: question, end: empty })
+ } else if n == 1 {
+ let compiled = self.c(expr)?;
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ }?;
+ self.patch(compiled.end, union)?;
+ self.patch(union, compiled.start)?;
+ Ok(ThompsonRef { start: compiled.start, end: union })
+ } else {
+ let prefix = self.c_exactly(expr, n - 1)?;
+ let last = self.c(expr)?;
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_reverse_union()
+ }?;
+ self.patch(prefix.end, last.start)?;
+ self.patch(last.end, union)?;
+ self.patch(union, last.start)?;
+ Ok(ThompsonRef { start: prefix.start, end: union })
+ }
+ }
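+
+    // To visualize the `n == 0` case above when `expr` can match the empty
+    // string: `x*` is compiled as `(x+)?`, which looks roughly like
+    //
+    //     question: union(x.start, empty)    <- returned start
+    //     x:        ... => plus
+    //     plus:     union(x.start, empty)
+    //     empty:                             <- returned end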
+
+ fn c_zero_or_one(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ ) -> Result<ThompsonRef, Error> {
+ let union =
+ if greedy { self.add_union() } else { self.add_reverse_union() }?;
+ let compiled = self.c(expr)?;
+ let empty = self.add_empty()?;
+ self.patch(union, compiled.start)?;
+ self.patch(union, empty)?;
+ self.patch(compiled.end, empty)?;
+ Ok(ThompsonRef { start: union, end: empty })
+ }
+
+ fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> {
+ let it = (0..n).map(|_| self.c(expr));
+ self.c_concat(it)
+ }
+
+ fn c_byte_class(
+ &self,
+ cls: &hir::ClassBytes,
+ ) -> Result<ThompsonRef, Error> {
+ let end = self.add_empty()?;
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ trans.push(Transition {
+ start: r.start(),
+ end: r.end(),
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
+ }
+
+ fn c_unicode_class(
+ &self,
+ cls: &hir::ClassUnicode,
+ ) -> Result<ThompsonRef, Error> {
+ // If all we have are ASCII ranges wrapped in a Unicode package, then
+ // there is zero reason to bring out the big guns. We can fit all ASCII
+ // ranges within a single sparse state.
+ if cls.is_all_ascii() {
+ let end = self.add_empty()?;
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ assert!(r.start() <= '\x7F');
+ assert!(r.end() <= '\x7F');
+ trans.push(Transition {
+ start: r.start() as u8,
+ end: r.end() as u8,
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
+ } else if self.is_reverse() {
+ if !self.config.get_shrink() {
+ // When we don't want to spend the extra time shrinking, we
+ // compile the UTF-8 automaton in reverse using something like
+ // the "naive" approach, but will attempt to re-use common
+ // suffixes.
+ self.c_unicode_class_reverse_with_suffix(cls)
+ } else {
+ // When we want to shrink our NFA for reverse UTF-8 automata,
+ // we cannot feed UTF-8 sequences directly to the UTF-8
+ // compiler, since the UTF-8 compiler requires all sequences
+ // to be lexicographically sorted. Instead, we organize our
+ // sequences into a range trie, which can then output our
+ // sequences in the correct order. Unfortunately, building the
+ // range trie is fairly expensive (but not nearly as expensive
+ // as building a DFA). Hence the 'shrink' option, which allows
+ // this path to be toggled off. For example,
+ // we might want to turn this off if we know we won't be
+ // compiling a DFA.
+ let mut trie = self.trie_state.borrow_mut();
+ trie.clear();
+
+ for rng in cls.iter() {
+ for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
+ seq.reverse();
+ trie.insert(seq.as_slice());
+ }
+ }
+ let mut utf8_state = self.utf8_state.borrow_mut();
+ let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
+ trie.iter(|seq| {
+ utf8c.add(&seq)?;
+ Ok(())
+ })?;
+ utf8c.finish()
+ }
+ } else {
+ // In the forward direction, we always shrink our UTF-8 automata
+ // because we can stream it right into the UTF-8 compiler. There
+ // is almost no downside (in either memory or time) to using this
+ // approach.
+ let mut utf8_state = self.utf8_state.borrow_mut();
+ let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
+ for rng in cls.iter() {
+ for seq in Utf8Sequences::new(rng.start(), rng.end()) {
+ utf8c.add(seq.as_slice())?;
+ }
+ }
+ utf8c.finish()
+ }
+
+ // For reference, the code below is the "naive" version of compiling a
+ // UTF-8 automaton. It is deliciously simple (and works for both the
+ // forward and reverse cases), but will unfortunately produce very
+ // large NFAs. When compiling a forward automaton, the size difference
+ // can sometimes be an order of magnitude. For example, the '\w' regex
+ // will generate roughly 3000 NFA states using the naive approach below,
+ // but only 283 states when using the approach above. This is because
+ // the approach above actually compiles a *minimal* (or near minimal,
+ // because of the bounded hashmap for reusing equivalent states) UTF-8
+ // automaton.
+ //
+ // The code below is kept as a reference point in order to make it
+ // easier to understand the higher level goal here. Although, it will
+ // almost certainly bit-rot, so keep that in mind.
+ /*
+ let it = cls
+ .iter()
+ .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
+ .map(|seq| {
+ let it = seq
+ .as_slice()
+ .iter()
+ .map(|rng| self.c_range(rng.start, rng.end));
+ self.c_concat(it)
+ });
+ self.c_alternation(it)
+ */
+ }
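+
+    // For reference, `Utf8Sequences::new('\u{80}', '\u{7FF}')` yields the
+    // single sequence of byte ranges `[C2-DF][80-BF]`, and sequences are
+    // generated in lexicographic order. That ordering is what permits
+    // streaming them directly into the `Utf8Compiler` in the forward case
+    // above.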
+
+ fn c_unicode_class_reverse_with_suffix(
+ &self,
+ cls: &hir::ClassUnicode,
+ ) -> Result<ThompsonRef, Error> {
+ // N.B. It would likely be better to cache common *prefixes* in the
+ // reverse direction, but it's not quite clear how to do that. The
+ // advantage of caching suffixes is that it does give us a win, and
+ // has a very small additional overhead.
+ let mut cache = self.utf8_suffix.borrow_mut();
+ cache.clear();
+
+ let union = self.add_union()?;
+ let alt_end = self.add_empty()?;
+ for urng in cls.iter() {
+ for seq in Utf8Sequences::new(urng.start(), urng.end()) {
+ let mut end = alt_end;
+ for brng in seq.as_slice() {
+ let key = Utf8SuffixKey {
+ from: end,
+ start: brng.start,
+ end: brng.end,
+ };
+ let hash = cache.hash(&key);
+ if let Some(id) = cache.get(&key, hash) {
+ end = id;
+ continue;
+ }
+
+ let compiled = self.c_range(brng.start, brng.end)?;
+ self.patch(compiled.end, end)?;
+ end = compiled.start;
+ cache.set(key, hash, end);
+ }
+ self.patch(union, end)?;
+ }
+ }
+ Ok(ThompsonRef { start: union, end: alt_end })
+ }
+
+ fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> {
+ let look = match *anchor {
+ Anchor::StartLine => Look::StartLine,
+ Anchor::EndLine => Look::EndLine,
+ Anchor::StartText => Look::StartText,
+ Anchor::EndText => Look::EndText,
+ };
+ let id = self.add_look(look)?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ fn c_word_boundary(
+ &self,
+ wb: &WordBoundary,
+ ) -> Result<ThompsonRef, Error> {
+ let look = match *wb {
+ WordBoundary::Unicode => Look::WordBoundaryUnicode,
+ WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate,
+ WordBoundary::Ascii => Look::WordBoundaryAscii,
+ WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate,
+ };
+ let id = self.add_look(look)?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
+ let mut buf = [0; 4];
+ let it = ch
+ .encode_utf8(&mut buf)
+ .as_bytes()
+ .iter()
+ .map(|&b| self.c_range(b, b));
+ self.c_concat(it)
+ }
+
+ fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> {
+ let id = self.add_range(start, end)?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ fn c_empty(&self) -> Result<ThompsonRef, Error> {
+ let id = self.add_empty()?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> {
+ self.c_at_least(&Hir::any(false), false, 0)
+ }
+
+ fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> {
+ self.c_at_least(&Hir::any(true), false, 0)
+ }
+
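+ // Patch the dangling transition(s) of 'from' so that they point to
+ // 'to'. For a union state, this pushes a new alternate; for most other
+ // states, it fills in the single 'next' pointer that was left as
+ // StateID::ZERO when the state was created.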
+ fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
+ let old_memory_cstates = self.memory_cstates.get();
+ match self.states.borrow_mut()[from] {
+ CState::Empty { ref mut next } => {
+ *next = to;
+ }
+ CState::Range { ref mut range } => {
+ range.next = to;
+ }
+ CState::Sparse { .. } => {
+ panic!("cannot patch from a sparse NFA state")
+ }
+ CState::Look { ref mut next, .. } => {
+ *next = to;
+ }
+ CState::Union { ref mut alternates } => {
+ alternates.push(to);
+ self.memory_cstates
+ .set(old_memory_cstates + mem::size_of::<StateID>());
+ }
+ CState::UnionReverse { ref mut alternates } => {
+ alternates.push(to);
+ self.memory_cstates
+ .set(old_memory_cstates + mem::size_of::<StateID>());
+ }
+ CState::CaptureStart { ref mut next, .. } => {
+ *next = to;
+ }
+ CState::CaptureEnd { ref mut next, .. } => {
+ *next = to;
+ }
+ CState::Match { .. } => {}
+ }
+ if old_memory_cstates != self.memory_cstates.get() {
+ self.check_nfa_size_limit()?;
+ }
+ Ok(())
+ }
+
+ fn add_empty(&self) -> Result<StateID, Error> {
+ self.add_state(CState::Empty { next: StateID::ZERO })
+ }
+
+ fn add_capture_start(
+ &self,
+ capture_index: u32,
+ name: Option<Arc<str>>,
+ ) -> Result<StateID, Error> {
+ self.add_state(CState::CaptureStart {
+ next: StateID::ZERO,
+ capture_index,
+ name,
+ })
+ }
+
+ fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> {
+ self.add_state(CState::CaptureEnd {
+ next: StateID::ZERO,
+ capture_index,
+ })
+ }
+
+ fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> {
+ let trans = Transition { start, end, next: StateID::ZERO };
+ self.add_state(CState::Range { range: trans })
+ }
+
+ fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> {
+ if ranges.len() == 1 {
+ self.add_state(CState::Range { range: ranges[0] })
+ } else {
+ self.add_state(CState::Sparse { ranges })
+ }
+ }
+
+ fn add_look(&self, mut look: Look) -> Result<StateID, Error> {
+ if self.is_reverse() {
+ look = look.reversed();
+ }
+ self.add_state(CState::Look { look, next: StateID::ZERO })
+ }
+
+ fn add_union(&self) -> Result<StateID, Error> {
+ self.add_state(CState::Union { alternates: vec![] })
+ }
+
+ fn add_reverse_union(&self) -> Result<StateID, Error> {
+ self.add_state(CState::UnionReverse { alternates: vec![] })
+ }
+
+ fn add_match(
+ &self,
+ pattern_id: PatternID,
+ start_id: StateID,
+ ) -> Result<StateID, Error> {
+ self.add_state(CState::Match { pattern_id, start_id })
+ }
+
+ fn add_state(&self, state: CState) -> Result<StateID, Error> {
+ let mut states = self.states.borrow_mut();
+ let id = StateID::new(states.len())
+ .map_err(|_| Error::too_many_states(states.len()))?;
+ self.memory_cstates
+ .set(self.memory_cstates.get() + state.memory_usage());
+ states.push(state);
+ // If we don't explicitly drop this, then 'nfa_memory_usage' will also
+ // try to borrow it when we check the size limit and hit an error.
+ drop(states);
+ self.check_nfa_size_limit()?;
+ Ok(id)
+ }
+
+ fn is_reverse(&self) -> bool {
+ self.config.get_reverse()
+ }
+
+ /// If an NFA size limit was set, this checks that the NFA compiled so far
+ /// fits within that limit. If so, then nothing is returned. Otherwise, an
+ /// error is returned.
+ ///
+ /// This should be called after increasing the heap usage of the
+ /// intermediate NFA.
+ ///
+ /// Note that this borrows 'self.states', so callers should ensure there is
+ /// no mutable borrow of it outstanding.
+ fn check_nfa_size_limit(&self) -> Result<(), Error> {
+ if let Some(limit) = self.config.get_nfa_size_limit() {
+ if self.nfa_memory_usage() > limit {
+ return Err(Error::exceeded_size_limit(limit));
+ }
+ }
+ Ok(())
+ }
+
+ /// Returns the heap memory usage, in bytes, of the NFA compiled so far.
+ ///
+ /// Note that this is an approximation of how big the final NFA will be.
+ /// In practice, the final NFA will likely be a bit smaller since it uses
+ /// things like `Box<[T]>` instead of `Vec<T>`.
+ fn nfa_memory_usage(&self) -> usize {
+ self.states.borrow().len() * mem::size_of::<CState>()
+ + self.memory_cstates.get()
+ }
+}
+
+impl CState {
+ fn memory_usage(&self) -> usize {
+ match *self {
+ CState::Empty { .. }
+ | CState::Range { .. }
+ | CState::Look { .. }
+ | CState::CaptureStart { .. }
+ | CState::CaptureEnd { .. }
+ | CState::Match { .. } => 0,
+ CState::Sparse { ref ranges } => {
+ ranges.len() * mem::size_of::<Transition>()
+ }
+ CState::Union { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ CState::UnionReverse { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ }
+ }
+}
+
+#[derive(Debug)]
+struct Utf8Compiler<'a> {
+ nfac: &'a Compiler,
+ state: &'a mut Utf8State,
+ target: StateID,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8State {
+ compiled: Utf8BoundedMap,
+ uncompiled: Vec<Utf8Node>,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8Node {
+ trans: Vec<Transition>,
+ last: Option<Utf8LastTransition>,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8LastTransition {
+ start: u8,
+ end: u8,
+}
+
+impl Utf8State {
+ fn new() -> Utf8State {
+ Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
+ }
+
+ fn clear(&mut self) {
+ self.compiled.clear();
+ self.uncompiled.clear();
+ }
+}
+
+impl<'a> Utf8Compiler<'a> {
+ fn new(
+ nfac: &'a Compiler,
+ state: &'a mut Utf8State,
+ ) -> Result<Utf8Compiler<'a>, Error> {
+ let target = nfac.add_empty()?;
+ state.clear();
+ let mut utf8c = Utf8Compiler { nfac, state, target };
+ utf8c.add_empty();
+ Ok(utf8c)
+ }
+
+ fn finish(&mut self) -> Result<ThompsonRef, Error> {
+ self.compile_from(0)?;
+ let node = self.pop_root();
+ let start = self.compile(node)?;
+ Ok(ThompsonRef { start, end: self.target })
+ }
+
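+ // Add a sequence of UTF-8 byte ranges, reusing whatever prefix it
+ // shares with the sequence that was added immediately before it.
+ //
+ // A rough illustration: if [E2][84][A2] ('™') was just added, then
+ // adding [E2][98][83] ('☃') finds a shared prefix of length 1 (the
+ // leading E2), so only the suffix [98][83] produces new work.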
+ fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> {
+ let prefix_len = ranges
+ .iter()
+ .zip(&self.state.uncompiled)
+ .take_while(|&(range, node)| {
+ node.last.as_ref().map_or(false, |t| {
+ (t.start, t.end) == (range.start, range.end)
+ })
+ })
+ .count();
+ assert!(prefix_len < ranges.len());
+ self.compile_from(prefix_len)?;
+ self.add_suffix(&ranges[prefix_len..]);
+ Ok(())
+ }
+
+ fn compile_from(&mut self, from: usize) -> Result<(), Error> {
+ let mut next = self.target;
+ while from + 1 < self.state.uncompiled.len() {
+ let node = self.pop_freeze(next);
+ next = self.compile(node)?;
+ }
+ self.top_last_freeze(next);
+ Ok(())
+ }
+
+ fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> {
+ let hash = self.state.compiled.hash(&node);
+ if let Some(id) = self.state.compiled.get(&node, hash) {
+ return Ok(id);
+ }
+ let id = self.nfac.add_sparse(node.clone())?;
+ self.state.compiled.set(node, hash, id);
+ Ok(id)
+ }
+
+ fn add_suffix(&mut self, ranges: &[Utf8Range]) {
+ assert!(!ranges.is_empty());
+ let last = self
+ .state
+ .uncompiled
+ .len()
+ .checked_sub(1)
+ .expect("non-empty nodes");
+ assert!(self.state.uncompiled[last].last.is_none());
+ self.state.uncompiled[last].last = Some(Utf8LastTransition {
+ start: ranges[0].start,
+ end: ranges[0].end,
+ });
+ for r in &ranges[1..] {
+ self.state.uncompiled.push(Utf8Node {
+ trans: vec![],
+ last: Some(Utf8LastTransition { start: r.start, end: r.end }),
+ });
+ }
+ }
+
+ fn add_empty(&mut self) {
+ self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
+ }
+
+ fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
+ let mut uncompiled = self.state.uncompiled.pop().unwrap();
+ uncompiled.set_last_transition(next);
+ uncompiled.trans
+ }
+
+ fn pop_root(&mut self) -> Vec<Transition> {
+ assert_eq!(self.state.uncompiled.len(), 1);
+ assert!(self.state.uncompiled[0].last.is_none());
+ self.state.uncompiled.pop().expect("non-empty nodes").trans
+ }
+
+ fn top_last_freeze(&mut self, next: StateID) {
+ let last = self
+ .state
+ .uncompiled
+ .len()
+ .checked_sub(1)
+ .expect("non-empty nodes");
+ self.state.uncompiled[last].set_last_transition(next);
+ }
+}
+
+impl Utf8Node {
+ fn set_last_transition(&mut self, next: StateID) {
+ if let Some(last) = self.last.take() {
+ self.trans.push(Transition {
+ start: last.start,
+ end: last.end,
+ next,
+ });
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec::Vec;
+
+ use super::{
+ Builder, Config, PatternID, SparseTransitions, State, StateID,
+ Transition, NFA,
+ };
+
+ fn build(pattern: &str) -> NFA {
+ Builder::new()
+ .configure(Config::new().captures(false).unanchored_prefix(false))
+ .build(pattern)
+ .unwrap()
+ }
+
+ fn pid(id: usize) -> PatternID {
+ PatternID::new(id).unwrap()
+ }
+
+ fn sid(id: usize) -> StateID {
+ StateID::new(id).unwrap()
+ }
+
+ fn s_byte(byte: u8, next: usize) -> State {
+ let next = sid(next);
+ let trans = Transition { start: byte, end: byte, next };
+ State::Range { range: trans }
+ }
+
+ fn s_range(start: u8, end: u8, next: usize) -> State {
+ let next = sid(next);
+ let trans = Transition { start, end, next };
+ State::Range { range: trans }
+ }
+
+ fn s_sparse(ranges: &[(u8, u8, usize)]) -> State {
+ let ranges = ranges
+ .iter()
+ .map(|&(start, end, next)| Transition {
+ start,
+ end,
+ next: sid(next),
+ })
+ .collect();
+ State::Sparse(SparseTransitions { ranges })
+ }
+
+ fn s_union(alts: &[usize]) -> State {
+ State::Union {
+ alternates: alts
+ .iter()
+ .map(|&id| sid(id))
+ .collect::<Vec<StateID>>()
+ .into_boxed_slice(),
+ }
+ }
+
+ fn s_match(id: usize) -> State {
+ State::Match { id: pid(id) }
+ }
+
+ // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
+ // prefix.
+ #[test]
+ fn compile_unanchored_prefix() {
+ // When the machine can only match valid UTF-8.
+ let nfa = Builder::new()
+ .configure(Config::new().captures(false))
+ .build(r"a")
+ .unwrap();
+ // There should be many states since the `.` in `(?s:.)*?` matches any
+ // Unicode scalar value.
+ assert_eq!(11, nfa.len());
+ assert_eq!(nfa.states[10], s_match(0));
+ assert_eq!(nfa.states[9], s_byte(b'a', 10));
+
+ // When the machine can match through invalid UTF-8.
+ let nfa = Builder::new()
+ .configure(Config::new().captures(false).utf8(false))
+ .build(r"a")
+ .unwrap();
+ assert_eq!(
+ nfa.states,
+ &[
+ s_union(&[2, 1]),
+ s_range(0, 255, 0),
+ s_byte(b'a', 3),
+ s_match(0),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_empty() {
+ assert_eq!(build("").states, &[s_match(0),]);
+ }
+
+ #[test]
+ fn compile_literal() {
+ assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]);
+ assert_eq!(
+ build("ab").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
+ );
+ assert_eq!(
+ build("☃").states,
+ &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
+ );
+
+ // Check that non-UTF-8 literals work.
+ let nfa = Builder::new()
+ .configure(
+ Config::new()
+ .captures(false)
+ .utf8(false)
+ .unanchored_prefix(false),
+ )
+ .syntax(crate::SyntaxConfig::new().utf8(false))
+ .build(r"(?-u)\xFF")
+ .unwrap();
+ assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]);
+ }
+
+ #[test]
+ fn compile_class() {
+ assert_eq!(
+ build(r"[a-z]").states,
+ &[s_range(b'a', b'z', 1), s_match(0),]
+ );
+ assert_eq!(
+ build(r"[x-za-c]").states,
+ &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
+ );
+ assert_eq!(
+ build(r"[\u03B1-\u03B4]").states,
+ &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
+ );
+ assert_eq!(
+ build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
+ &[
+ s_range(0xB1, 0xB4, 5),
+ s_range(0x99, 0x9E, 5),
+ s_byte(0xA4, 1),
+ s_byte(0x9F, 2),
+ s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
+ s_match(0),
+ ]
+ );
+ assert_eq!(
+ build(r"[a-z☃]").states,
+ &[
+ s_byte(0x83, 3),
+ s_byte(0x98, 0),
+ s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
+ s_match(0),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_repetition() {
+ assert_eq!(
+ build(r"a?").states,
+ &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),]
+ );
+ assert_eq!(
+ build(r"a??").states,
+ &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),]
+ );
+ }
+
+ #[test]
+ fn compile_group() {
+ assert_eq!(
+ build(r"ab+").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)]
+ );
+ assert_eq!(
+ build(r"(ab)").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
+ );
+ assert_eq!(
+ build(r"(ab)+").states,
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)]
+ );
+ }
+
+ #[test]
+ fn compile_alternation() {
+ assert_eq!(
+ build(r"a|b").states,
+ &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)]
+ );
+ assert_eq!(
+ build(r"|b").states,
+ &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)]
+ );
+ assert_eq!(
+ build(r"a|").states,
+ &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)]
+ );
+ }
+
+ #[test]
+ fn many_start_pattern() {
+ let nfa = Builder::new()
+ .configure(Config::new().captures(false).unanchored_prefix(false))
+ .build_many(&["a", "b"])
+ .unwrap();
+ assert_eq!(
+ nfa.states,
+ &[
+ s_byte(b'a', 1),
+ s_match(0),
+ s_byte(b'b', 3),
+ s_match(1),
+ s_union(&[0, 2]),
+ ]
+ );
+ assert_eq!(nfa.start_anchored().as_usize(), 4);
+ assert_eq!(nfa.start_unanchored().as_usize(), 4);
+ // Test that the start states for each individual pattern are correct.
+ assert_eq!(nfa.start_pattern(pid(0)), sid(0));
+ assert_eq!(nfa.start_pattern(pid(1)), sid(2));
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/error.rs b/vendor/regex-automata/src/nfa/thompson/error.rs
new file mode 100644
index 000000000..52f02e888
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/error.rs
@@ -0,0 +1,145 @@
+use crate::util::id::{PatternID, StateID};
+
+/// An error that can occur during the construction of a Thompson NFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`regex_syntax::Error`] type from its `source`
+/// method via the `std::error::Error` trait. This error only occurs when using
+/// convenience routines for building an NFA directly from a pattern string.
+///
+/// Otherwise, errors typically occur when a limit has been breached. For
+/// example, if the total heap usage of the compiled NFA exceeds the limit
+/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
+/// building the NFA will fail.
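+///
+/// For example, a size limit error might be provoked as follows (a rough
+/// sketch; the exact limit needed depends on the pattern):
+///
+/// ```
+/// use regex_automata::nfa::thompson::{Builder, Config};
+///
+/// // A tiny limit that a big Unicode class can't possibly fit into.
+/// let result = Builder::new()
+///     .configure(Config::new().nfa_size_limit(Some(24)))
+///     .build(r"\w{50}");
+/// assert!(result.is_err());
+/// ```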
+#[derive(Clone, Debug)]
+pub struct Error {
+ kind: ErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a Thompson NFA.
+#[derive(Clone, Debug)]
+enum ErrorKind {
+ /// An error that occurred while parsing a regular expression. Note that
+ /// this error may be printed over multiple lines, and is generally
+ /// intended to be end user readable on its own.
+ Syntax(regex_syntax::Error),
+ /// An error that occurs if too many patterns were given to the NFA
+ /// compiler.
+ TooManyPatterns {
+ /// The number of patterns given, which exceeds the limit.
+ given: usize,
+ /// The limit on the number of patterns.
+ limit: usize,
+ },
+ /// An error that occurs if too many states are produced while building an NFA.
+ TooManyStates {
+ /// The minimum number of states that are desired, which exceeds the
+ /// limit.
+ given: usize,
+ /// The limit on the number of states.
+ limit: usize,
+ },
+ /// An error that occurs when NFA compilation exceeds a configured heap
+ /// limit.
+ ExceededSizeLimit {
+ /// The configured limit, in bytes.
+ limit: usize,
+ },
+ /// An error that occurs when an invalid capture group index is added to
+ /// the NFA. An "invalid" index can be one that is too big (e.g., results
+ /// in an integer overflow) or one that is discontinuous from previous
+ /// capture group indices added.
+ InvalidCaptureIndex {
+ /// The invalid index that was given.
+ index: usize,
+ },
+ /// An error that occurs when an NFA contains a Unicode word boundary, but
+ /// where the crate was compiled without the necessary data for dealing
+ /// with Unicode word boundaries.
+ UnicodeWordUnavailable,
+}
+
+impl Error {
+ fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
+ Error { kind: ErrorKind::Syntax(err) }
+ }
+
+ pub(crate) fn too_many_patterns(given: usize) -> Error {
+ let limit = PatternID::LIMIT;
+ Error { kind: ErrorKind::TooManyPatterns { given, limit } }
+ }
+
+ pub(crate) fn too_many_states(given: usize) -> Error {
+ let limit = StateID::LIMIT;
+ Error { kind: ErrorKind::TooManyStates { given, limit } }
+ }
+
+ pub(crate) fn exceeded_size_limit(limit: usize) -> Error {
+ Error { kind: ErrorKind::ExceededSizeLimit { limit } }
+ }
+
+ pub(crate) fn invalid_capture_index(index: usize) -> Error {
+ Error { kind: ErrorKind::InvalidCaptureIndex { index } }
+ }
+
+ pub(crate) fn unicode_word_unavailable() -> Error {
+ Error { kind: ErrorKind::UnicodeWordUnavailable }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match self.kind() {
+ ErrorKind::Syntax(ref err) => Some(err),
+ ErrorKind::TooManyPatterns { .. } => None,
+ ErrorKind::TooManyStates { .. } => None,
+ ErrorKind::ExceededSizeLimit { .. } => None,
+ ErrorKind::InvalidCaptureIndex { .. } => None,
+ ErrorKind::UnicodeWordUnavailable => None,
+ }
+ }
+}
+
+impl core::fmt::Display for Error {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind() {
+ ErrorKind::Syntax(_) => write!(f, "error parsing regex"),
+ ErrorKind::TooManyPatterns { given, limit } => write!(
+ f,
+ "attemped to compile {} patterns, \
+ which exceeds the limit of {}",
+ given, limit,
+ ),
+ ErrorKind::TooManyStates { given, limit } => write!(
+ f,
+ "attemped to compile {} NFA states, \
+ which exceeds the limit of {}",
+ given, limit,
+ ),
+ ErrorKind::ExceededSizeLimit { limit } => write!(
+ f,
+ "heap usage during NFA compilation exceeded limit of {}",
+ limit,
+ ),
+ ErrorKind::InvalidCaptureIndex { index } => write!(
+ f,
+ "capture group index {} is invalid (too big or discontinuous)",
+ index,
+ ),
+ ErrorKind::UnicodeWordUnavailable => write!(
+ f,
+ "crate has been compiled without Unicode word boundary \
+ support, but the NFA contains Unicode word boundary \
+ assertions",
+ ),
+ }
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs
index e636c0dd3..79ff63ca3 100644
--- a/vendor/regex-automata/src/nfa/map.rs
+++ b/vendor/regex-automata/src/nfa/thompson/map.rs
@@ -7,10 +7,10 @@
// These maps are used in some fairly hot code when generating NFA states for
// large Unicode character classes.
//
-// Instead of exposing a rich hashmap entry API, we just permit the caller
-// to produce a hash of the key directly. The hash can then be reused for both
-// lookups and insertions at the cost of leaking things a bit. But these are
-// for internal use only, so it's fine.
+// Instead of exposing a rich hashmap entry API, we just permit the caller to
+// produce a hash of the key directly. The hash can then be reused for both
+// lookups and insertions at the cost of leaking abstraction a bit. But these
+// are for internal use only, so it's fine.
//
// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
// (almost) minimal DFA for large Unicode character classes in linear time.
@@ -33,7 +33,9 @@
// could make one generic map, but the machinery didn't seem worth it. They
// are simple enough.
-use nfa::{StateID, Transition};
+use alloc::{vec, vec::Vec};
+
+use crate::{nfa::thompson::Transition, util::id::StateID};
// Basic FNV-1a hash constants as described in:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
@@ -57,7 +59,7 @@ const INIT: u64 = 14695981039346656037;
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
-/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
@@ -74,6 +76,9 @@ pub struct Utf8BoundedMap {
/// The current version of this map. Only entries with matching versions
/// are considered during lookups. If an entry is found with a mismatched
/// version, then the map behaves as if the entry does not exist.
+ ///
+ /// This makes it possible to clear the map by simply incrementing the
+ /// version number instead of actually deallocating any storage.
version: u16,
/// The total number of entries this map can store.
capacity: usize,
@@ -119,6 +124,9 @@ impl Utf8BoundedMap {
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
} else {
self.version = self.version.wrapping_add(1);
+ // If we loop back to version 0, then we forcefully clear the
+ // entire map. Otherwise, it might be possible to incorrectly
+ // match entries used to generate other NFAs.
if self.version == 0 {
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
}
@@ -131,7 +139,7 @@ impl Utf8BoundedMap {
for t in key {
h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
- h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
+ h = (h ^ (t.next.as_usize() as u64)).wrapping_mul(PRIME);
}
(h as usize) % self.map.len()
}
@@ -244,7 +252,7 @@ impl Utf8SuffixMap {
const INIT: u64 = 14695981039346656037;
let mut h = INIT;
- h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
+ h = (h ^ (key.from.as_usize() as u64)).wrapping_mul(PRIME);
h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
(h as usize) % self.map.len()
diff --git a/vendor/regex-automata/src/nfa/thompson/mod.rs b/vendor/regex-automata/src/nfa/thompson/mod.rs
new file mode 100644
index 000000000..88a438e8e
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/mod.rs
@@ -0,0 +1,1555 @@
+use core::{convert::TryFrom, fmt, mem, ops::Range};
+
+use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec};
+
+use crate::util::{
+ alphabet::{self, ByteClassSet},
+ decode_last_utf8, decode_utf8,
+ id::{IteratorIDExt, PatternID, PatternIDIter, StateID},
+ is_word_byte, is_word_char_fwd, is_word_char_rev,
+};
+
+pub use self::{
+ compiler::{Builder, Config},
+ error::Error,
+};
+
+mod compiler;
+mod error;
+mod map;
+pub mod pikevm;
+mod range_trie;
+
+/// A map from capture group name to its corresponding capture index.
+///
+/// Since there are always two slots for each capture index, the pair of slots
+/// corresponding to the capture index for a pattern ID of 0 are indexed at
+/// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`.
+///
+/// This type is actually wrapped inside a Vec indexed by pattern ID on the
+/// NFA, since multiple patterns may have the same capture group name.
+///
+/// Note that this is somewhat of a sub-optimal representation, since it
+/// requires a hashmap for each pattern. A better representation would be
+/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look
+/// up a capture index by name without producing an `Arc<str>`, which requires
+/// an allocation. To fix this, I think we'd need to define our own unsized
+/// type or something?
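+///
+/// As a worked example: in the pattern '(\w+) (?P<word>\w+)', the group
+/// 'word' has capture index 2, so for pattern ID 0 its start and end
+/// slots are at 2 * 2 = 4 and 2 * 2 + 1 = 5.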
+#[cfg(feature = "std")]
+type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>;
+#[cfg(not(feature = "std"))]
+type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>;
+
+// The NFA API below is not something I'm terribly proud of at the moment. In
+// particular, it supports both mutating the NFA and actually using the NFA to
+// perform a search. I think combining these two things muddies the waters a
+// bit too much.
+//
+// I think the issue is that I saw the compiler as the 'builder,' and where
+// the compiler had the ability to manipulate the internal state of the NFA.
+// However, one of my goals was to make it possible for others to build their
+// own NFAs in a way that is *not* coupled to the regex-syntax crate.
+//
+// So I think really, there should be an NFA, an NFABuilder and then the
+// internal compiler which uses the NFABuilder API to build an NFA. Alas, at
+// the time of writing, I kind of ran out of steam.
+
+/// A fully compiled Thompson NFA.
+///
+/// The states of the NFA are indexed by state IDs, which are how transitions
+/// are expressed.
+#[derive(Clone)]
+pub struct NFA {
+ /// The state list. This list is guaranteed to be indexable by all starting
+ /// state IDs, and it is also guaranteed to contain at most one `Match`
+ /// state for each pattern compiled into this NFA. (A pattern may not have
+ /// a corresponding `Match` state if a `Match` state is impossible to
+ /// reach.)
+ states: Vec<State>,
+ /// The anchored starting state of this NFA.
+ start_anchored: StateID,
+ /// The unanchored starting state of this NFA.
+ start_unanchored: StateID,
+ /// The starting states for each individual pattern. Starting at any
+ /// of these states will result in only an anchored search for the
+ /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+ /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+ /// are always equivalent.
+ start_pattern: Vec<StateID>,
+ /// A map from PatternID to its corresponding range of capture slots. Each
+ /// range is guaranteed to be contiguous with the previous range. The
+ /// end of the last range corresponds to the total number of slots needed
+ /// for this NFA.
+ patterns_to_slots: Vec<Range<usize>>,
+ /// A map from capture name to its corresponding index. So e.g., given
+ /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name
+ /// 'word' for pattern ID=0 would correspond to the index '3'. Its
+ /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'.
+ capture_name_to_index: Vec<CaptureNameMap>,
+ /// A map from pattern ID to capture group index to name, if one exists.
+ /// This is effectively the inverse of 'capture_name_to_index'. The outer
+ /// vec is indexed by pattern ID, while the inner vec is indexed by capture
+ /// index offset for the corresponding pattern.
+ ///
+ /// The first capture group for each pattern is always unnamed and is thus
+ /// always None.
+ capture_index_to_name: Vec<Vec<Option<Arc<str>>>>,
+ /// A representation of equivalence classes over the transitions in this
+ /// NFA. Two bytes in the same equivalence class must not discriminate
+ /// between a match or a non-match. This map can be used to shrink the
+ /// total size of a DFA's transition table with a small match-time cost.
+ ///
+ /// Note that the NFA's transitions are *not* defined in terms of these
+ /// equivalence classes. The NFA's transitions are defined on the original
+ /// byte values. For the most part, this is because they wouldn't really
+ /// help the NFA much since the NFA already uses a sparse representation
+ /// to represent transitions. Byte classes are most effective in a dense
+ /// representation.
+ byte_class_set: ByteClassSet,
+ /// Various facts about this NFA, which can be used to improve failure
+ /// modes (e.g., rejecting DFA construction if an NFA has Unicode word
+ /// boundaries) or for performing optimizations (avoiding an increase in
+ /// states if there are no look-around states).
+ facts: Facts,
+ /// Heap memory used indirectly by NFA states. Since each state might use a
+ /// different amount of heap, we need to keep track of this incrementally.
+ memory_states: usize,
+}
+
+impl NFA {
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Returns an NFA with no states. Its match semantics are unspecified.
+ ///
+ /// An empty NFA is useful as a starting point for building one. It is
+ /// itself not intended to be used for matching. For example, its starting
+ /// state identifiers are configured to be `0`, but since it has no states,
+ /// the identifiers are invalid.
+ ///
+ /// If you need an NFA that never matches anything and can be correctly
+ /// used for matching, use [`NFA::never_match`].
+ #[inline]
+ pub fn empty() -> NFA {
+ NFA {
+ states: vec![],
+ start_anchored: StateID::ZERO,
+ start_unanchored: StateID::ZERO,
+ start_pattern: vec![],
+ patterns_to_slots: vec![],
+ capture_name_to_index: vec![],
+ capture_index_to_name: vec![],
+ byte_class_set: ByteClassSet::empty(),
+ facts: Facts::default(),
+ memory_states: 0,
+ }
+ }
+
+ /// Returns an NFA with a single regex that always matches at every
+ /// position.
+ #[inline]
+ pub fn always_match() -> NFA {
+ let mut nfa = NFA::empty();
+ // Since we're only adding one pattern, these are guaranteed to work.
+ let start = nfa.add_match().unwrap();
+ assert_eq!(start.as_usize(), 0);
+ let pid = nfa.finish_pattern(start).unwrap();
+ assert_eq!(pid.as_usize(), 0);
+ nfa
+ }
+
+ /// Returns an NFA that never matches at any position. It contains no
+ /// regexes.
+ #[inline]
+ pub fn never_match() -> NFA {
+ let mut nfa = NFA::empty();
+ // Since we're only adding one state, this can never fail.
+ nfa.add_fail().unwrap();
+ nfa
+ }
+
+ /// Return the number of states in this NFA.
+ ///
+ /// This is guaranteed to be no bigger than [`StateID::LIMIT`].
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.states.len()
+ }
+
+ /// Returns the total number of distinct match states in this NFA.
+ /// Stated differently, this returns the total number of regex patterns
+ /// used to build this NFA.
+ ///
+ /// This may return zero if the NFA was constructed with no patterns. In
+ /// this case, and only this case, the NFA can never produce a match for
+ /// any input.
+ ///
+ /// This is guaranteed to be no bigger than [`PatternID::LIMIT`].
+ #[inline]
+ pub fn pattern_len(&self) -> usize {
+ self.start_pattern.len()
+ }
+
+ /// Returns the pattern ID of the pattern currently being compiled by this
+ /// NFA.
+ fn current_pattern_id(&self) -> PatternID {
+ // This always works because we never permit more patterns in
+ // 'start_pattern' than can be addressed by PatternID. Also, we only
+ // add a new entry to 'start_pattern' once we finish compiling a
+ // pattern. Thus, the length refers to the ID of the current pattern
+ // being compiled.
+ PatternID::new(self.start_pattern.len()).unwrap()
+ }
+
+ /// Returns the total number of capturing groups in this NFA.
+ ///
+ /// This includes the special 0th capture group that is always present and
+ /// captures the start and end offset of the entire match.
+ ///
+ /// This is a convenience routine for `nfa.capture_slot_len() / 2`.
+ #[inline]
+ pub fn capture_len(&self) -> usize {
+ let slots = self.capture_slot_len();
+ // This assert is guaranteed to pass since the NFA construction process
+ // guarantees that it is always true.
+ assert_eq!(slots % 2, 0, "capture slots must be divisible by 2");
+ slots / 2
+ }
+
+ /// Returns the total number of capturing slots in this NFA.
+ ///
+ /// This value is guaranteed to be a multiple of 2. (Where each capturing
+ /// group has precisely two capturing slots in the NFA.)
+ #[inline]
+ pub fn capture_slot_len(&self) -> usize {
+ self.patterns_to_slots.last().map_or(0, |r| r.end)
+ }
+
+ /// Return a range of capture slots for the given pattern.
+ ///
+ /// The range returned is guaranteed to be contiguous with ranges for
+ /// adjacent patterns.
+ ///
+ /// This panics if the given pattern ID is greater than or equal to the
+ /// number of patterns in this NFA.
+ #[inline]
+ pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> {
+ self.patterns_to_slots[pid].clone()
+ }
+
+ /// Return the capture group index corresponding to the given name in the
+ /// given pattern. If no such capture group name exists in the given
+ /// pattern, then this returns `None`.
+ ///
+ /// If the given pattern ID is invalid, then this panics.
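+ ///
+ /// A rough sketch of looking up a named group (this assumes the
+ /// `util::id` module is publicly reachable as written):
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::Builder;
+ /// use regex_automata::util::id::PatternID;
+ ///
+ /// let nfa = Builder::new().build(r"(?P<foo>a)").unwrap();
+ /// let pid = PatternID::new(0).unwrap();
+ /// assert_eq!(Some(1), nfa.capture_name_to_index(pid, "foo"));
+ /// assert_eq!(None, nfa.capture_name_to_index(pid, "bar"));
+ /// ```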
+ #[inline]
+ pub fn capture_name_to_index(
+ &self,
+ pid: PatternID,
+ name: &str,
+ ) -> Option<usize> {
+ assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID");
+ self.capture_name_to_index[pid].get(name).cloned()
+ }
+
+ // TODO: add iterators over capture group names.
+ // Do we also permit indexing?
+
+ /// Returns an iterator over all pattern IDs in this NFA.
+ #[inline]
+ pub fn patterns(&self) -> PatternIter {
+ PatternIter {
+ it: PatternID::iter(self.pattern_len()),
+ _marker: core::marker::PhantomData,
+ }
+ }
+
+ /// Return the ID of the initial anchored state of this NFA.
+ #[inline]
+ pub fn start_anchored(&self) -> StateID {
+ self.start_anchored
+ }
+
+ /// Set the anchored starting state ID for this NFA.
+ #[inline]
+ pub fn set_start_anchored(&mut self, id: StateID) {
+ self.start_anchored = id;
+ }
+
+ /// Return the ID of the initial unanchored state of this NFA.
+ #[inline]
+ pub fn start_unanchored(&self) -> StateID {
+ self.start_unanchored
+ }
+
+ /// Set the unanchored starting state ID for this NFA.
+ #[inline]
+ pub fn set_start_unanchored(&mut self, id: StateID) {
+ self.start_unanchored = id;
+ }
+
+ /// Return the ID of the initial anchored state for the given pattern.
+ ///
+ /// If the pattern doesn't exist in this NFA, then this panics.
+ #[inline]
+ pub fn start_pattern(&self, pid: PatternID) -> StateID {
+ self.start_pattern[pid]
+ }
+
+ /// Get the byte class set for this NFA.
+ #[inline]
+ pub fn byte_class_set(&self) -> &ByteClassSet {
+ &self.byte_class_set
+ }
+
+ /// Return a reference to the NFA state corresponding to the given ID.
+ #[inline]
+ pub fn state(&self, id: StateID) -> &State {
+ &self.states[id]
+ }
+
+ /// Returns a slice of all states in this NFA.
+ ///
+ /// The slice returned may be indexed by a `StateID` generated by one
+ /// of the `add_*` routines.
+ #[inline]
+ pub fn states(&self) -> &[State] {
+ &self.states
+ }
+
+ #[inline]
+ pub fn is_always_start_anchored(&self) -> bool {
+ self.start_anchored() == self.start_unanchored()
+ }
+
+ #[inline]
+ pub fn has_any_look(&self) -> bool {
+ self.facts.has_any_look()
+ }
+
+ #[inline]
+ pub fn has_any_anchor(&self) -> bool {
+ self.facts.has_any_anchor()
+ }
+
+ #[inline]
+ pub fn has_word_boundary(&self) -> bool {
+ self.has_word_boundary_unicode() || self.has_word_boundary_ascii()
+ }
+
+ #[inline]
+ pub fn has_word_boundary_unicode(&self) -> bool {
+ self.facts.has_word_boundary_unicode()
+ }
+
+ #[inline]
+ pub fn has_word_boundary_ascii(&self) -> bool {
+ self.facts.has_word_boundary_ascii()
+ }
+
+ /// Returns the memory usage, in bytes, of this NFA.
+ ///
+ /// This does **not** include the stack size used up by this NFA. To
+ /// compute that, use `std::mem::size_of::<NFA>()`.
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ self.states.len() * mem::size_of::<State>()
+ + self.memory_states
+ + self.start_pattern.len() * mem::size_of::<StateID>()
+ }
+
+ // Why do we define a bunch of 'add_*' routines below instead of just
+ // defining a single 'add' routine that accepts a 'State'? Indeed, for most
+ // of the 'add_*' routines below, such a simple API would be more than
+ // appropriate. Unfortunately, adding capture states and, to a lesser
+ // extent, match states, is a bit more complex. Namely, when we add a
+ // capture state, we *really* want to know the corresponding capture
+ // group's name and index and what not, so that we can update other state
+ // inside this NFA. But, e.g., the capture group name is not and should
+ // not be included in 'State::Capture'. So what are our choices?
+ //
+ // 1) Define one 'add' and require some additional optional parameters.
+ // This feels quite ugly, and adds unnecessary complexity to more common
+ // and simpler cases.
+ //
+ // 2) Do what we do below. The sad thing is that our API is bigger with
+ // more methods. But each method is very specific and hopefully simple.
+ //
+ // 3) Define a new enum, say, 'StateWithInfo', or something that permits
+ // providing both a State and some extra ancillary info in some cases. This
+ // doesn't seem too bad to me, but seems slightly worse than (2) because of
+ // the additional type required.
+ //
+ // 4) Abandon the idea that we have to specify things like the capture
+ // group name when we add the Capture state to the NFA. We would then need
+ // to add other methods that permit the caller to add this additional state
+ // "out of band." Other than it introducing some additional complexity, I
+ // decided against this because I wanted the NFA builder API to make it
+ // as hard as possible to build a bad or invalid NFA. Using the approach
+ // below, as you'll see, permits us to do a lot of strict checking of our
+ // inputs and return an error if we see something we don't expect.
+
+ pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> {
+ self.byte_class_set.set_range(range.start, range.end);
+ self.add_state(State::Range { range })
+ }
+
+ pub fn add_sparse(
+ &mut self,
+ sparse: SparseTransitions,
+ ) -> Result<StateID, Error> {
+ for range in sparse.ranges.iter() {
+ self.byte_class_set.set_range(range.start, range.end);
+ }
+ self.add_state(State::Sparse(sparse))
+ }
+
+ pub fn add_look(
+ &mut self,
+ next: StateID,
+ look: Look,
+ ) -> Result<StateID, Error> {
+ self.facts.set_has_any_look(true);
+ look.add_to_byteset(&mut self.byte_class_set);
+ match look {
+ Look::StartLine
+ | Look::EndLine
+ | Look::StartText
+ | Look::EndText => {
+ self.facts.set_has_any_anchor(true);
+ }
+ Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => {
+ self.facts.set_has_word_boundary_unicode(true);
+ }
+ Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => {
+ self.facts.set_has_word_boundary_ascii(true);
+ }
+ }
+ self.add_state(State::Look { look, next })
+ }
+
+ pub fn add_union(
+ &mut self,
+ alternates: Box<[StateID]>,
+ ) -> Result<StateID, Error> {
+ self.add_state(State::Union { alternates })
+ }
+
+ pub fn add_capture_start(
+ &mut self,
+ next_id: StateID,
+ capture_index: u32,
+ name: Option<Arc<str>>,
+ ) -> Result<StateID, Error> {
+ let pid = self.current_pattern_id();
+ let capture_index = match usize::try_from(capture_index) {
+ Err(_) => {
+ return Err(Error::invalid_capture_index(core::usize::MAX))
+ }
+ Ok(capture_index) => capture_index,
+ };
+ // Do arithmetic to find our absolute slot index first, to make sure
+ // the index is at least possibly valid (doesn't overflow).
+ let relative_slot = match capture_index.checked_mul(2) {
+ Some(relative_slot) => relative_slot,
+ None => return Err(Error::invalid_capture_index(capture_index)),
+ };
+ let slot = match relative_slot.checked_add(self.capture_slot_len()) {
+ Some(slot) => slot,
+ None => return Err(Error::invalid_capture_index(capture_index)),
+ };
+ // Make sure we have space to insert our (pid,index)|-->name mapping.
+ if pid.as_usize() >= self.capture_index_to_name.len() {
+ // Note that we require that if you're adding capturing groups,
+ // then there must be at least one capturing group per pattern.
+ // Moreover, whenever we expand our space here, it should always
+ // first be for the first capture group (at index==0).
+ if pid.as_usize() > self.capture_index_to_name.len()
+ || capture_index > 0
+ {
+ return Err(Error::invalid_capture_index(capture_index));
+ }
+ self.capture_name_to_index.push(CaptureNameMap::new());
+ self.capture_index_to_name.push(vec![]);
+ }
+ if capture_index >= self.capture_index_to_name[pid].len() {
+ // We require that capturing groups are added in correspondence
+ // to their index. So no discontinuous indices. This is likely
+ // overly strict, but also makes it simpler to provide guarantees
+ // about our capturing group data.
+ if capture_index > self.capture_index_to_name[pid].len() {
+ return Err(Error::invalid_capture_index(capture_index));
+ }
+ self.capture_index_to_name[pid].push(None);
+ }
+ if let Some(ref name) = name {
+ self.capture_name_to_index[pid]
+ .insert(Arc::clone(name), capture_index);
+ }
+ self.capture_index_to_name[pid][capture_index] = name;
+ self.add_state(State::Capture { next: next_id, slot })
+ }
+
+ pub fn add_capture_end(
+ &mut self,
+ next_id: StateID,
+ capture_index: u32,
+ ) -> Result<StateID, Error> {
+ let pid = self.current_pattern_id();
+ let capture_index = match usize::try_from(capture_index) {
+ Err(_) => {
+ return Err(Error::invalid_capture_index(core::usize::MAX))
+ }
+ Ok(capture_index) => capture_index,
+ };
+ // If we haven't already added this capture group via a corresponding
+ // 'add_capture_start' call, then we consider the index given to be
+ // invalid.
+ if pid.as_usize() >= self.capture_index_to_name.len()
+ || capture_index >= self.capture_index_to_name[pid].len()
+ {
+ return Err(Error::invalid_capture_index(capture_index));
+ }
+ // Since we've already confirmed that this capture index is valid
+ // and has a corresponding starting slot, we know the multiplication
+ // has already been done and succeeded.
+ let relative_slot_start = capture_index.checked_mul(2).unwrap();
+ let relative_slot = match relative_slot_start.checked_add(1) {
+ Some(relative_slot) => relative_slot,
+ None => return Err(Error::invalid_capture_index(capture_index)),
+ };
+ let slot = match relative_slot.checked_add(self.capture_slot_len()) {
+ Some(slot) => slot,
+ None => return Err(Error::invalid_capture_index(capture_index)),
+ };
+ self.add_state(State::Capture { next: next_id, slot })
+ }
+
+ pub fn add_fail(&mut self) -> Result<StateID, Error> {
+ self.add_state(State::Fail)
+ }
+
+ /// Add a new match state to this NFA and return its state ID.
+ pub fn add_match(&mut self) -> Result<StateID, Error> {
+ let pattern_id = self.current_pattern_id();
+ let sid = self.add_state(State::Match { id: pattern_id })?;
+ Ok(sid)
+ }
+
+ /// Finish compiling the current pattern and return its identifier. The
+ /// given ID should be the state ID corresponding to the anchored starting
+ /// state for matching this pattern.
+ pub fn finish_pattern(
+ &mut self,
+ start_id: StateID,
+ ) -> Result<PatternID, Error> {
+ // We've gotta make sure that we never permit the user to add more
+ // patterns than we can identify. So if we're already at the limit,
+ // then return an error. This is somewhat non-ideal since this won't
+ // result in an error until trying to complete the compilation of a
+ // pattern instead of starting it.
+ if self.start_pattern.len() >= PatternID::LIMIT {
+ return Err(Error::too_many_patterns(
+ self.start_pattern.len().saturating_add(1),
+ ));
+ }
+ let pid = self.current_pattern_id();
+ self.start_pattern.push(start_id);
+ // Add the number of new slots created by this pattern. This is always
+ // equivalent to '2 * caps.len()', where 'caps.len()' is the number of
+ // new capturing groups introduced by the pattern we're finishing.
+ let new_cap_groups = self
+ .capture_index_to_name
+ .get(pid.as_usize())
+ .map_or(0, |caps| caps.len());
+ let new_slots = match new_cap_groups.checked_mul(2) {
+ Some(new_slots) => new_slots,
+ None => {
+ // Just return the biggest index that we know exists.
+ let index = new_cap_groups.saturating_sub(1);
+ return Err(Error::invalid_capture_index(index));
+ }
+ };
+ let slot_start = self.capture_slot_len();
+ self.patterns_to_slots.push(slot_start..(slot_start + new_slots));
+ Ok(pid)
+ }
+
+ fn add_state(&mut self, state: State) -> Result<StateID, Error> {
+ let id = StateID::new(self.states.len())
+ .map_err(|_| Error::too_many_states(self.states.len()))?;
+ self.memory_states += state.memory_usage();
+ self.states.push(state);
+ Ok(id)
+ }
+
+ /// Remap the transitions in every state of this NFA using the given map.
+ /// The given map should be indexed according to state ID namespace used by
+ /// the transitions of the states currently in this NFA.
+ ///
+ /// This may be used during the final phases of an NFA compiler, which
+ /// turns its intermediate NFA into the final NFA. Remapping may be
+ /// required to bring the state pointers from the intermediate NFA to the
+ /// final NFA.
+ pub fn remap(&mut self, old_to_new: &[StateID]) {
+ for state in &mut self.states {
+ state.remap(old_to_new);
+ }
+ self.start_anchored = old_to_new[self.start_anchored];
+ self.start_unanchored = old_to_new[self.start_unanchored];
+ for (_, id) in self.start_pattern.iter_mut().with_pattern_ids() {
+ *id = old_to_new[*id];
+ }
+ }
+
+ /// Clear this NFA such that it has zero states and is otherwise "empty."
+ ///
+ /// An empty NFA is useful as a starting point for building one. It is
+ /// itself not intended to be used for matching. For example, its starting
+ /// state identifiers are configured to be `0`, but since it has no states,
+ /// the identifiers are invalid.
+ pub fn clear(&mut self) {
+ self.states.clear();
+ self.start_anchored = StateID::ZERO;
+ self.start_unanchored = StateID::ZERO;
+ self.start_pattern.clear();
+ self.patterns_to_slots.clear();
+ self.capture_name_to_index.clear();
+ self.capture_index_to_name.clear();
+ self.byte_class_set = ByteClassSet::empty();
+ self.facts = Facts::default();
+ self.memory_states = 0;
+ }
+}
+
+impl fmt::Debug for NFA {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "thompson::NFA(")?;
+ for (sid, state) in self.states.iter().with_state_ids() {
+ let status = if sid == self.start_anchored {
+ '^'
+ } else if sid == self.start_unanchored {
+ '>'
+ } else {
+ ' '
+ };
+ writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
+ }
+ if self.pattern_len() > 1 {
+ writeln!(f, "")?;
+ for pid in self.patterns() {
+ let sid = self.start_pattern(pid);
+ writeln!(
+ f,
+ "START({:06?}): {:?}",
+ pid.as_usize(),
+ sid.as_usize()
+ )?;
+ }
+ }
+ writeln!(f, "")?;
+ writeln!(
+ f,
+ "transition equivalence classes: {:?}",
+ self.byte_class_set().byte_classes()
+ )?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// A state in a final compiled NFA.
+#[derive(Clone, Eq, PartialEq)]
+pub enum State {
+ /// A state that transitions to `next` if and only if the current input
+ /// byte is in the range `[start, end]` (inclusive).
+ ///
+ /// This is a special case of Sparse in that it encodes only one transition
+ /// (and therefore avoids the allocation).
+ Range { range: Transition },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions are ordered lexicographically by input range. As
+ /// such, this may only be used when every transition has equal priority.
+ /// (In practice, this is only used for encoding UTF-8 automata.)
+ Sparse(SparseTransitions),
+ /// A conditional epsilon transition satisfied via some sort of
+ /// look-around.
+ Look { look: Look, next: StateID },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union { alternates: Box<[StateID]> },
+ /// An empty state that records a capture location.
+ ///
+ /// From the perspective of finite automata, this is precisely equivalent
+ /// to an epsilon transition, but serves the purpose of instructing NFA
+ /// simulations to record additional state when the finite state machine
+ /// passes through this epsilon transition.
+ ///
+ /// These transitions are treated as epsilon transitions with no additional
+ /// effects in DFAs.
+ ///
+ /// 'slot' in this context refers to the specific capture group offset that
+ /// is being recorded. Each capturing group has two slots corresponding to
+ /// the start and end of the matching portion of that group.
+ Capture { next: StateID, slot: usize },
+ /// A state that cannot be transitioned out of. If a search reaches this
+ /// state, then no match is possible and the search should terminate.
+ Fail,
+ /// A match state. There is exactly one such occurrence of this state for
+ /// each regex compiled into the NFA.
+ Match { id: PatternID },
+}
+
+impl State {
+ /// Returns true if and only if this state contains one or more epsilon
+ /// transitions.
+ #[inline]
+ pub fn is_epsilon(&self) -> bool {
+ match *self {
+ State::Range { .. }
+ | State::Sparse { .. }
+ | State::Fail
+ | State::Match { .. } => false,
+ State::Look { .. }
+ | State::Union { .. }
+ | State::Capture { .. } => true,
+ }
+ }
+
+ /// Returns the heap memory usage of this NFA state in bytes.
+ fn memory_usage(&self) -> usize {
+ match *self {
+ State::Range { .. }
+ | State::Look { .. }
+ | State::Capture { .. }
+ | State::Match { .. }
+ | State::Fail => 0,
+ State::Sparse(SparseTransitions { ref ranges }) => {
+ ranges.len() * mem::size_of::<Transition>()
+ }
+ State::Union { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ }
+ }
+
+ /// Remap the transitions in this state using the given map. Namely, the
+ /// given map should be indexed according to the transitions currently
+ /// in this state.
+ ///
+ /// This is used during the final phase of the NFA compiler, which turns
+ /// its intermediate NFA into the final NFA.
+ fn remap(&mut self, remap: &[StateID]) {
+ match *self {
+ State::Range { ref mut range } => range.next = remap[range.next],
+ State::Sparse(SparseTransitions { ref mut ranges }) => {
+ for r in ranges.iter_mut() {
+ r.next = remap[r.next];
+ }
+ }
+ State::Look { ref mut next, .. } => *next = remap[*next],
+ State::Union { ref mut alternates } => {
+ for alt in alternates.iter_mut() {
+ *alt = remap[*alt];
+ }
+ }
+ State::Capture { ref mut next, .. } => *next = remap[*next],
+ State::Fail => {}
+ State::Match { .. } => {}
+ }
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ State::Range { ref range } => range.fmt(f),
+ State::Sparse(SparseTransitions { ref ranges }) => {
+ let rs = ranges
+ .iter()
+ .map(|t| format!("{:?}", t))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "sparse({})", rs)
+ }
+ State::Look { ref look, next } => {
+ write!(f, "{:?} => {:?}", look, next.as_usize())
+ }
+ State::Union { ref alternates } => {
+ let alts = alternates
+ .iter()
+ .map(|id| format!("{:?}", id.as_usize()))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "alt({})", alts)
+ }
+ State::Capture { next, slot } => {
+ write!(f, "capture({:?}) => {:?}", slot, next.as_usize())
+ }
+ State::Fail => write!(f, "FAIL"),
+ State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()),
+ }
+ }
+}
+
+/// A collection of facts about an NFA.
+///
+/// There are no real cohesive principles behind what gets put in here. For
+/// the most part, it is implementation driven.
+#[derive(Clone, Copy, Debug, Default)]
+struct Facts {
+ /// Various yes/no facts about this NFA.
+ bools: u16,
+}
+
+impl Facts {
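+ // Each 'define_bool!' invocation below presumably expands to a getter
+ // and a setter for the named fact, stored at the given bit position in
+ // the 'bools' bitfield.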
+ define_bool!(0, has_any_look, set_has_any_look);
+ define_bool!(1, has_any_anchor, set_has_any_anchor);
+ define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode);
+ define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii);
+}
+
+/// A sequence of transitions used to represent a sparse state.
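+///
+/// A rough construction sketch (assuming `util::id` is publicly
+/// reachable as written):
+///
+/// ```
+/// use regex_automata::nfa::thompson::{SparseTransitions, Transition};
+/// use regex_automata::util::id::StateID;
+///
+/// let next = StateID::new(1).unwrap();
+/// let sparse = SparseTransitions {
+///     ranges: vec![
+///         Transition { start: b'a', end: b'c', next },
+///         Transition { start: b'x', end: b'z', next },
+///     ].into_boxed_slice(),
+/// };
+/// assert_eq!(Some(next), sparse.matches_byte(b'y'));
+/// assert_eq!(None, sparse.matches_byte(b'm'));
+/// ```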
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SparseTransitions {
+ pub ranges: Box<[Transition]>,
+}
+
+impl SparseTransitions {
+ pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
+ haystack.get(at).and_then(|&b| self.matches_byte(b))
+ }
+
+ pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> {
+ unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
+ }
+
+ pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
+ for t in self.ranges.iter() {
+ if t.start > byte {
+ break;
+ } else if t.matches_byte(byte) {
+ return Some(t.next);
+ }
+ }
+ None
+
+ /*
+ // This is an alternative implementation that uses binary search. In
+ // some ad hoc experiments, like
+ //
+ // smallishru=OpenSubtitles2018.raw.sample.smallish.ru
+ // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
+ //
+ // I could not observe any improvement, and in fact, things seemed to
+ // be a bit slower.
+ self.ranges
+ .binary_search_by(|t| {
+ if t.end < byte {
+ core::cmp::Ordering::Less
+ } else if t.start > byte {
+ core::cmp::Ordering::Greater
+ } else {
+ core::cmp::Ordering::Equal
+ }
+ })
+ .ok()
+ .map(|i| self.ranges[i].next)
+ */
+ }
+}
+
+/// A transition to another state, only if the given byte falls in the
+/// inclusive range specified.
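+///
+/// A small illustrative sketch:
+///
+/// ```
+/// use regex_automata::nfa::thompson::Transition;
+/// use regex_automata::util::id::StateID;
+///
+/// let next = StateID::new(0).unwrap();
+/// let t = Transition { start: b'a', end: b'z', next };
+/// assert!(t.matches_byte(b'm'));
+/// assert!(!t.matches_byte(b'A'));
+/// ```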
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+ pub start: u8,
+ pub end: u8,
+ pub next: StateID,
+}
+
+impl Transition {
+ pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
+ haystack.get(at).map_or(false, |&b| self.matches_byte(b))
+ }
+
+ pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
+ unit.as_u8().map_or(false, |byte| self.matches_byte(byte))
+ }
+
+ pub fn matches_byte(&self, byte: u8) -> bool {
+ self.start <= byte && byte <= self.end
+ }
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use crate::util::DebugByte;
+
+ let Transition { start, end, next } = *self;
+ if self.start == self.end {
+ write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next.as_usize(),
+ )
+ }
+ }
+}
+
+/// A conditional NFA epsilon transition.
+///
+/// A simulation of the NFA can only move through this epsilon transition if
+/// the current position satisfies some look-around property. Some assertions
+/// are look-behind (StartLine, StartText), some assertions are look-ahead
+/// (EndLine, EndText) while other assertions are both look-behind and
+/// look-ahead (WordBoundary*).
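+///
+/// A rough illustration of the simpler assertions:
+///
+/// ```
+/// use regex_automata::nfa::thompson::Look;
+///
+/// let haystack = b"abc\ndef";
+/// assert!(Look::StartText.matches(haystack, 0));
+/// assert!(Look::StartLine.matches(haystack, 4));
+/// assert!(Look::EndLine.matches(haystack, 3));
+/// assert!(Look::EndText.matches(haystack, 7));
+/// ```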
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Look {
+ /// The previous position is either `\n` or the current position is the
+ /// beginning of the haystack (i.e., at position `0`).
+ StartLine = 1 << 0,
+ /// The next position is either `\n` or the current position is the end of
+ /// the haystack (i.e., at position `haystack.len()`).
+ EndLine = 1 << 1,
+ /// The current position is the beginning of the haystack (i.e., at
+ /// position `0`).
+ StartText = 1 << 2,
+ /// The current position is the end of the haystack (i.e., at position
+ /// `haystack.len()`).
+ EndText = 1 << 3,
+ /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])`
+ /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only
+ /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if
+ /// `i=haystack.len()`, then `is_word(n)=false`.
+ WordBoundaryUnicode = 1 << 4,
+ /// Same as for `WordBoundaryUnicode`, but requires that
+ /// `is_word(p) == is_word(n)`.
+ WordBoundaryUnicodeNegate = 1 << 5,
+ /// When tested at position `i`, where `p=haystack[i-1]` and
+ /// `n=haystack[i]`, this assertion passes if and only if `is_word(p)
+ /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if
+ /// `i=haystack.len()`, then `is_word(n)=false`.
+ WordBoundaryAscii = 1 << 6,
+ /// Same as for `WordBoundaryAscii`, but requires that
+ /// `is_word(p) == is_word(n)`.
+ ///
+ /// Note that it is possible for this assertion to match at positions that
+ /// split the UTF-8 encoding of a codepoint. For this reason, this may only
+ /// be used when UTF-8 mode is disabled in the regex syntax.
+ WordBoundaryAsciiNegate = 1 << 7,
+}
+
+impl Look {
+ #[inline(always)]
+ pub fn matches(&self, bytes: &[u8], at: usize) -> bool {
+ match *self {
+ Look::StartLine => at == 0 || bytes[at - 1] == b'\n',
+ Look::EndLine => at == bytes.len() || bytes[at] == b'\n',
+ Look::StartText => at == 0,
+ Look::EndText => at == bytes.len(),
+ Look::WordBoundaryUnicode => {
+ let word_before = is_word_char_rev(bytes, at);
+ let word_after = is_word_char_fwd(bytes, at);
+ word_before != word_after
+ }
+ Look::WordBoundaryUnicodeNegate => {
+ // This is pretty subtle. Why do we need to do UTF-8 decoding
+ // here? Well... at time of writing, the is_word_char_{fwd,rev}
+ // routines will only return true if there is a valid UTF-8
+ // encoding of a "word" codepoint, and false in every other
+ // case (including invalid UTF-8). This means that in regions
+ // of invalid UTF-8 (which might be a subset of valid UTF-8!),
+ // it would result in \B matching. While this would be
+ // questionable in the context of truly invalid UTF-8, it is
+ // *certainly* wrong to report match boundaries that split the
+ // encoding of a codepoint. So to work around this, we ensure
+ // that we can decode a codepoint on either side of `at`. If
+ // either direction fails, then we don't permit \B to match at
+ // all.
+ //
+ // Now, this isn't exactly optimal from a perf perspective. We
+ // could try and detect this in is_word_char_{fwd,rev}, but
+ // it's not clear if it's worth it. \B is, after all, rarely
+ // used.
+ //
+ // And in particular, we do *not* have to do this with \b,
+ // because \b *requires* that at least one side of `at` be a
+ // "word" codepoint, which in turn implies one side of `at`
+ // must be valid UTF-8. This in turn implies that \b can never
+ // split a valid UTF-8 encoding of a codepoint. In the case
+ // where one side of `at` is truly invalid UTF-8 and the other
+ // side IS a word codepoint, then we want \b to match since it
+ // represents a valid UTF-8 boundary. It also makes sense. For
+ // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
+ let word_before = at > 0
+ && match decode_last_utf8(&bytes[..at]) {
+ None | Some(Err(_)) => return false,
+ Some(Ok(_)) => is_word_char_rev(bytes, at),
+ };
+ let word_after = at < bytes.len()
+ && match decode_utf8(&bytes[at..]) {
+ None | Some(Err(_)) => return false,
+ Some(Ok(_)) => is_word_char_fwd(bytes, at),
+ };
+ word_before == word_after
+ }
+ Look::WordBoundaryAscii => {
+ let word_before = at > 0 && is_word_byte(bytes[at - 1]);
+ let word_after = at < bytes.len() && is_word_byte(bytes[at]);
+ word_before != word_after
+ }
+ Look::WordBoundaryAsciiNegate => {
+ let word_before = at > 0 && is_word_byte(bytes[at - 1]);
+ let word_after = at < bytes.len() && is_word_byte(bytes[at]);
+ word_before == word_after
+ }
+ }
+ }
+
+ /// Create a look-around assertion from its corresponding integer (as
+ /// defined in `Look`). If the given integer does not correspond to any
+ /// assertion, then None is returned.
+ fn from_int(n: u8) -> Option<Look> {
+ match n {
+ 0b0000_0001 => Some(Look::StartLine),
+ 0b0000_0010 => Some(Look::EndLine),
+ 0b0000_0100 => Some(Look::StartText),
+ 0b0000_1000 => Some(Look::EndText),
+ 0b0001_0000 => Some(Look::WordBoundaryUnicode),
+ 0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate),
+ 0b0100_0000 => Some(Look::WordBoundaryAscii),
+ 0b1000_0000 => Some(Look::WordBoundaryAsciiNegate),
+ _ => None,
+ }
+ }
+
+ /// Flip the look-around assertion to its equivalent for reverse searches.
+ fn reversed(&self) -> Look {
+ match *self {
+ Look::StartLine => Look::EndLine,
+ Look::EndLine => Look::StartLine,
+ Look::StartText => Look::EndText,
+ Look::EndText => Look::StartText,
+ Look::WordBoundaryUnicode => Look::WordBoundaryUnicode,
+ Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate,
+ Look::WordBoundaryAscii => Look::WordBoundaryAscii,
+ Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate,
+ }
+ }
+
+ /// Split up the given byte classes into equivalence classes in a way that
+ /// is consistent with this look-around assertion.
+ fn add_to_byteset(&self, set: &mut ByteClassSet) {
+ match *self {
+ Look::StartText | Look::EndText => {}
+ Look::StartLine | Look::EndLine => {
+ set.set_range(b'\n', b'\n');
+ }
+ Look::WordBoundaryUnicode
+ | Look::WordBoundaryUnicodeNegate
+ | Look::WordBoundaryAscii
+ | Look::WordBoundaryAsciiNegate => {
+ // We need to mark all ranges of bytes whose pairs result in
+ // evaluating \b differently. This isn't technically correct
+ // for Unicode word boundaries, but DFAs can't handle those
+ // anyway, and thus, the byte classes don't need to either
+ // since they are themselves only used in DFAs.
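+ // For example, the ASCII digits 0x30-0x39 ('0'-'9') are all
+ // word bytes and so collapse into a single range, while the run
+ // of non-word bytes immediately before them ends at 0x2F ('/').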
+ let iswb = regex_syntax::is_word_byte;
+ let mut b1: u16 = 0;
+ let mut b2: u16;
+ while b1 <= 255 {
+ b2 = b1 + 1;
+ while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
+ b2 += 1;
+ }
+ set.set_range(b1 as u8, (b2 - 1) as u8);
+ b1 = b2;
+ }
+ }
+ }
+ }
+}
+
+/// LookSet is a memory-efficient set of look-around assertions. Callers may
+/// idempotently insert or remove any look-around assertion from a set.
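+///
+/// Each assertion occupies one bit of the inner `u8`, using the
+/// discriminants defined on `Look`. For example, the set containing
+/// `StartLine` (`1 << 0`) and `EndText` (`1 << 3`) is represented by the
+/// single byte `0b0000_1001`.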
+#[repr(transparent)]
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub(crate) struct LookSet {
+ set: u8,
+}
+
+impl LookSet {
+ /// Return a LookSet from its representation.
+ pub(crate) fn from_repr(repr: u8) -> LookSet {
+ LookSet { set: repr }
+ }
+
+ /// Return a mutable LookSet from a mutable reference to its representation.
+ pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet {
+ // SAFETY: This is safe since a LookSet is repr(transparent) where its
+ // repr is a u8.
+ unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) }
+ }
+
+ /// Return true if and only if this set is empty.
+ pub(crate) fn is_empty(&self) -> bool {
+ self.set == 0
+ }
+
+ /// Clears this set such that it has no assertions in it.
+ pub(crate) fn clear(&mut self) {
+ self.set = 0;
+ }
+
+ /// Insert the given look-around assertion into this set. If the assertion
+ /// already exists, then this is a no-op.
+ pub(crate) fn insert(&mut self, look: Look) {
+ self.set |= look as u8;
+ }
+
+ /// Remove the given look-around assertion from this set. If the assertion
+ /// is not in this set, then this is a no-op.
+ #[cfg(test)]
+ pub(crate) fn remove(&mut self, look: Look) {
+ self.set &= !(look as u8);
+ }
+
+ /// Return true if and only if the given assertion is in this set.
+ pub(crate) fn contains(&self, look: Look) -> bool {
+ (look as u8) & self.set != 0
+ }
+
+ /// Subtract the given `other` set from the `self` set and return a new
+ /// set.
+ pub(crate) fn subtract(&self, other: LookSet) -> LookSet {
+ LookSet { set: self.set & !other.set }
+ }
+
+ /// Return the intersection of the `self` set with the given `other` set.
+ pub(crate) fn intersect(&self, other: LookSet) -> LookSet {
+ LookSet { set: self.set & other.set }
+ }
+}
+
+impl core::fmt::Debug for LookSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut members = vec![];
+ for i in 0..8 {
+ let look = match Look::from_int(1 << i) {
+ None => continue,
+ Some(look) => look,
+ };
+ if self.contains(look) {
+ members.push(look);
+ }
+ }
+ f.debug_tuple("LookSet").field(&members).finish()
+ }
+}
+
+/// An iterator over all pattern IDs in an NFA.
+pub struct PatternIter<'a> {
+ it: PatternIDIter,
+ /// We explicitly associate a lifetime with this iterator even though we
+ /// don't actually borrow anything from the NFA. We do this for backward
+ /// compatibility purposes. If we ever do need to borrow something from
+ /// the NFA, then we can do so by replacing this marker with a real
+ /// borrow, without breaking the public API.
+ _marker: core::marker::PhantomData<&'a ()>,
+}
+
+impl<'a> Iterator for PatternIter<'a> {
+ type Item = PatternID;
+
+ fn next(&mut self) -> Option<PatternID> {
+ self.it.next()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ // TODO: Replace tests using DFA with NFA matching engine once implemented.
+ use crate::dfa::{dense, Automaton};
+
+ #[test]
+ fn always_match() {
+ let nfa = NFA::always_match();
+ let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap();
+ let find = |input, start, end| {
+ dfa.find_leftmost_fwd_at(None, None, input, start, end)
+ .unwrap()
+ .map(|m| m.offset())
+ };
+
+ assert_eq!(Some(0), find(b"", 0, 0));
+ assert_eq!(Some(0), find(b"a", 0, 1));
+ assert_eq!(Some(1), find(b"a", 1, 1));
+ assert_eq!(Some(0), find(b"ab", 0, 2));
+ assert_eq!(Some(1), find(b"ab", 1, 2));
+ assert_eq!(Some(2), find(b"ab", 2, 2));
+ }
+
+ #[test]
+ fn never_match() {
+ let nfa = NFA::never_match();
+ let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap();
+ let find = |input, start, end| {
+ dfa.find_leftmost_fwd_at(None, None, input, start, end)
+ .unwrap()
+ .map(|m| m.offset())
+ };
+
+ assert_eq!(None, find(b"", 0, 0));
+ assert_eq!(None, find(b"a", 0, 1));
+ assert_eq!(None, find(b"a", 1, 1));
+ assert_eq!(None, find(b"ab", 0, 2));
+ assert_eq!(None, find(b"ab", 1, 2));
+ assert_eq!(None, find(b"ab", 2, 2));
+ }
+
+ #[test]
+ fn look_set() {
+ let mut f = LookSet::default();
+ assert!(!f.contains(Look::StartText));
+ assert!(!f.contains(Look::EndText));
+ assert!(!f.contains(Look::StartLine));
+ assert!(!f.contains(Look::EndLine));
+ assert!(!f.contains(Look::WordBoundaryUnicode));
+ assert!(!f.contains(Look::WordBoundaryUnicodeNegate));
+ assert!(!f.contains(Look::WordBoundaryAscii));
+ assert!(!f.contains(Look::WordBoundaryAsciiNegate));
+
+ f.insert(Look::StartText);
+ assert!(f.contains(Look::StartText));
+ f.remove(Look::StartText);
+ assert!(!f.contains(Look::StartText));
+
+ f.insert(Look::EndText);
+ assert!(f.contains(Look::EndText));
+ f.remove(Look::EndText);
+ assert!(!f.contains(Look::EndText));
+
+ f.insert(Look::StartLine);
+ assert!(f.contains(Look::StartLine));
+ f.remove(Look::StartLine);
+ assert!(!f.contains(Look::StartLine));
+
+ f.insert(Look::EndLine);
+ assert!(f.contains(Look::EndLine));
+ f.remove(Look::EndLine);
+ assert!(!f.contains(Look::EndLine));
+
+ f.insert(Look::WordBoundaryUnicode);
+ assert!(f.contains(Look::WordBoundaryUnicode));
+ f.remove(Look::WordBoundaryUnicode);
+ assert!(!f.contains(Look::WordBoundaryUnicode));
+
+ f.insert(Look::WordBoundaryUnicodeNegate);
+ assert!(f.contains(Look::WordBoundaryUnicodeNegate));
+ f.remove(Look::WordBoundaryUnicodeNegate);
+ assert!(!f.contains(Look::WordBoundaryUnicodeNegate));
+
+ f.insert(Look::WordBoundaryAscii);
+ assert!(f.contains(Look::WordBoundaryAscii));
+ f.remove(Look::WordBoundaryAscii);
+ assert!(!f.contains(Look::WordBoundaryAscii));
+
+ f.insert(Look::WordBoundaryAsciiNegate);
+ assert!(f.contains(Look::WordBoundaryAsciiNegate));
+ f.remove(Look::WordBoundaryAsciiNegate);
+ assert!(!f.contains(Look::WordBoundaryAsciiNegate));
+ }
+
+ #[test]
+ fn look_matches_start_line() {
+ let look = Look::StartLine;
+
+ assert!(look.matches(B(""), 0));
+ assert!(look.matches(B("\n"), 0));
+ assert!(look.matches(B("\n"), 1));
+ assert!(look.matches(B("a"), 0));
+ assert!(look.matches(B("\na"), 1));
+
+ assert!(!look.matches(B("a"), 1));
+ assert!(!look.matches(B("a\na"), 1));
+ }
+
+ #[test]
+ fn look_matches_end_line() {
+ let look = Look::EndLine;
+
+ assert!(look.matches(B(""), 0));
+ assert!(look.matches(B("\n"), 1));
+ assert!(look.matches(B("\na"), 0));
+ assert!(look.matches(B("\na"), 2));
+ assert!(look.matches(B("a\na"), 1));
+
+ assert!(!look.matches(B("a"), 0));
+ assert!(!look.matches(B("\na"), 1));
+ assert!(!look.matches(B("a\na"), 0));
+ assert!(!look.matches(B("a\na"), 2));
+ }
+
+ #[test]
+ fn look_matches_start_text() {
+ let look = Look::StartText;
+
+ assert!(look.matches(B(""), 0));
+ assert!(look.matches(B("\n"), 0));
+ assert!(look.matches(B("a"), 0));
+
+ assert!(!look.matches(B("\n"), 1));
+ assert!(!look.matches(B("\na"), 1));
+ assert!(!look.matches(B("a"), 1));
+ assert!(!look.matches(B("a\na"), 1));
+ }
+
+ #[test]
+ fn look_matches_end_text() {
+ let look = Look::EndText;
+
+ assert!(look.matches(B(""), 0));
+ assert!(look.matches(B("\n"), 1));
+ assert!(look.matches(B("\na"), 2));
+
+ assert!(!look.matches(B("\na"), 0));
+ assert!(!look.matches(B("a\na"), 1));
+ assert!(!look.matches(B("a"), 0));
+ assert!(!look.matches(B("\na"), 1));
+ assert!(!look.matches(B("a\na"), 0));
+ assert!(!look.matches(B("a\na"), 2));
+ }
+
+ #[test]
+ fn look_matches_word_unicode() {
+ let look = Look::WordBoundaryUnicode;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(look.matches(B("a"), 0));
+ assert!(look.matches(B("a"), 1));
+ assert!(look.matches(B("a "), 1));
+ assert!(look.matches(B(" a "), 1));
+ assert!(look.matches(B(" a "), 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint.
+ assert!(look.matches(B("𝛃"), 0));
+ assert!(look.matches(B("𝛃"), 4));
+ assert!(look.matches(B("𝛃 "), 4));
+ assert!(look.matches(B(" 𝛃 "), 1));
+ assert!(look.matches(B(" 𝛃 "), 5));
+
+ // Unicode word boundaries between non-ASCII codepoints.
+ assert!(look.matches(B("𝛃𐆀"), 0));
+ assert!(look.matches(B("𝛃𐆀"), 4));
+
+ // Non word boundaries for ASCII.
+ assert!(!look.matches(B(""), 0));
+ assert!(!look.matches(B("ab"), 1));
+ assert!(!look.matches(B("a "), 2));
+ assert!(!look.matches(B(" a "), 0));
+ assert!(!look.matches(B(" a "), 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(!look.matches(B("𝛃b"), 4));
+ assert!(!look.matches(B("𝛃 "), 5));
+ assert!(!look.matches(B(" 𝛃 "), 0));
+ assert!(!look.matches(B(" 𝛃 "), 6));
+ assert!(!look.matches(B("𝛃"), 1));
+ assert!(!look.matches(B("𝛃"), 2));
+ assert!(!look.matches(B("𝛃"), 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!look.matches(B("𝛃𐆀"), 1));
+ assert!(!look.matches(B("𝛃𐆀"), 2));
+ assert!(!look.matches(B("𝛃𐆀"), 3));
+ assert!(!look.matches(B("𝛃𐆀"), 5));
+ assert!(!look.matches(B("𝛃𐆀"), 6));
+ assert!(!look.matches(B("𝛃𐆀"), 7));
+ assert!(!look.matches(B("𝛃𐆀"), 8));
+ }
+
+ #[test]
+ fn look_matches_word_ascii() {
+ let look = Look::WordBoundaryAscii;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(look.matches(B("a"), 0));
+ assert!(look.matches(B("a"), 1));
+ assert!(look.matches(B("a "), 1));
+ assert!(look.matches(B(" a "), 1));
+ assert!(look.matches(B(" a "), 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint. Since this is
+ // an ASCII word boundary, none of these match.
+ assert!(!look.matches(B("𝛃"), 0));
+ assert!(!look.matches(B("𝛃"), 4));
+ assert!(!look.matches(B("𝛃 "), 4));
+ assert!(!look.matches(B(" 𝛃 "), 1));
+ assert!(!look.matches(B(" 𝛃 "), 5));
+
+ // Unicode word boundaries between non-ASCII codepoints. Again, since
+ // this is an ASCII word boundary, none of these match.
+ assert!(!look.matches(B("𝛃𐆀"), 0));
+ assert!(!look.matches(B("𝛃𐆀"), 4));
+
+ // Non word boundaries for ASCII.
+ assert!(!look.matches(B(""), 0));
+ assert!(!look.matches(B("ab"), 1));
+ assert!(!look.matches(B("a "), 2));
+ assert!(!look.matches(B(" a "), 0));
+ assert!(!look.matches(B(" a "), 3));
+
+ // Positions involving a non-ASCII codepoint. Note that offset 4 of
+ // "𝛃b" *is* a boundary here: the preceding byte (\x83) is not an
+ // ASCII word byte, but 'b' is.
+ assert!(look.matches(B("𝛃b"), 4));
+ assert!(!look.matches(B("𝛃 "), 5));
+ assert!(!look.matches(B(" 𝛃 "), 0));
+ assert!(!look.matches(B(" 𝛃 "), 6));
+ assert!(!look.matches(B("𝛃"), 1));
+ assert!(!look.matches(B("𝛃"), 2));
+ assert!(!look.matches(B("𝛃"), 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(!look.matches(B("𝛃𐆀"), 1));
+ assert!(!look.matches(B("𝛃𐆀"), 2));
+ assert!(!look.matches(B("𝛃𐆀"), 3));
+ assert!(!look.matches(B("𝛃𐆀"), 5));
+ assert!(!look.matches(B("𝛃𐆀"), 6));
+ assert!(!look.matches(B("𝛃𐆀"), 7));
+ assert!(!look.matches(B("𝛃𐆀"), 8));
+ }
+
+ #[test]
+ fn look_matches_word_unicode_negate() {
+ let look = Look::WordBoundaryUnicodeNegate;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(!look.matches(B("a"), 0));
+ assert!(!look.matches(B("a"), 1));
+ assert!(!look.matches(B("a "), 1));
+ assert!(!look.matches(B(" a "), 1));
+ assert!(!look.matches(B(" a "), 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint.
+ assert!(!look.matches(B("𝛃"), 0));
+ assert!(!look.matches(B("𝛃"), 4));
+ assert!(!look.matches(B("𝛃 "), 4));
+ assert!(!look.matches(B(" 𝛃 "), 1));
+ assert!(!look.matches(B(" 𝛃 "), 5));
+
+ // Unicode word boundaries between non-ASCII codepoints.
+ assert!(!look.matches(B("𝛃𐆀"), 0));
+ assert!(!look.matches(B("𝛃𐆀"), 4));
+
+ // Non word boundaries for ASCII.
+ assert!(look.matches(B(""), 0));
+ assert!(look.matches(B("ab"), 1));
+ assert!(look.matches(B("a "), 2));
+ assert!(look.matches(B(" a "), 0));
+ assert!(look.matches(B(" a "), 3));
+
+ // Non word boundaries with a non-ASCII codepoint.
+ assert!(look.matches(B("𝛃b"), 4));
+ assert!(look.matches(B("𝛃 "), 5));
+ assert!(look.matches(B(" 𝛃 "), 0));
+ assert!(look.matches(B(" 𝛃 "), 6));
+ // These don't match because they could otherwise return an offset that
+ // splits the UTF-8 encoding of a codepoint.
+ assert!(!look.matches(B("𝛃"), 1));
+ assert!(!look.matches(B("𝛃"), 2));
+ assert!(!look.matches(B("𝛃"), 3));
+
+ // Non word boundaries with non-ASCII codepoints. These also don't
+ // match because they could otherwise return an offset that splits the
+ // UTF-8 encoding of a codepoint.
+ assert!(!look.matches(B("𝛃𐆀"), 1));
+ assert!(!look.matches(B("𝛃𐆀"), 2));
+ assert!(!look.matches(B("𝛃𐆀"), 3));
+ assert!(!look.matches(B("𝛃𐆀"), 5));
+ assert!(!look.matches(B("𝛃𐆀"), 6));
+ assert!(!look.matches(B("𝛃𐆀"), 7));
+ // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
+ // of the haystack. So the "end" of the haystack isn't a word and 𐆀
+ // isn't a word, thus, \B matches.
+ assert!(look.matches(B("𝛃𐆀"), 8));
+ }
+
+ #[test]
+ fn look_matches_word_ascii_negate() {
+ let look = Look::WordBoundaryAsciiNegate;
+
+ // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+ // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+ // Simple ASCII word boundaries.
+ assert!(!look.matches(B("a"), 0));
+ assert!(!look.matches(B("a"), 1));
+ assert!(!look.matches(B("a "), 1));
+ assert!(!look.matches(B(" a "), 1));
+ assert!(!look.matches(B(" a "), 2));
+
+ // Unicode word boundaries with a non-ASCII codepoint. Since this is
+ // a *negated* ASCII word boundary, all of these match.
+ assert!(look.matches(B("𝛃"), 0));
+ assert!(look.matches(B("𝛃"), 4));
+ assert!(look.matches(B("𝛃 "), 4));
+ assert!(look.matches(B(" 𝛃 "), 1));
+ assert!(look.matches(B(" 𝛃 "), 5));
+
+ // Unicode word boundaries between non-ASCII codepoints. Again, since
+ // this is a *negated* ASCII word boundary, all of these match.
+ assert!(look.matches(B("𝛃𐆀"), 0));
+ assert!(look.matches(B("𝛃𐆀"), 4));
+
+ // Non word boundaries for ASCII.
+ assert!(look.matches(B(""), 0));
+ assert!(look.matches(B("ab"), 1));
+ assert!(look.matches(B("a "), 2));
+ assert!(look.matches(B(" a "), 0));
+ assert!(look.matches(B(" a "), 3));
+
+ // Positions involving a non-ASCII codepoint. Offset 4 of "𝛃b" is an
+ // ASCII word boundary (see the non-negated test above), so the
+ // negated assertion does not match there.
+ assert!(!look.matches(B("𝛃b"), 4));
+ assert!(look.matches(B("𝛃 "), 5));
+ assert!(look.matches(B(" 𝛃 "), 0));
+ assert!(look.matches(B(" 𝛃 "), 6));
+ assert!(look.matches(B("𝛃"), 1));
+ assert!(look.matches(B("𝛃"), 2));
+ assert!(look.matches(B("𝛃"), 3));
+
+ // Non word boundaries with non-ASCII codepoints.
+ assert!(look.matches(B("𝛃𐆀"), 1));
+ assert!(look.matches(B("𝛃𐆀"), 2));
+ assert!(look.matches(B("𝛃𐆀"), 3));
+ assert!(look.matches(B("𝛃𐆀"), 5));
+ assert!(look.matches(B("𝛃𐆀"), 6));
+ assert!(look.matches(B("𝛃𐆀"), 7));
+ assert!(look.matches(B("𝛃𐆀"), 8));
+ }
+
+ fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] {
+ string.as_ref()
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/pikevm.rs b/vendor/regex-automata/src/nfa/thompson/pikevm.rs
new file mode 100644
index 000000000..7572f9f10
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/pikevm.rs
@@ -0,0 +1,554 @@
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{
+ nfa::thompson::{self, Error, State, NFA},
+ util::{
+ id::{PatternID, StateID},
+ matchtypes::MultiMatch,
+ sparse_set::SparseSet,
+ },
+};
+
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+ anchored: Option<bool>,
+ utf8: Option<bool>,
+}
+
+impl Config {
+ /// Return a new default PikeVM configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ pub fn anchored(mut self, yes: bool) -> Config {
+ self.anchored = Some(yes);
+ self
+ }
+
+ pub fn utf8(mut self, yes: bool) -> Config {
+ self.utf8 = Some(yes);
+ self
+ }
+
+ pub fn get_anchored(&self) -> bool {
+ self.anchored.unwrap_or(false)
+ }
+
+ pub fn get_utf8(&self) -> bool {
+ self.utf8.unwrap_or(true)
+ }
+
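+ /// Overwrite the options in `self` with any options explicitly set in
+ /// `o`. Options left unset in `o` fall back to the values in `self`.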
+ pub(crate) fn overwrite(self, o: Config) -> Config {
+ Config {
+ anchored: o.anchored.or(self.anchored),
+ utf8: o.utf8.or(self.utf8),
+ }
+ }
+}
+
+/// A builder for a PikeVM.
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ thompson: thompson::Builder,
+}
+
+impl Builder {
+ /// Create a new PikeVM builder with its default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ config: Config::default(),
+ thompson: thompson::Builder::new(),
+ }
+ }
+
+ pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> {
+ self.build_many(&[pattern])
+ }
+
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<PikeVM, Error> {
+ let nfa = self.thompson.build_many(patterns)?;
+ self.build_from_nfa(Arc::new(nfa))
+ }
+
+ pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> {
+ // TODO: Check that this is correct.
+ // if !cfg!(all(
+ // feature = "dfa",
+ // feature = "syntax",
+ // feature = "unicode-perl"
+ // )) {
+ if !cfg!(feature = "syntax") {
+ if nfa.has_word_boundary_unicode() {
+ return Err(Error::unicode_word_unavailable());
+ }
+ }
+ Ok(PikeVM { config: self.config, nfa })
+ }
+
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`SyntaxConfig`](crate::SyntaxConfig).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// These settings only apply when constructing a PikeVM directly from a
+ /// pattern.
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::SyntaxConfig,
+ ) -> &mut Builder {
+ self.thompson.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like if additional time should be spent
+ /// shrinking the size of the NFA.
+ ///
+ /// These settings only apply when constructing a PikeVM directly from a
+ /// pattern.
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.thompson.configure(config);
+ self
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct PikeVM {
+ config: Config,
+ nfa: Arc<NFA>,
+}
+
+impl PikeVM {
+ pub fn new(pattern: &str) -> Result<PikeVM, Error> {
+ PikeVM::builder().build(pattern)
+ }
+
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> {
+ PikeVM::builder().build_many(patterns)
+ }
+
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ pub fn create_cache(&self) -> Cache {
+ Cache::new(self.nfa())
+ }
+
+ pub fn create_captures(&self) -> Captures {
+ Captures::new(self.nfa())
+ }
+
+ pub fn nfa(&self) -> &Arc<NFA> {
+ &self.nfa
+ }
+
+ pub fn find_leftmost_iter<'r, 'c, 't>(
+ &'r self,
+ cache: &'c mut Cache,
+ haystack: &'t [u8],
+ ) -> FindLeftmostMatches<'r, 'c, 't> {
+ FindLeftmostMatches::new(self, cache, haystack)
+ }
+
+ // BREADCRUMBS:
+ //
+ // 1) Don't forget about prefilters.
+ //
+ // 2) Consider the case of using a PikeVM with an NFA that has Capture
+ // states, but where we don't want to track capturing groups (other than
+ // group 0). This potentially saves a lot of copying around and what not. I
+ // believe the current regex crate does this, for example. The interesting
+ // bit here is how to handle the case of multiple patterns...
+ //
+ // 3) Permit the caller to specify a pattern ID to run an anchored-only
+ // search on.
+ //
+ // 4) How to do overlapping? The way multi-regex support works in the regex
+ // crate currently is to run the PikeVM until either we reach the end of
+ // the haystack or when we know all regexes have matched. The latter case
+ // is probably quite rare, so the common case is likely that we're always
+ // searching the entire input. The question is: can we emulate that with
+ // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then
+ // all we need to do is provide an overlapping API on the PikeVM that
+ // roughly matches the ones we provide on DFAs. For those APIs, the only
+ // thing they need over non-overlapping APIs is "caller state." For DFAs,
+ // the caller state is simple: it contains the last state visited and the
+ // last match reported. For the PikeVM (and NFAs in general), the "last
+ // state" is actually a *set* of NFA states. So I think what happens here
+ // is that we can just force the `Cache` to subsume this role. We'll still
+ // need some additional state to track the last match reported though.
+ // Because when two or more patterns match at the same location, we need a
+ // way to know to iterate over them. Although maybe it's not match index we
+ // need, but the state index of the last NFA state processed in the cache.
+ // Then we just pick up where we left off. There might be another match
+ // state, in which case, we report it.
+
+ pub fn find_leftmost_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ caps: &mut Captures,
+ ) -> Option<MultiMatch> {
+ let anchored =
+ self.config.get_anchored() || self.nfa.is_always_start_anchored();
+ let mut at = start;
+ let mut matched_pid = None;
+ cache.clear();
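+ // 'clist' is the set of threads (NFA states plus capture slots)
+ // alive at the current position, while 'nlist' collects the threads
+ // alive at the next position; the two are swapped after each step.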
+ 'LOOP: loop {
+ if cache.clist.set.is_empty() {
+ if matched_pid.is_some() || (anchored && at > start) {
+ break 'LOOP;
+ }
+ // TODO: prefilter
+ }
+ if (!anchored && matched_pid.is_none())
+ || cache.clist.set.is_empty()
+ {
+ self.epsilon_closure(
+ &mut cache.clist,
+ &mut caps.slots,
+ &mut cache.stack,
+ self.nfa.start_anchored(),
+ haystack,
+ at,
+ );
+ }
+ for i in 0..cache.clist.set.len() {
+ let sid = cache.clist.set.get(i);
+ let pid = match self.step(
+ &mut cache.nlist,
+ &mut caps.slots,
+ cache.clist.caps(sid),
+ &mut cache.stack,
+ sid,
+ haystack,
+ at,
+ ) {
+ None => continue,
+ Some(pid) => pid,
+ };
+ matched_pid = Some(pid);
+ break;
+ }
+ if at >= end {
+ break;
+ }
+ at += 1;
+ cache.swap();
+ cache.nlist.set.clear();
+ }
+ matched_pid.map(|pid| {
+ let slots = self.nfa.pattern_slots(pid);
+ let (start, end) = (slots.start, slots.start + 1);
+ MultiMatch::new(
+ pid,
+ caps.slots[start].unwrap(),
+ caps.slots[end].unwrap(),
+ )
+ })
+ }
+
+ #[inline(always)]
+ fn step(
+ &self,
+ nlist: &mut Threads,
+ slots: &mut [Slot],
+ thread_caps: &mut [Slot],
+ stack: &mut Vec<FollowEpsilon>,
+ sid: StateID,
+ haystack: &[u8],
+ at: usize,
+ ) -> Option<PatternID> {
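+ // Fail states and epsilon transitions (Look/Union/Capture) never
+ // consume input: epsilon transitions were already followed when
+ // this thread's closure was computed, so 'step' only advances
+ // byte-consuming states and reports matches.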
+ match *self.nfa.state(sid) {
+ State::Fail
+ | State::Look { .. }
+ | State::Union { .. }
+ | State::Capture { .. } => None,
+ State::Range { ref range } => {
+ if range.matches(haystack, at) {
+ self.epsilon_closure(
+ nlist,
+ thread_caps,
+ stack,
+ range.next,
+ haystack,
+ at + 1,
+ );
+ }
+ None
+ }
+ State::Sparse(ref sparse) => {
+ if let Some(next) = sparse.matches(haystack, at) {
+ self.epsilon_closure(
+ nlist,
+ thread_caps,
+ stack,
+ next,
+ haystack,
+ at + 1,
+ );
+ }
+ None
+ }
+ State::Match { id } => {
+ slots.copy_from_slice(thread_caps);
+ Some(id)
+ }
+ }
+ }
+
+ #[inline(always)]
+ fn epsilon_closure(
+ &self,
+ nlist: &mut Threads,
+ thread_caps: &mut [Slot],
+ stack: &mut Vec<FollowEpsilon>,
+ sid: StateID,
+ haystack: &[u8],
+ at: usize,
+ ) {
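+ // The stack interleaves two kinds of frames: states whose epsilon
+ // closure still needs to be explored, and capture-slot restorations
+ // that undo a speculative slot write once a branch has been fully
+ // explored (see FollowEpsilon below).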
+ stack.push(FollowEpsilon::StateID(sid));
+ while let Some(frame) = stack.pop() {
+ match frame {
+ FollowEpsilon::StateID(sid) => {
+ self.epsilon_closure_step(
+ nlist,
+ thread_caps,
+ stack,
+ sid,
+ haystack,
+ at,
+ );
+ }
+ FollowEpsilon::Capture { slot, pos } => {
+ thread_caps[slot] = pos;
+ }
+ }
+ }
+ }
+
+ #[inline(always)]
+ fn epsilon_closure_step(
+ &self,
+ nlist: &mut Threads,
+ thread_caps: &mut [Slot],
+ stack: &mut Vec<FollowEpsilon>,
+ mut sid: StateID,
+ haystack: &[u8],
+ at: usize,
+ ) {
+ loop {
+ if !nlist.set.insert(sid) {
+ return;
+ }
+ match *self.nfa.state(sid) {
+ State::Fail
+ | State::Range { .. }
+ | State::Sparse { .. }
+ | State::Match { .. } => {
+ let t = &mut nlist.caps(sid);
+ t.copy_from_slice(thread_caps);
+ return;
+ }
+ State::Look { look, next } => {
+ if !look.matches(haystack, at) {
+ return;
+ }
+ sid = next;
+ }
+ State::Union { ref alternates } => {
+ sid = match alternates.get(0) {
+ None => return,
+ Some(&sid) => sid,
+ };
+ stack.extend(
+ alternates[1..]
+ .iter()
+ .copied()
+ .rev()
+ .map(FollowEpsilon::StateID),
+ );
+ }
+ State::Capture { next, slot } => {
+ if slot < thread_caps.len() {
+ stack.push(FollowEpsilon::Capture {
+ slot,
+ pos: thread_caps[slot],
+ });
+ thread_caps[slot] = Some(at);
+ }
+ sid = next;
+ }
+ }
+ }
+ }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches can be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
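+///
+/// A minimal usage sketch (assuming the `PikeVM` API defined above and the
+/// `MultiMatch` accessors from `util::matchtypes`):
+///
+/// ```ignore
+/// let vm = PikeVM::new(r"\w+").unwrap();
+/// let mut cache = vm.create_cache();
+/// for m in vm.find_leftmost_iter(&mut cache, b"foo bar") {
+///     println!("pattern {:?} at {}..{}", m.pattern(), m.start(), m.end());
+/// }
+/// ```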
+#[derive(Debug)]
+pub struct FindLeftmostMatches<'r, 'c, 't> {
+ vm: &'r PikeVM,
+ cache: &'c mut Cache,
+ // scanner: Option<prefilter::Scanner<'r>>,
+ text: &'t [u8],
+ last_end: usize,
+ last_match: Option<usize>,
+}
+
+impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
+ fn new(
+ vm: &'r PikeVM,
+ cache: &'c mut Cache,
+ text: &'t [u8],
+ ) -> FindLeftmostMatches<'r, 'c, 't> {
+ FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None }
+ }
+}
+
+impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
+ // type Item = Captures;
+ type Item = MultiMatch;
+
+ // fn next(&mut self) -> Option<Captures> {
+ fn next(&mut self) -> Option<MultiMatch> {
+ if self.last_end > self.text.len() {
+ return None;
+ }
+ let mut caps = self.vm.create_captures();
+ let m = self.vm.find_leftmost_at(
+ self.cache,
+ self.text,
+ self.last_end,
+ self.text.len(),
+ &mut caps,
+ )?;
+ if m.is_empty() {
+ // This is an empty match. To ensure we make progress, start
+ // the next search at the smallest possible starting position
+ // of the next match following this one.
+ self.last_end = if self.vm.config.get_utf8() {
+ crate::util::next_utf8(self.text, m.end())
+ } else {
+ m.end() + 1
+ };
+ // Don't accept empty matches immediately following a match.
+ // Just move on to the next match.
+ if Some(m.end()) == self.last_match {
+ return self.next();
+ }
+ } else {
+ self.last_end = m.end();
+ }
+ self.last_match = Some(m.end());
+ Some(m)
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Captures {
+ slots: Vec<Slot>,
+}
+
+impl Captures {
+ pub fn new(nfa: &NFA) -> Captures {
+ Captures { slots: vec![None; nfa.capture_slot_len()] }
+ }
+}
+
+#[derive(Clone, Debug)]
+pub struct Cache {
+ stack: Vec<FollowEpsilon>,
+ clist: Threads,
+ nlist: Threads,
+}
+
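+/// A single capture slot: the haystack offset at which a capture group
+/// opened or closed, or `None` if the group has not participated in a
+/// match yet.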
+type Slot = Option<usize>;
+
+#[derive(Clone, Debug)]
+struct Threads {
+ set: SparseSet,
+ caps: Vec<Slot>,
+ slots_per_thread: usize,
+}
+
+#[derive(Clone, Debug)]
+enum FollowEpsilon {
+ StateID(StateID),
+ Capture { slot: usize, pos: Slot },
+}
+
+impl Cache {
+ pub fn new(nfa: &NFA) -> Cache {
+ Cache {
+ stack: vec![],
+ clist: Threads::new(nfa),
+ nlist: Threads::new(nfa),
+ }
+ }
+
+ fn clear(&mut self) {
+ self.stack.clear();
+ self.clist.set.clear();
+ self.nlist.set.clear();
+ }
+
+ fn swap(&mut self) {
+ core::mem::swap(&mut self.clist, &mut self.nlist);
+ }
+}
+
+impl Threads {
+ fn new(nfa: &NFA) -> Threads {
+ let mut threads = Threads {
+ set: SparseSet::new(0),
+ caps: vec![],
+ slots_per_thread: 0,
+ };
+ threads.resize(nfa);
+ threads
+ }
+
+ fn resize(&mut self, nfa: &NFA) {
+ if nfa.states().len() == self.set.capacity() {
+ return;
+ }
+ self.slots_per_thread = nfa.capture_slot_len();
+ self.set.resize(nfa.states().len());
+ self.caps.resize(self.slots_per_thread * nfa.states().len(), None);
+ }
+
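+ /// Return the capture slots for the thread corresponding to the given
+ /// state ID. Each NFA state gets its own contiguous block of
+ /// 'slots_per_thread' slots within 'caps'.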
+ fn caps(&mut self, sid: StateID) -> &mut [Slot] {
+ let i = sid.as_usize() * self.slots_per_thread;
+ &mut self.caps[i..i + self.slots_per_thread]
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
index 50767c7c6..92f36ce3a 100644
--- a/vendor/regex-automata/src/nfa/range_trie.rs
+++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs
@@ -60,7 +60,7 @@
// Another approach, however, is to reuse an algorithm for constructing a
// *minimal* DFA from a sorted sequence of inputs. I don't want to go into
// the full details here, but I explain it in more depth in my blog post on
-// FSTs[1]. Note that the algorithm not invented by me, but was published
+// FSTs[1]. Note that the algorithm was not invented by me, but was published
// in a paper by Daciuk et al. in 2000 called "Incremental Construction of
// Minimal Acyclic Finite-State Automata." Like the suffix cache approach above,
// it is also possible to control the amount of extra memory one uses, although
@@ -76,10 +76,11 @@
// [BC-BF][80-BF]
// [BC-BF][90-BF]
//
-// Then Daciuk's algorithm also would not work, since there is nothing to
-// handle the fact that the ranges overlap. They would need to be split apart.
-// Thankfully, Thompson's algorithm for producing byte ranges for Unicode
-// codepoint ranges meets both of our requirements.
+// Then Daciuk's algorithm would not work, since there is nothing to handle the
+// fact that the ranges overlap. They would need to be split apart. Thankfully,
+// Thompson's algorithm for producing byte ranges for Unicode codepoint ranges
+// meets both of our requirements. (A proof for this eludes me, but it appears
+// true.)
//
// ... however, we would also like to be able to compile UTF-8 automata in
// reverse. We want this because in order to find the starting location of a
@@ -139,11 +140,9 @@
// [1] - https://blog.burntsushi.net/transducers/
// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
-use std::cell::RefCell;
-use std::fmt;
-use std::mem;
-use std::ops::RangeInclusive;
-use std::u32;
+use core::{cell::RefCell, fmt, mem, ops::RangeInclusive, u32};
+
+use alloc::{format, string::String, vec, vec::Vec};
use regex_syntax::utf8::Utf8Range;
@@ -188,8 +187,8 @@ pub struct RangeTrie {
/// particular order.
states: Vec<State>,
/// A free-list of states. When a range trie is cleared, all of its states
- /// are added to list. Creating a new state reuses states from this list
- /// before allocating a new one.
+ /// are added to this list. Creating a new state reuses states from this
+ /// list before allocating a new one.
free: Vec<State>,
/// A stack for traversing this trie to yield sequences of byte ranges in
/// lexicographic order.
@@ -197,7 +196,7 @@ pub struct RangeTrie {
/// A buffer that stores the current sequence during iteration.
iter_ranges: RefCell<Vec<Utf8Range>>,
/// A stack used for traversing the trie in order to (deeply) duplicate
- /// a state.
+ /// a state. States are recursively duplicated when ranges are split.
dupe_stack: Vec<NextDupe>,
/// A stack used for traversing the trie during insertion of a new
/// sequence of byte ranges.
@@ -249,7 +248,10 @@ impl RangeTrie {
/// Iterate over all of the sequences of byte ranges in this trie, and
/// call the provided function for each sequence. Iteration occurs in
/// lexicographic order.
- pub fn iter<F: FnMut(&[Utf8Range])>(&self, mut f: F) {
+ pub fn iter<E, F: FnMut(&[Utf8Range]) -> Result<(), E>>(
+ &self,
+ mut f: F,
+ ) -> Result<(), E> {
let mut stack = self.iter_stack.borrow_mut();
stack.clear();
let mut ranges = self.iter_ranges.borrow_mut();
@@ -264,7 +266,7 @@ impl RangeTrie {
// here, but at the cost of more stack pushes.
loop {
let state = self.state(state_id);
- // If we're visited all transitions in this state, then pop
+ // If we've visited all transitions in this state, then pop
// back to the parent state.
if tidx >= state.transitions.len() {
ranges.pop();
@@ -274,7 +276,7 @@ impl RangeTrie {
let t = &state.transitions[tidx];
ranges.push(t.range);
if t.next_id == FINAL {
- f(&ranges);
+ f(&ranges)?;
ranges.pop();
tidx += 1;
} else {
@@ -288,6 +290,7 @@ impl RangeTrie {
}
}
}
+ Ok(())
}
/// Inserts a new sequence of ranges into this trie.
@@ -455,8 +458,8 @@ impl RangeTrie {
/// the given state ID and the returned state ID share nothing.
///
/// This is useful during range trie insertion when a new range overlaps
- /// with an existing range that is bigger than the new one. The part of
- /// the existing range that does *not* overlap with the new one is that
+ /// with an existing range that is bigger than the new one. The part
+ /// of the existing range that does *not* overlap with the new one is
/// duplicated so that adding the new range to the overlap doesn't disturb
/// the non-overlapping portion.
///
@@ -594,7 +597,7 @@ impl State {
// Benchmarks suggest that binary search is just a bit faster than
// straight linear search. Specifically when using the debug tool:
//
- // hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
+ // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
binary_search(&self.transitions, |t| range.start <= t.range.end)
}
@@ -865,7 +868,7 @@ impl Split {
}
impl fmt::Debug for RangeTrie {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "")?;
for (i, state) in self.states.iter().enumerate() {
let status = if i == FINAL as usize { '*' } else { ' ' };
@@ -876,7 +879,7 @@ impl fmt::Debug for RangeTrie {
}
impl fmt::Debug for State {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let rs = self
.transitions
.iter()
@@ -888,7 +891,7 @@ impl fmt::Debug for State {
}
impl fmt::Debug for Transition {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.range.start == self.range.end {
write!(f, "{:02X} => {:02X}", self.range.start, self.next_id)
} else {
@@ -908,7 +911,7 @@ fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool {
#[cfg(test)]
mod tests {
- use std::ops::RangeInclusive;
+ use core::ops::RangeInclusive;
use regex_syntax::utf8::Utf8Range;
diff --git a/vendor/regex-automata/src/regex.rs b/vendor/regex-automata/src/regex.rs
deleted file mode 100644
index 47e1c5819..000000000
--- a/vendor/regex-automata/src/regex.rs
+++ /dev/null
@@ -1,771 +0,0 @@
-#[cfg(feature = "std")]
-use dense::{self, DenseDFA};
-use dfa::DFA;
-#[cfg(feature = "std")]
-use error::Result;
-#[cfg(feature = "std")]
-use sparse::SparseDFA;
-#[cfg(feature = "std")]
-use state_id::StateID;
-
-/// A regular expression that uses deterministic finite automata for fast
-/// searching.
-///
-/// A regular expression is comprised of two DFAs, a "forward" DFA and a
-/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
-/// match while the reverse DFA is responsible for detecting the start of a
-/// match. Thus, in order to find the bounds of any given match, a forward
-/// search must first be run followed by a reverse search. A match found by
-/// the forward DFA guarantees that the reverse DFA will also find a match.
-///
-/// The type of the DFA used by a `Regex` corresponds to the `D` type
-/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
-/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
-/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
-/// search faster, while sparse DFAs use less memory but search more slowly.
-///
-/// By default, a regex's DFA type parameter is set to
-/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the
-/// most convenient type that gives the best search performance.
-///
-/// # Sparse DFAs
-///
-/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
-/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
-/// enough to build corresponding sparse DFAs, and then build a regex from
-/// them:
-///
-/// ```
-/// use regex_automata::Regex;
-///
-/// # fn example() -> Result<(), regex_automata::Error> {
-/// // First, build a regex that uses dense DFAs.
-/// let dense_re = Regex::new("foo[0-9]+")?;
-///
-/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
-/// let fwd = dense_re.forward().to_sparse()?;
-/// let rev = dense_re.reverse().to_sparse()?;
-///
-/// // Third, build a new regex from the constituent sparse DFAs.
-/// let sparse_re = Regex::from_dfas(fwd, rev);
-///
-/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
-/// assert_eq!(true, sparse_re.is_match(b"foo123"));
-/// # Ok(()) }; example().unwrap()
-/// ```
-#[cfg(feature = "std")]
-#[derive(Clone, Debug)]
-pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
- forward: D,
- reverse: D,
-}
-
-/// A regular expression that uses deterministic finite automata for fast
-/// searching.
-///
-/// A regular expression is comprised of two DFAs, a "forward" DFA and a
-/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
-/// match while the reverse DFA is responsible for detecting the start of a
-/// match. Thus, in order to find the bounds of any given match, a forward
-/// search must first be run followed by a reverse search. A match found by
-/// the forward DFA guarantees that the reverse DFA will also find a match.
-///
-/// The type of the DFA used by a `Regex` corresponds to the `D` type
-/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
-/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
-/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
-/// search faster, while sparse DFAs use less memory but search more slowly.
-///
-/// When using this crate without the standard library, the `Regex` type has
-/// no default type parameter.
-///
-/// # Sparse DFAs
-///
-/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
-/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
-/// enough to build corresponding sparse DFAs, and then build a regex from
-/// them:
-///
-/// ```
-/// use regex_automata::Regex;
-///
-/// # fn example() -> Result<(), regex_automata::Error> {
-/// // First, build a regex that uses dense DFAs.
-/// let dense_re = Regex::new("foo[0-9]+")?;
-///
-/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
-/// let fwd = dense_re.forward().to_sparse()?;
-/// let rev = dense_re.reverse().to_sparse()?;
-///
-/// // Third, build a new regex from the constituent sparse DFAs.
-/// let sparse_re = Regex::from_dfas(fwd, rev);
-///
-/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
-/// assert_eq!(true, sparse_re.is_match(b"foo123"));
-/// # Ok(()) }; example().unwrap()
-/// ```
-#[cfg(not(feature = "std"))]
-#[derive(Clone, Debug)]
-pub struct Regex<D> {
- forward: D,
- reverse: D,
-}
-
-#[cfg(feature = "std")]
-impl Regex {
- /// Parse the given regular expression using a default configuration and
- /// return the corresponding regex.
- ///
- /// The default configuration uses `usize` for state IDs, premultiplies
- /// them and reduces the alphabet size by splitting bytes into equivalence
- /// classes. The underlying DFAs are *not* minimized.
- ///
- /// If you want a non-default configuration, then use the
- /// [`RegexBuilder`](struct.RegexBuilder.html)
- /// to set your own configuration.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let re = Regex::new("foo[0-9]+bar")?;
- /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn new(pattern: &str) -> Result<Regex> {
- RegexBuilder::new().build(pattern)
- }
-}
-
-#[cfg(feature = "std")]
-impl Regex<SparseDFA<Vec<u8>, usize>> {
- /// Parse the given regular expression using a default configuration and
- /// return the corresponding regex using sparse DFAs.
- ///
- /// The default configuration uses `usize` for state IDs, reduces the
- /// alphabet size by splitting bytes into equivalence classes. The
- /// underlying DFAs are *not* minimized.
- ///
- /// If you want a non-default configuration, then use the
- /// [`RegexBuilder`](struct.RegexBuilder.html)
- /// to set your own configuration.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let re = Regex::new_sparse("foo[0-9]+bar")?;
- /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn new_sparse(
- pattern: &str,
- ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
- RegexBuilder::new().build_sparse(pattern)
- }
-}
-
-impl<D: DFA> Regex<D> {
- /// Returns true if and only if the given bytes match.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let re = Regex::new("foo[0-9]+bar")?;
- /// assert_eq!(true, re.is_match(b"foo12345bar"));
- /// assert_eq!(false, re.is_match(b"foobar"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn is_match(&self, input: &[u8]) -> bool {
- self.is_match_at(input, 0)
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let re = Regex::new("foo[0-9]+")?;
- /// assert_eq!(Some(4), re.shortest_match(b"foo12345"));
- ///
- /// // Normally, the end of the leftmost first match here would be 3,
- /// // but the shortest match semantics detect a match earlier.
- /// let re = Regex::new("abc|a")?;
- /// assert_eq!(Some(1), re.shortest_match(b"abc"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
- self.shortest_match_at(input, 0)
- }
-
- /// Returns the start and end offset of the leftmost first match. If no
- /// match exists, then `None` is returned.
- ///
- /// The "leftmost first" match corresponds to the match with the smallest
- /// starting offset, but where the end offset is determined by preferring
- /// earlier branches in the original regular expression. For example,
- /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will
- /// match `Samwise` in `Samwise`.
- ///
- /// Generally speaking, the "leftmost first" match is how most backtracking
- /// regular expressions tend to work. This is in contrast to POSIX-style
- /// regular expressions that yield "leftmost longest" matches. Namely,
- /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
- /// leftmost longest semantics.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let re = Regex::new("foo[0-9]+")?;
- /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
- ///
- /// // Even though a match is found after reading the first byte (`a`),
- /// // the leftmost first match semantics demand that we find the earliest
- /// // match that prefers earlier parts of the pattern over latter parts.
- /// let re = Regex::new("abc|a")?;
- /// assert_eq!(Some((0, 3)), re.find(b"abc"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
- self.find_at(input, 0)
- }
-
- /// Returns the same as `is_match`, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, if the DFA is anchored, then
- /// a match can only occur when `start == 0`.
- pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
- self.forward().is_match_at(input, start)
- }
-
- /// Returns the same as `shortest_match`, but starts the search at the
- /// given offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, if the DFA is anchored, then
- /// a match can only occur when `start == 0`.
- pub fn shortest_match_at(
- &self,
- input: &[u8],
- start: usize,
- ) -> Option<usize> {
- self.forward().shortest_match_at(input, start)
- }
-
- /// Returns the same as `find`, but starts the search at the given
- /// offset.
- ///
- /// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, if the DFA is anchored, then
- /// a match can only occur when `start == 0`.
- pub fn find_at(
- &self,
- input: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- let end = match self.forward().find_at(input, start) {
- None => return None,
- Some(end) => end,
- };
- let start = self
- .reverse()
- .rfind(&input[start..end])
- .map(|i| start + i)
- .expect("reverse search must match if forward search does");
- Some((start, end))
- }
-
- /// Returns an iterator over all non-overlapping leftmost first matches
- /// in the given bytes. If no match exists, then the iterator yields no
- /// elements.
- ///
- /// Note that if the regex can match the empty string, then it is
- /// possible for the iterator to yield a zero-width match at a location
- /// that is not a valid UTF-8 boundary (for example, between the code units
- /// of a UTF-8 encoded codepoint). This can happen regardless of whether
- /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
- /// was enabled or not.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let re = Regex::new("foo[0-9]+")?;
- /// let text = b"foo1 foo12 foo123";
- /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
- /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]);
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
- Matches::new(self, input)
- }
-
- /// Build a new regex from its constituent forward and reverse DFAs.
- ///
- /// This is useful when deserializing a regex from some arbitrary
- /// memory region. This is also useful for building regexes from other
- /// types of DFAs.
- ///
- /// # Example
- ///
- /// This example is a bit a contrived. The usual use of these methods
- /// would involve serializing `initial_re` somewhere and then deserializing
- /// it later to build a regex.
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let initial_re = Regex::new("foo[0-9]+")?;
- /// assert_eq!(true, initial_re.is_match(b"foo123"));
- ///
- /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
- /// let re = Regex::from_dfas(fwd, rev);
- /// assert_eq!(true, re.is_match(b"foo123"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- ///
- /// This example shows how you might build smaller DFAs, and then use those
- /// smaller DFAs to build a new regex.
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let initial_re = Regex::new("foo[0-9]+")?;
- /// assert_eq!(true, initial_re.is_match(b"foo123"));
- ///
- /// let fwd = initial_re.forward().to_u16()?;
- /// let rev = initial_re.reverse().to_u16()?;
- /// let re = Regex::from_dfas(fwd, rev);
- /// assert_eq!(true, re.is_match(b"foo123"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- ///
- /// This example shows how to build a `Regex` that uses sparse DFAs instead
- /// of dense DFAs:
- ///
- /// ```
- /// use regex_automata::Regex;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let initial_re = Regex::new("foo[0-9]+")?;
- /// assert_eq!(true, initial_re.is_match(b"foo123"));
- ///
- /// let fwd = initial_re.forward().to_sparse()?;
- /// let rev = initial_re.reverse().to_sparse()?;
- /// let re = Regex::from_dfas(fwd, rev);
- /// assert_eq!(true, re.is_match(b"foo123"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
- Regex { forward, reverse }
- }
-
- /// Return the underlying DFA responsible for forward matching.
- pub fn forward(&self) -> &D {
- &self.forward
- }
-
- /// Return the underlying DFA responsible for reverse matching.
- pub fn reverse(&self) -> &D {
- &self.reverse
- }
-}
-
-/// An iterator over all non-overlapping matches for a particular search.
-///
-/// The iterator yields a `(usize, usize)` value until no more matches could be
-/// found. The first `usize` is the start of the match (inclusive) while the
-/// second `usize` is the end of the match (exclusive).
-///
-/// `S` is the type used to represent state identifiers in the underlying
-/// regex. The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression value itself.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Clone, Debug)]
-pub struct Matches<'r, 't, D: DFA + 'r> {
- re: &'r Regex<D>,
- text: &'t [u8],
- last_end: usize,
- last_match: Option<usize>,
-}
-
-impl<'r, 't, D: DFA> Matches<'r, 't, D> {
- fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
- Matches { re, text, last_end: 0, last_match: None }
- }
-}
-
-impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
- type Item = (usize, usize);
-
- fn next(&mut self) -> Option<(usize, usize)> {
- if self.last_end > self.text.len() {
- return None;
- }
- let (s, e) = match self.re.find_at(self.text, self.last_end) {
- None => return None,
- Some((s, e)) => (s, e),
- };
- if s == e {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = e + 1;
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(e) == self.last_match {
- return self.next();
- }
- } else {
- self.last_end = e;
- }
- self.last_match = Some(e);
- Some((s, e))
- }
-}
-
-/// A builder for a regex based on deterministic finite automatons.
-///
-/// This builder permits configuring several aspects of the construction
-/// process such as case insensitivity, Unicode support and various options
-/// that impact the size of the underlying DFAs. In some cases, options (like
-/// performing DFA minimization) can come with a substantial additional cost.
-///
-/// This builder generally constructs two DFAs, where one is responsible for
-/// finding the end of a match and the other is responsible for finding the
-/// start of a match. If you only need to detect whether something matched,
-/// or only the end of a match, then you should use a
-/// [`dense::Builder`](dense/struct.Builder.html)
-/// to construct a single DFA, which is cheaper than building two DFAs.
-#[cfg(feature = "std")]
-#[derive(Clone, Debug)]
-pub struct RegexBuilder {
- dfa: dense::Builder,
-}
-
-#[cfg(feature = "std")]
-impl RegexBuilder {
- /// Create a new regex builder with the default configuration.
- pub fn new() -> RegexBuilder {
- RegexBuilder { dfa: dense::Builder::new() }
- }
-
- /// Build a regex from the given pattern.
- ///
- /// If there was a problem parsing or compiling the pattern, then an error
- /// is returned.
- pub fn build(&self, pattern: &str) -> Result<Regex> {
- self.build_with_size::<usize>(pattern)
- }
-
- /// Build a regex from the given pattern using sparse DFAs.
- ///
- /// If there was a problem parsing or compiling the pattern, then an error
- /// is returned.
- pub fn build_sparse(
- &self,
- pattern: &str,
- ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
- self.build_with_size_sparse::<usize>(pattern)
- }
-
- /// Build a regex from the given pattern using a specific representation
- /// for the underlying DFA state IDs.
- ///
- /// If there was a problem parsing or compiling the pattern, then an error
- /// is returned.
- ///
- /// The representation of state IDs is determined by the `S` type
- /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
- /// or `usize`, where `usize` is the default used for `build`. The purpose
- /// of specifying a representation for state IDs is to reduce the memory
- /// footprint of the underlying DFAs.
- ///
- /// When using this routine, the chosen state ID representation will be
- /// used throughout determinization and minimization, if minimization was
- /// requested. Even if the minimized DFAs can fit into the chosen state ID
- /// representation, this will still return an error if the initial
- /// determinized DFA cannot. To get a minimized DFA with a smaller state ID
- /// representation, first build it with a bigger state ID representation,
- /// and then shrink the sizes of the DFAs using one of their conversion
- /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
- /// Finally, reconstitute the regex via
- /// [`Regex::from_dfas`](struct.Regex.html#method.from_dfas).
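- ///
- /// A sketch of that shrinking workflow (using the conversion routines
- /// named above):
- ///
- /// ```
- /// use regex_automata::{Regex, RegexBuilder};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// // Build with the default `usize` state IDs, then shrink to `u16`.
- /// let re = RegexBuilder::new().build("foo[0-9]+")?;
- /// let fwd = re.forward().to_u16()?;
- /// let rev = re.reverse().to_u16()?;
- /// let re = Regex::from_dfas(fwd, rev);
- /// assert_eq!(Some((0, 8)), re.find(b"foo12345"));
- /// # Ok(()) }; example().unwrap()
- /// ```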
- pub fn build_with_size<S: StateID>(
- &self,
- pattern: &str,
- ) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
- let forward = self.dfa.build_with_size(pattern)?;
- let reverse = self
- .dfa
- .clone()
- .anchored(true)
- .reverse(true)
- .longest_match(true)
- .build_with_size(pattern)?;
- Ok(Regex::from_dfas(forward, reverse))
- }
-
- /// Build a regex from the given pattern using a specific representation
- /// for the underlying DFA state IDs using sparse DFAs.
- pub fn build_with_size_sparse<S: StateID>(
- &self,
- pattern: &str,
- ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
- let re = self.build_with_size(pattern)?;
- let fwd = re.forward().to_sparse()?;
- let rev = re.reverse().to_sparse()?;
- Ok(Regex::from_dfas(fwd, rev))
- }
-
- /// Set whether matching must be anchored at the beginning of the input.
- ///
- /// When enabled, a match must begin at the start of the input. When
- /// disabled, the regex will act as if the pattern started with a `.*?`,
- /// which enables a match to appear anywhere.
- ///
- /// By default this is disabled.
- pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.anchored(yes);
- self
- }
-
- /// Enable or disable the case insensitive flag by default.
- ///
- /// By default this is disabled. It may alternatively be selectively
- /// enabled in the regular expression itself via the `i` flag.
- pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.case_insensitive(yes);
- self
- }
-
- /// Enable verbose mode in the regular expression.
- ///
- /// When enabled, verbose mode permits insignificant whitespace in many
- /// places in the regular expression, as well as comments. Comments are
- /// started using `#` and continue until the end of the line.
- ///
- /// By default, this is disabled. It may be selectively enabled in the
- /// regular expression by using the `x` flag regardless of this setting.
- pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.ignore_whitespace(yes);
- self
- }
-
- /// Enable or disable the "dot matches any character" flag by default.
- ///
- /// By default this is disabled. It may alternatively be selectively
- /// enabled in the regular expression itself via the `s` flag.
- pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.dot_matches_new_line(yes);
- self
- }
-
- /// Enable or disable the "swap greed" flag by default.
- ///
- /// By default this is disabled. It may alternatively be selectively
- /// enabled in the regular expression itself via the `U` flag.
- pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.swap_greed(yes);
- self
- }
-
- /// Enable or disable the Unicode flag (`u`) by default.
- ///
- /// By default this is **enabled**. It may alternatively be selectively
- /// disabled in the regular expression itself via the `u` flag.
- ///
- /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
- /// default), a regular expression will fail to parse if Unicode mode is
- /// disabled and a sub-expression could possibly match invalid UTF-8.
- pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.unicode(yes);
- self
- }
-
- /// When enabled, the builder will permit the construction of a regular
- /// expression that may match invalid UTF-8.
- ///
- /// When disabled (the default), the builder is guaranteed to produce a
- /// regex that will only ever match valid UTF-8 (otherwise, the builder
- /// will return an error).
- pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.allow_invalid_utf8(yes);
- self
- }
-
- /// Set the nesting limit used for the regular expression parser.
- ///
- /// The nesting limit controls how deep the abstract syntax tree is allowed
- /// to be. If the AST exceeds the given limit (e.g., with too many nested
- /// groups), then an error is returned by the parser.
- ///
- /// The purpose of this limit is to act as a heuristic to prevent stack
- /// overflow when building a finite automaton from a regular expression's
- /// abstract syntax tree. In particular, construction currently uses
- /// recursion. In the future, the implementation may stop using recursion
- /// and this option will no longer be necessary.
- ///
- /// This limit is not checked until the entire AST is parsed. Therefore,
- /// if callers want to put a limit on the amount of heap space used, then
- /// they should impose a limit on the length, in bytes, of the concrete
- /// pattern string. In particular, this is viable since the parser will
- /// limit itself to heap space proportional to the length of the pattern
- /// string.
- ///
- /// Note that a nest limit of `0` will return a nest limit error for most
- /// patterns but not all. For example, a nest limit of `0` permits `a` but
- /// not `ab`, since `ab` requires a concatenation AST item, which results
- /// in a nest depth of `1`. In general, a nest limit is not something that
- /// manifests in an obvious way in the concrete syntax; therefore, it
- /// should not be used in a granular way.
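- ///
- /// A sketch restating the example above:
- ///
- /// ```
- /// use regex_automata::RegexBuilder;
- ///
- /// // `a` parses with a nest limit of `0`, but `ab` needs a depth of `1`.
- /// assert!(RegexBuilder::new().nest_limit(0).build("a").is_ok());
- /// assert!(RegexBuilder::new().nest_limit(0).build("ab").is_err());
- /// ```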
- pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
- self.dfa.nest_limit(limit);
- self
- }
-
- /// Minimize the underlying DFAs.
- ///
- /// When enabled, the DFAs powering the resulting regex will be minimized
- /// so that they are as small as possible.
- ///
- /// Whether one enables minimization or not depends on the types of costs
- /// you're willing to pay and how much you care about its benefits. In
- /// particular, minimization has worst case `O(n * k * log n)` time and `O(k * n)`
- /// space, where `n` is the number of DFA states and `k` is the alphabet
- /// size. In practice, minimization can be quite costly in terms of both
- /// space and time, so it should only be done if you're willing to wait
- /// longer to produce a DFA. In general, you might want a minimal DFA in
- /// the following circumstances:
- ///
- /// 1. You would like to optimize for the size of the automaton. This can
- /// manifest in one of two ways. Firstly, if you're converting the
- /// DFA into Rust code (or a table embedded in the code), then a minimal
- /// DFA will translate into a corresponding reduction in code size, and
- /// thus, also the final compiled binary size. Secondly, if you are
- /// building many DFAs and putting them on the heap, you'll be able to
- /// fit more if they are smaller. Note though that building a minimal
- /// DFA itself requires additional space; you only realize the space
- /// savings once the minimal DFA is constructed (at which point, the
- /// space used for minimization is freed).
- /// 2. You've observed that a smaller DFA results in faster match
- /// performance. Naively, this isn't guaranteed since there is no
- /// inherent difference between matching with a bigger-than-minimal
- /// DFA and a minimal DFA. However, a smaller DFA may make use of your
- /// CPU's cache more efficiently.
- /// 3. You are trying to establish an equivalence between regular
- /// languages. The standard method for this is to build a minimal DFA
- /// for each language and then compare them. If the DFAs are equivalent
- /// (up to state renaming), then the languages are equivalent.
- ///
- /// This option is disabled by default.
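- ///
- /// A hedged sketch of enabling minimization (matching behavior is
- /// unchanged; only build time and DFA size are affected):
- ///
- /// ```
- /// use regex_automata::RegexBuilder;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let re = RegexBuilder::new().minimize(true).build("foo[0-9]+")?;
- /// assert_eq!(Some((0, 8)), re.find(b"foo12345"));
- /// # Ok(()) }; example().unwrap()
- /// ```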
- pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.minimize(yes);
- self
- }
-
- /// Premultiply state identifiers in the underlying DFA transition tables.
- ///
- /// When enabled, state identifiers are premultiplied to point to their
- /// corresponding row in the DFA's transition table. That is, given the
- /// `i`th state, its corresponding premultiplied identifier is `i * k`
- /// where `k` is the alphabet size of the DFA. (The alphabet size is at
- /// most 256, but is in practice smaller if byte classes are enabled.)
- ///
- /// When state identifiers are not premultiplied, then the identifier of
- /// the `i`th state is `i`.
- ///
- /// The advantage of premultiplying state identifiers is that it saves
- /// a multiplication instruction per byte when searching with the DFA.
- /// This has been observed to lead to a 20% performance benefit in
- /// micro-benchmarks.
- ///
- /// The primary disadvantage of premultiplying state identifiers is
- /// that they require a larger integer size to represent. For example,
- /// if your DFA has 200 states, then its premultiplied form requires
- /// 16 bits to represent every possible state identifier, whereas its
- /// non-premultiplied form only requires 8 bits.
- ///
- /// This option is enabled by default.
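- ///
- /// A toy illustration of the mapping itself (plain arithmetic, not a
- /// crate API):
- ///
- /// ```
- /// let alphabet_len = 40; // `k`: the DFA's alphabet size
- /// let state_index = 3; // `i`: the index of the state
- /// // The premultiplied identifier points directly at row `i` of the table.
- /// assert_eq!(120, state_index * alphabet_len);
- /// ```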
- pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.premultiply(yes);
- self
- }
-
- /// Shrink the size of the underlying DFA alphabet by mapping bytes to
- /// their equivalence classes.
- ///
- /// When enabled, each DFA will use a map from all possible bytes to their
- /// corresponding equivalence class. Each equivalence class represents a
- /// set of bytes that does not discriminate between a match and a non-match
- /// in the DFA. For example, the pattern `[ab]+` has at least two
- /// equivalence classes: a set containing `a` and `b` and a set containing
- /// every byte except for `a` and `b`. `a` and `b` are in the same
- /// equivalence class because they never discriminate between a match
- /// and a non-match.
- ///
- /// The advantage of this map is that the size of the transition table can
- /// be reduced drastically from `#states * 256 * sizeof(id)` to
- /// `#states * k * sizeof(id)` where `k` is the number of equivalence
- /// classes. As a result, total space usage can decrease substantially.
- /// Moreover, since a smaller alphabet is used, compilation becomes faster
- /// as well.
- ///
- /// The disadvantage of this map is that every byte searched must be
- /// passed through this map before it can be used to determine the next
- /// transition. This has a small match time performance cost.
- ///
- /// This option is enabled by default.
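- ///
- /// A hedged sketch comparing transition table sizes with and without
- /// byte classes (via the `dense::Builder` that this builder delegates to):
- ///
- /// ```
- /// use regex_automata::dense;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let smaller = dense::Builder::new().byte_classes(true).build("[a-z]+")?;
- /// let bigger = dense::Builder::new().byte_classes(false).build("[a-z]+")?;
- /// // Fewer alphabet symbols means a smaller transition table.
- /// assert!(smaller.memory_usage() < bigger.memory_usage());
- /// # Ok(()) }; example().unwrap()
- /// ```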
- pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.byte_classes(yes);
- self
- }
-
- /// Apply best effort heuristics to shrink the NFA at the expense of more
- /// time/memory.
- ///
- /// This may be exposed in the future, but for now is exported for use in
- /// the `regex-automata-debug` tool.
- #[doc(hidden)]
- pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
- self.dfa.shrink(yes);
- self
- }
-}
-
-#[cfg(feature = "std")]
-impl Default for RegexBuilder {
- fn default() -> RegexBuilder {
- RegexBuilder::new()
- }
-}
diff --git a/vendor/regex-automata/src/sparse.rs b/vendor/regex-automata/src/sparse.rs
deleted file mode 100644
index d18024b34..000000000
--- a/vendor/regex-automata/src/sparse.rs
+++ /dev/null
@@ -1,1256 +0,0 @@
-#[cfg(feature = "std")]
-use core::fmt;
-#[cfg(feature = "std")]
-use core::iter;
-use core::marker::PhantomData;
-use core::mem::size_of;
-#[cfg(feature = "std")]
-use std::collections::HashMap;
-
-#[cfg(feature = "std")]
-use byteorder::{BigEndian, LittleEndian};
-use byteorder::{ByteOrder, NativeEndian};
-
-use classes::ByteClasses;
-use dense;
-use dfa::DFA;
-#[cfg(feature = "std")]
-use error::{Error, Result};
-#[cfg(feature = "std")]
-use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID};
-#[cfg(not(feature = "std"))]
-use state_id::{dead_id, StateID};
-
-/// A sparse table-based deterministic finite automaton (DFA).
-///
-/// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a
-/// more space-efficient representation for its transition table. As a
-/// result, sparse DFAs can use much less memory than dense DFAs, but this
-/// comes at a price. In particular, reading the more compact transitions
-/// takes more work, so searching with a sparse DFA is typically slower than
-/// with a dense DFA.
-///
-/// A sparse DFA can be built using the default configuration via the
-/// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise,
-/// one can configure various aspects of a dense DFA via
-/// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense
-/// DFA to a sparse DFA using
-/// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse).
-///
-/// In general, a sparse DFA supports all the same operations as a dense DFA.
-///
-/// Making the choice between a dense and sparse DFA depends on your specific
-/// work load. If you can sacrifice a bit of search time performance, then a
-/// sparse DFA might be the best choice. In particular, while sparse DFAs are
-/// probably always slower than dense DFAs, you may find that they are easily
-/// fast enough for your purposes!
-///
-/// # State size
-///
-/// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to
-/// the type of the DFA's transition table while `S` corresponds to the
-/// representation used for the DFA's state identifiers as described by the
-/// [`StateID`](trait.StateID.html) trait. This type parameter is typically
-/// `usize`, but other valid choices provided by this crate include `u8`,
-/// `u16`, `u32` and `u64`. The primary reason for choosing a different state
-/// identifier representation than the default is to reduce the amount of
-/// memory used by a DFA. Note though, that if the chosen representation cannot
-/// accommodate the size of your DFA, then building the DFA will fail and
-/// return an error.
-///
-/// While the reduction in heap memory used by a DFA is one reason for choosing
-/// a smaller state identifier representation, another possible reason is for
-/// decreasing the serialization size of a DFA, as returned by
-/// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian),
-/// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
-/// or
-/// [`to_bytes_native_endian`](enum.SparseDFA.html#method.to_bytes_native_endian).
-///
-/// The type of the transition table is typically either `Vec<u8>` or `&[u8]`,
-/// depending on where the transition table is stored. Note that this is
-/// different than a dense DFA, whose transition table is typically
-/// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads
-/// its transition table from raw bytes because the table is compactly packed.
-///
-/// # Variants
-///
-/// This DFA is defined as a non-exhaustive enumeration of different types of
-/// sparse DFAs. All of the variants use the same internal representation
-/// for the transition table, but they vary in how the transition table is
-/// read. A DFA's specific variant depends on the configuration options set via
-/// [`dense::Builder`](dense/struct.Builder.html). The default variant is
-/// `ByteClass`.
-///
-/// # The `DFA` trait
-///
-/// This type implements the [`DFA`](trait.DFA.html) trait, which means it
-/// can be used for searching. For example:
-///
-/// ```
-/// use regex_automata::{DFA, SparseDFA};
-///
-/// # fn example() -> Result<(), regex_automata::Error> {
-/// let dfa = SparseDFA::new("foo[0-9]+")?;
-/// assert_eq!(Some(8), dfa.find(b"foo12345"));
-/// # Ok(()) }; example().unwrap()
-/// ```
-///
-/// The `DFA` trait also provides an assortment of other lower level methods
-/// for DFAs, such as `start_state` and `next_state`. While these are correctly
-/// implemented, it is an anti-pattern to use them in performance sensitive
-/// code on the `SparseDFA` type directly. Namely, each implementation requires
-/// a branch to determine which type of sparse DFA is being used. Instead,
-/// this branch should be pushed up a layer in the code since walking the
-/// transitions of a DFA is usually a hot path. If you do need to use these
-/// lower level methods in performance critical code, then you should match on
-/// the variants of this DFA and use each variant's implementation of the `DFA`
-/// trait directly.
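-///
-/// A sketch of lifting that branch out of the hot path by matching on the
-/// variant once, up front:
-///
-/// ```
-/// use regex_automata::{DFA, SparseDFA};
-///
-/// # fn example() -> Result<(), regex_automata::Error> {
-/// let dfa = SparseDFA::new("foo[0-9]+")?;
-/// // Do the case analysis once, then search via the variant directly.
-/// let end = match dfa {
-///     SparseDFA::Standard(ref inner) => inner.find(b"foo123"),
-///     SparseDFA::ByteClass(ref inner) => inner.find(b"foo123"),
-///     _ => unreachable!(),
-/// };
-/// assert_eq!(Some(6), end);
-/// # Ok(()) }; example().unwrap()
-/// ```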
-#[derive(Clone, Debug)]
-pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> {
- /// A standard DFA that does not use byte classes.
- Standard(Standard<T, S>),
- /// A DFA that shrinks its alphabet to a set of equivalence classes instead
- /// of using all possible byte values. Any two bytes belong to the same
- /// equivalence class if and only if they can be used interchangeably
- /// anywhere in the DFA while never discriminating between a match and a
- /// non-match.
- ///
- /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much
- /// from using byte classes. In some cases, using byte classes can even
- /// marginally increase the size of a sparse DFA's transition table. The
- /// reason for this is that a sparse DFA already compacts each state's
- /// transitions separately from whether byte classes are used.
- ByteClass(ByteClass<T, S>),
- /// Hints that destructuring should not be exhaustive.
- ///
- /// This enum may grow additional variants, so this makes sure clients
- /// don't count on exhaustive matching. (Otherwise, adding a new variant
- /// could break existing code.)
- #[doc(hidden)]
- __Nonexhaustive,
-}
-
-#[cfg(feature = "std")]
-impl SparseDFA<Vec<u8>, usize> {
- /// Parse the given regular expression using a default configuration and
- /// return the corresponding sparse DFA.
- ///
- /// The default configuration uses `usize` for state IDs and reduces the
- /// alphabet size by splitting bytes into equivalence classes. The
- /// resulting DFA is *not* minimized.
- ///
- /// If you want a non-default configuration, then use the
- /// [`dense::Builder`](dense/struct.Builder.html)
- /// to set your own configuration, and then call
- /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse)
- /// to create a sparse DFA.
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::{DFA, SparseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = SparseDFA::new("foo[0-9]+bar")?;
- /// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> {
- dense::Builder::new()
- .build(pattern)
- .and_then(|dense| dense.to_sparse())
- }
-}
-
-#[cfg(feature = "std")]
-impl<S: StateID> SparseDFA<Vec<u8>, S> {
- /// Create a new empty sparse DFA that never matches any input.
- ///
- /// # Example
- ///
- /// In order to build an empty DFA, callers must provide a type hint
- /// indicating their choice of state identifier representation.
- ///
- /// ```
- /// use regex_automata::{DFA, SparseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty();
- /// assert_eq!(None, dfa.find(b""));
- /// assert_eq!(None, dfa.find(b"foo"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub fn empty() -> SparseDFA<Vec<u8>, S> {
- dense::DenseDFA::empty().to_sparse().unwrap()
- }
-
- pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
- dfa: &dense::Repr<T, S>,
- ) -> Result<SparseDFA<Vec<u8>, A>> {
- Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa())
- }
-}
-
-impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
- /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
- /// DFA returned always uses `&[u8]` for its transition table while keeping
- /// the same state identifier representation.
- pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> {
- match *self {
- SparseDFA::Standard(Standard(ref r)) => {
- SparseDFA::Standard(Standard(r.as_ref()))
- }
- SparseDFA::ByteClass(ByteClass(ref r)) => {
- SparseDFA::ByteClass(ByteClass(r.as_ref()))
- }
- SparseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- /// Return an owned version of this sparse DFA. Specifically, the DFA
- /// returned always uses `Vec<u8>` for its transition table while keeping
- /// the same state identifier representation.
- ///
- /// Effectively, this returns a sparse DFA whose transition table lives
- /// on the heap.
- #[cfg(feature = "std")]
- pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> {
- match *self {
- SparseDFA::Standard(Standard(ref r)) => {
- SparseDFA::Standard(Standard(r.to_owned()))
- }
- SparseDFA::ByteClass(ByteClass(ref r)) => {
- SparseDFA::ByteClass(ByteClass(r.to_owned()))
- }
- SparseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- /// Returns the memory usage, in bytes, of this DFA.
- ///
- /// The memory usage is computed based on the number of bytes used to
- /// represent this DFA's transition table. This typically corresponds to
- /// heap memory usage.
- ///
- /// This does **not** include the stack size used up by this DFA. To
- /// compute that, use `std::mem::size_of::<SparseDFA>()`.
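- ///
- /// A small sketch:
- ///
- /// ```
- /// use regex_automata::SparseDFA;
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = SparseDFA::new("foo[0-9]+")?;
- /// // Bytes used by the compactly packed transition table.
- /// assert!(dfa.memory_usage() > 0);
- /// # Ok(()) }; example().unwrap()
- /// ```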
- pub fn memory_usage(&self) -> usize {
- self.repr().memory_usage()
- }
-
- fn repr(&self) -> &Repr<T, S> {
- match *self {
- SparseDFA::Standard(ref r) => &r.0,
- SparseDFA::ByteClass(ref r) => &r.0,
- SparseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-}
-
-/// Routines for converting a sparse DFA to other representations, such as
-/// smaller state identifiers or raw bytes suitable for persistent storage.
-#[cfg(feature = "std")]
-impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
- /// Create a new sparse DFA whose match semantics are equivalent to
- /// this DFA, but attempt to use `u8` for the representation of state
- /// identifiers. If `u8` is insufficient to represent all state identifiers
- /// in this DFA, then this returns an error.
- ///
- /// This is a convenience routine for `to_sized::<u8>()`.
- pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> {
- self.to_sized()
- }
-
- /// Create a new sparse DFA whose match semantics are equivalent to
- /// this DFA, but attempt to use `u16` for the representation of state
- /// identifiers. If `u16` is insufficient to represent all state
- /// identifiers in this DFA, then this returns an error.
- ///
- /// This is a convenience routine for `to_sized::<u16>()`.
- pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> {
- self.to_sized()
- }
-
- /// Create a new sparse DFA whose match semantics are equivalent to
- /// this DFA, but attempt to use `u32` for the representation of state
- /// identifiers. If `u32` is insufficient to represent all state
- /// identifiers in this DFA, then this returns an error.
- ///
- /// This is a convenience routine for `to_sized::<u32>()`.
- #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
- pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> {
- self.to_sized()
- }
-
- /// Create a new sparse DFA whose match semantics are equivalent to
- /// this DFA, but attempt to use `u64` for the representation of state
- /// identifiers. If `u64` is insufficient to represent all state
- /// identifiers in this DFA, then this returns an error.
- ///
- /// This is a convenience routine for `to_sized::<u64>()`.
- #[cfg(target_pointer_width = "64")]
- pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> {
- self.to_sized()
- }
-
- /// Create a new sparse DFA whose match semantics are equivalent to
- /// this DFA, but attempt to use `A` for the representation of state
- /// identifiers. If `A` is insufficient to represent all state identifiers
- /// in this DFA, then this returns an error.
- ///
- /// An alternative way to construct such a DFA is to use
- /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized).
- /// In general, picking the appropriate size upon initial construction of
- /// a sparse DFA is preferred, since it will do the conversion in one
- /// step instead of two.
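- ///
- /// A brief sketch:
- ///
- /// ```
- /// use regex_automata::{DFA, SparseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let dfa = SparseDFA::new("foo[0-9]+")?;
- /// // Shrink state IDs from the default `usize` down to `u16`.
- /// let small = dfa.to_sized::<u16>()?;
- /// assert_eq!(Some(8), small.find(b"foo12345"));
- /// # Ok(()) }; example().unwrap()
- /// ```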
- pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> {
- self.repr().to_sized().map(|r| r.into_sparse_dfa())
- }
-
- /// Serialize a sparse DFA to raw bytes in little endian format.
- ///
- /// If the state identifier representation of this DFA has a size different
- /// than 1, 2, 4 or 8 bytes, then this returns an error. All
- /// implementations of `StateID` provided by this crate satisfy this
- /// requirement.
- pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
- self.repr().to_bytes::<LittleEndian>()
- }
-
- /// Serialize a sparse DFA to raw bytes in big endian format.
- ///
- /// If the state identifier representation of this DFA has a size different
- /// than 1, 2, 4 or 8 bytes, then this returns an error. All
- /// implementations of `StateID` provided by this crate satisfy this
- /// requirement.
- pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
- self.repr().to_bytes::<BigEndian>()
- }
-
- /// Serialize a sparse DFA to raw bytes in native endian format.
- /// Generally, it is better to pick an explicit endianness using either
- /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is
- /// useful in tests where the DFA is serialized and deserialized on the
- /// same platform.
- ///
- /// If the state identifier representation of this DFA has a size different
- /// than 1, 2, 4 or 8 bytes, then this returns an error. All
- /// implementations of `StateID` provided by this crate satisfy this
- /// requirement.
- pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
- self.repr().to_bytes::<NativeEndian>()
- }
-}
-
-impl<'a, S: StateID> SparseDFA<&'a [u8], S> {
- /// Deserialize a sparse DFA with a specific state identifier
- /// representation.
- ///
- /// Deserializing a DFA using this routine will never allocate heap memory.
- /// This is also guaranteed to be a constant time operation that does not
- /// vary with the size of the DFA.
- ///
- /// The bytes given should be generated by the serialization of a DFA with
- /// either the
- /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian)
- /// method or the
- /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
- /// method, depending on the endianness of the machine you are
- /// deserializing this DFA from.
- ///
- /// If the state identifier representation is `usize`, then deserialization
- /// is dependent on the pointer size. For this reason, it is best to
- /// serialize DFAs using a fixed size representation for your state
- /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
- ///
- /// # Panics
- ///
- /// The bytes given should be *trusted*. In particular, if the bytes
- /// are not a valid serialization of a DFA, or if the endianness of the
- /// serialized bytes is different than the endianness of the machine that
- /// is deserializing the DFA, then this routine will panic. Moreover, it
- /// is possible for this deserialization routine to succeed even if the
- /// given bytes do not represent a valid serialized sparse DFA.
- ///
- /// # Safety
- ///
- /// This routine is unsafe because it permits callers to provide an
- /// arbitrary transition table with possibly incorrect transitions. While
- /// the various serialization routines will never return an incorrect
- /// transition table, there is no guarantee that the bytes provided here
- /// are correct. While deserialization does many checks (as documented
- /// above in the panic conditions), this routine does not check that the
- /// transition table is correct. Given an incorrect transition table, it is
- /// possible for the search routines to access out-of-bounds memory because
- /// of explicit bounds check elision.
- ///
- /// # Example
- ///
- /// This example shows how to serialize a DFA to raw bytes, deserialize it
- /// and then use it for searching. Note that we first convert the DFA to
- /// using `u16` for its state identifier representation before serializing
- /// it. While this isn't strictly necessary, it's good practice in order to
- /// decrease the size of the DFA and to avoid platform specific pitfalls
- /// such as differing pointer sizes.
- ///
- /// ```
- /// use regex_automata::{DFA, DenseDFA, SparseDFA};
- ///
- /// # fn example() -> Result<(), regex_automata::Error> {
- /// let sparse = SparseDFA::new("foo[0-9]+")?;
- /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?;
- ///
- /// let dfa: SparseDFA<&[u8], u16> = unsafe {
- /// SparseDFA::from_bytes(&bytes)
- /// };
- ///
- /// assert_eq!(Some(8), dfa.find(b"foo12345"));
- /// # Ok(()) }; example().unwrap()
- /// ```
- pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> {
- Repr::from_bytes(buf).into_sparse_dfa()
- }
-}
-
-impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> {
- type ID = S;
-
- #[inline]
- fn start_state(&self) -> S {
- self.repr().start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: S) -> bool {
- self.repr().is_match_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: S) -> bool {
- self.repr().is_dead_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: S) -> bool {
- self.repr().is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- self.repr().is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: S, input: u8) -> S {
- match *self {
- SparseDFA::Standard(ref r) => r.next_state(current, input),
- SparseDFA::ByteClass(ref r) => r.next_state(current, input),
- SparseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- #[inline]
- unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
- self.next_state(current, input)
- }
-
- // We specialize the following methods because it lets us lift the
- // case analysis between the different types of sparse DFAs. Instead of
- // doing the case analysis for every transition, we do it once before
- // searching. For sparse DFAs, this doesn't seem to benefit performance as
- // much as it does for dense DFAs, but it's easy to do, so we might as
- // well do it.
-
- #[inline]
- fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
- match *self {
- SparseDFA::Standard(ref r) => r.is_match_at(bytes, start),
- SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
- SparseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- #[inline]
- fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- match *self {
- SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
- SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
- SparseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- #[inline]
- fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- match *self {
- SparseDFA::Standard(ref r) => r.find_at(bytes, start),
- SparseDFA::ByteClass(ref r) => r.find_at(bytes, start),
- SparseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-
- #[inline]
- fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
- match *self {
- SparseDFA::Standard(ref r) => r.rfind_at(bytes, start),
- SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
- SparseDFA::__Nonexhaustive => unreachable!(),
- }
- }
-}
-
-/// A standard sparse DFA that does not use premultiplication or byte classes.
-///
-/// Generally, it isn't necessary to use this type directly, since a
-/// `SparseDFA` can be used for searching directly. One possible reason why
-/// one might want to use this type directly is if you are implementing your
-/// own search routines by walking a DFA's transitions directly. In that case,
-/// you'll want to use this type (or any of the other DFA variant types)
-/// directly, since they implement `next_state` more efficiently.
-#[derive(Clone, Debug)]
-pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
-
-impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> {
- type ID = S;
-
- #[inline]
- fn start_state(&self) -> S {
- self.0.start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: S) -> bool {
- self.0.is_match_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: S) -> bool {
- self.0.is_dead_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: S) -> bool {
- self.0.is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- self.0.is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: S, input: u8) -> S {
- self.0.state(current).next(input)
- }
-
- #[inline]
- unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
- self.next_state(current, input)
- }
-}
-
-/// A sparse DFA that shrinks its alphabet.
-///
-/// Alphabet shrinking is achieved by using a set of equivalence classes
-/// instead of using all possible byte values. Any two bytes belong to the same
-/// equivalence class if and only if they can be used interchangeably anywhere
-/// in the DFA while never discriminating between a match and a non-match.
-///
-/// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from
-/// using byte classes. In some cases, using byte classes can even marginally
-/// increase the size of a sparse DFA's transition table. The reason for this
-/// is that a sparse DFA already compacts each state's transitions separately
-/// from whether byte classes are used.
-///
-/// Generally, it isn't necessary to use this type directly, since a
-/// `SparseDFA` can be used for searching directly. One possible reason why
-/// one might want to use this type directly is if you are implementing your
-/// own search routines by walking a DFA's transitions directly. In that case,
-/// you'll want to use this type (or any of the other DFA variant types)
-/// directly, since they implement `next_state` more efficiently.
-#[derive(Clone, Debug)]
-pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
-
-impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> {
- type ID = S;
-
- #[inline]
- fn start_state(&self) -> S {
- self.0.start_state()
- }
-
- #[inline]
- fn is_match_state(&self, id: S) -> bool {
- self.0.is_match_state(id)
- }
-
- #[inline]
- fn is_dead_state(&self, id: S) -> bool {
- self.0.is_dead_state(id)
- }
-
- #[inline]
- fn is_match_or_dead_state(&self, id: S) -> bool {
- self.0.is_match_or_dead_state(id)
- }
-
- #[inline]
- fn is_anchored(&self) -> bool {
- self.0.is_anchored()
- }
-
- #[inline]
- fn next_state(&self, current: S, input: u8) -> S {
- let input = self.0.byte_classes.get(input);
- self.0.state(current).next(input)
- }
-
- #[inline]
- unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
- self.next_state(current, input)
- }
-}
-
-/// The underlying representation of a sparse DFA. This is shared by all of
-/// the different variants of a sparse DFA.
-#[derive(Clone)]
-#[cfg_attr(not(feature = "std"), derive(Debug))]
-struct Repr<T: AsRef<[u8]>, S: StateID = usize> {
- anchored: bool,
- start: S,
- state_count: usize,
- max_match: S,
- byte_classes: ByteClasses,
- trans: T,
-}
-
-impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> {
- fn into_sparse_dfa(self) -> SparseDFA<T, S> {
- if self.byte_classes.is_singleton() {
- SparseDFA::Standard(Standard(self))
- } else {
- SparseDFA::ByteClass(ByteClass(self))
- }
- }
-
- fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> {
- Repr {
- anchored: self.anchored,
- start: self.start,
- state_count: self.state_count,
- max_match: self.max_match,
- byte_classes: self.byte_classes.clone(),
- trans: self.trans(),
- }
- }
-
- #[cfg(feature = "std")]
- fn to_owned(&self) -> Repr<Vec<u8>, S> {
- Repr {
- anchored: self.anchored,
- start: self.start,
- state_count: self.state_count,
- max_match: self.max_match,
- byte_classes: self.byte_classes.clone(),
- trans: self.trans().to_vec(),
- }
- }
-
- /// Return a convenient representation of the given state.
- ///
- /// This is marked as inline because it doesn't seem to get inlined
- /// otherwise, which leads to a fairly significant performance loss (~25%).
- #[inline]
- fn state<'a>(&'a self, id: S) -> State<'a, S> {
- let mut pos = id.to_usize();
- let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize;
- pos += 2;
- let input_ranges = &self.trans()[pos..pos + (ntrans * 2)];
- pos += 2 * ntrans;
- let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())];
- State { _state_id_repr: PhantomData, ntrans, input_ranges, next }
- }
-
- /// Return an iterator over all of the states in this DFA.
- ///
- /// The iterator returned yields tuples, where the first element is the
- /// state ID and the second element is the state itself.
- #[cfg(feature = "std")]
- fn states<'a>(&'a self) -> StateIter<'a, T, S> {
- StateIter { dfa: self, id: dead_id() }
- }
-
- fn memory_usage(&self) -> usize {
- self.trans().len()
- }
-
- fn start_state(&self) -> S {
- self.start
- }
-
- fn is_match_state(&self, id: S) -> bool {
- self.is_match_or_dead_state(id) && !self.is_dead_state(id)
- }
-
- fn is_dead_state(&self, id: S) -> bool {
- id == dead_id()
- }
-
- fn is_match_or_dead_state(&self, id: S) -> bool {
- id <= self.max_match
- }
-
- fn is_anchored(&self) -> bool {
- self.anchored
- }
-
- fn trans(&self) -> &[u8] {
- self.trans.as_ref()
- }
-
- /// Create a new sparse DFA whose match semantics are equivalent to this
- /// DFA, but attempt to use `A` for the representation of state
- /// identifiers. If `A` is insufficient to represent all state identifiers
- /// in this DFA, then this returns an error.
- #[cfg(feature = "std")]
- fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> {
- // To build the new DFA, we proceed much like the initial construction
- // of the sparse DFA. Namely, since the state ID size is changing,
- // we don't actually know all of our state IDs until we've allocated
- // all necessary space. So we do one pass that allocates all of the
- // storage we need, and then another pass to fill in the transitions.
-
- let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count);
- let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count);
- for (old_id, state) in self.states() {
- let pos = trans.len();
- map.insert(old_id, usize_to_state_id(pos)?);
-
- let n = state.ntrans;
- let zeros = 2 + (n * 2) + (n * size_of::<A>());
- trans.extend(iter::repeat(0).take(zeros));
-
- NativeEndian::write_u16(&mut trans[pos..], n as u16);
- let (s, e) = (pos + 2, pos + 2 + (n * 2));
- trans[s..e].copy_from_slice(state.input_ranges);
- }
-
- let mut new = Repr {
- anchored: self.anchored,
- start: map[&self.start],
- state_count: self.state_count,
- max_match: map[&self.max_match],
- byte_classes: self.byte_classes.clone(),
- trans,
- };
- for (&old_id, &new_id) in map.iter() {
- let old_state = self.state(old_id);
- let mut new_state = new.state_mut(new_id);
- for i in 0..new_state.ntrans {
- let next = map[&old_state.next_at(i)];
- new_state.set_next_at(i, usize_to_state_id(next.to_usize())?);
- }
- }
- new.start = map[&self.start];
- new.max_match = map[&self.max_match];
- Ok(new)
- }
-
- /// Serialize a sparse DFA to raw bytes using the provided endianness.
- ///
- /// If the state identifier representation of this DFA has a size different
- /// than 1, 2, 4 or 8 bytes, then this returns an error. All
- /// implementations of `StateID` provided by this crate satisfy this
- /// requirement.
- ///
- /// Unlike dense DFAs, the result is not necessarily aligned since a
- /// sparse DFA's transition table is always read as a sequence of bytes.
- #[cfg(feature = "std")]
- fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
- let label = b"rust-regex-automata-sparse-dfa\x00";
- let size =
- // For human readable label.
- label.len()
- // endianness check; must be equal to 0xFEFF for native endian
- + 2
- // For version number.
- + 2
- // Size of state ID representation, in bytes.
- // Must be 1, 2, 4 or 8.
- + 2
- // For DFA misc options. (Currently unused.)
- + 2
- // For start state.
- + 8
- // For state count.
- + 8
- // For max match state.
- + 8
- // For byte class map.
- + 256
- // For transition table.
- + self.trans().len();
-
- let mut i = 0;
- let mut buf = vec![0; size];
-
- // write label
- for &b in label {
- buf[i] = b;
- i += 1;
- }
- // endianness check
- A::write_u16(&mut buf[i..], 0xFEFF);
- i += 2;
- // version number
- A::write_u16(&mut buf[i..], 1);
- i += 2;
- // size of state ID
- let state_size = size_of::<S>();
- if ![1, 2, 4, 8].contains(&state_size) {
- return Err(Error::serialize(&format!(
- "state size of {} not supported, must be 1, 2, 4 or 8",
- state_size
- )));
- }
- A::write_u16(&mut buf[i..], state_size as u16);
- i += 2;
- // DFA misc options
- let mut options = 0u16;
- if self.anchored {
- options |= dense::MASK_ANCHORED;
- }
- A::write_u16(&mut buf[i..], options);
- i += 2;
- // start state
- A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
- i += 8;
- // state count
- A::write_u64(&mut buf[i..], self.state_count as u64);
- i += 8;
- // max match state
- A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
- i += 8;
- // byte class map
- for b in (0..256).map(|b| b as u8) {
- buf[i] = self.byte_classes.get(b);
- i += 1;
- }
- // transition table
- for (_, state) in self.states() {
- A::write_u16(&mut buf[i..], state.ntrans as u16);
- i += 2;
- buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges);
- i += state.ntrans * 2;
- for j in 0..state.ntrans {
- write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j));
- i += size_of::<S>();
- }
- }
-
- assert_eq!(size, i, "expected to consume entire buffer");
-
- Ok(buf)
- }
-}
-
-impl<'a, S: StateID> Repr<&'a [u8], S> {
- /// The implementation for deserializing a sparse DFA from raw bytes.
- unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> {
- // skip over label
- match buf.iter().position(|&b| b == b'\x00') {
- None => panic!("could not find label"),
- Some(i) => buf = &buf[i + 1..],
- }
-
- // check that current endianness is same as endianness of DFA
- let endian_check = NativeEndian::read_u16(buf);
- buf = &buf[2..];
- if endian_check != 0xFEFF {
- panic!(
- "endianness mismatch, expected 0xFEFF but got 0x{:X}. \
- are you trying to load a SparseDFA serialized with a \
- different endianness?",
- endian_check,
- );
- }
-
- // check that the version number is supported
- let version = NativeEndian::read_u16(buf);
- buf = &buf[2..];
- if version != 1 {
- panic!(
- "expected version 1, but found unsupported version {}",
- version,
- );
- }
-
- // read size of state
- let state_size = NativeEndian::read_u16(buf) as usize;
- if state_size != size_of::<S>() {
- panic!(
- "state size of SparseDFA ({}) does not match \
- requested state size ({})",
- state_size,
- size_of::<S>(),
- );
- }
- buf = &buf[2..];
-
- // read miscellaneous options
- let opts = NativeEndian::read_u16(buf);
- buf = &buf[2..];
-
- // read start state
- let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
- buf = &buf[8..];
-
- // read state count
- let state_count = NativeEndian::read_u64(buf) as usize;
- buf = &buf[8..];
-
- // read max match state
- let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
- buf = &buf[8..];
-
- // read byte classes
- let byte_classes = ByteClasses::from_slice(&buf[..256]);
- buf = &buf[256..];
-
- Repr {
- anchored: opts & dense::MASK_ANCHORED > 0,
- start,
- state_count,
- max_match,
- byte_classes,
- trans: buf,
- }
- }
-}
-
-#[cfg(feature = "std")]
-impl<S: StateID> Repr<Vec<u8>, S> {
- /// The implementation for constructing a sparse DFA from a dense DFA.
- fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
- dfa: &dense::Repr<T, S>,
- ) -> Result<Repr<Vec<u8>, A>> {
- // In order to build the transition table, we need to be able to write
- // state identifiers for each of the "next" transitions in each state.
- // Our state identifiers correspond to the byte offset in the
- // transition table at which the state is encoded. Therefore, we do not
- // actually know what the state identifiers are until we've allocated
- // exactly as much space as we need for each state. Thus, construction
- // of the transition table happens in two passes.
- //
- // In the first pass, we fill out the shell of each state, which
- // includes the transition count, the input byte ranges and zero-filled
- // space for the transitions. In this first pass, we also build up a
- // map from the state identifier index of the dense DFA to the state
- // identifier in this sparse DFA.
- //
- // In the second pass, we fill in the transitions based on the map
- // built in the first pass.
-
- let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count());
- let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()];
- for (old_id, state) in dfa.states() {
- let pos = trans.len();
-
- remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?;
- // zero-filled space for the transition count
- trans.push(0);
- trans.push(0);
-
- let mut trans_count = 0;
- for (b1, b2, _) in state.sparse_transitions() {
- trans_count += 1;
- trans.push(b1);
- trans.push(b2);
- }
- // fill in the transition count
- NativeEndian::write_u16(&mut trans[pos..], trans_count);
-
- // zero-fill the actual transitions
- let zeros = trans_count as usize * size_of::<A>();
- trans.extend(iter::repeat(0).take(zeros));
- }
-
- let mut new = Repr {
- anchored: dfa.is_anchored(),
- start: remap[dfa.state_id_to_index(dfa.start_state())],
- state_count: dfa.state_count(),
- max_match: remap[dfa.state_id_to_index(dfa.max_match_state())],
- byte_classes: dfa.byte_classes().clone(),
- trans,
- };
- for (old_id, old_state) in dfa.states() {
- let new_id = remap[dfa.state_id_to_index(old_id)];
- let mut new_state = new.state_mut(new_id);
- let sparse = old_state.sparse_transitions();
- for (i, (_, _, next)) in sparse.enumerate() {
- let next = remap[dfa.state_id_to_index(next)];
- new_state.set_next_at(i, next);
- }
- }
- Ok(new)
- }
-
- /// Return a convenient mutable representation of the given state.
- fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> {
- let mut pos = id.to_usize();
- let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize;
- pos += 2;
-
- let size = (ntrans * 2) + (ntrans * size_of::<S>());
- let ranges_and_next = &mut self.trans[pos..pos + size];
- let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2);
- StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next }
- }
-}
-
-#[cfg(feature = "std")]
-impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- fn state_status<T: AsRef<[u8]>, S: StateID>(
- dfa: &Repr<T, S>,
- id: S,
- ) -> &'static str {
- if id == dead_id() {
- if dfa.is_match_state(id) {
- "D*"
- } else {
- "D "
- }
- } else if id == dfa.start_state() {
- if dfa.is_match_state(id) {
- ">*"
- } else {
- "> "
- }
- } else {
- if dfa.is_match_state(id) {
- " *"
- } else {
- " "
- }
- }
- }
-
- writeln!(f, "SparseDFA(")?;
- for (id, state) in self.states() {
- let status = state_status(self, id);
- writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?;
- }
- writeln!(f, ")")?;
- Ok(())
- }
-}
-
-/// An iterator over all states in a sparse DFA.
-///
-/// This iterator yields tuples, where the first element is the state ID and
-/// the second element is the state itself.
-#[cfg(feature = "std")]
-#[derive(Debug)]
-struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> {
- dfa: &'a Repr<T, S>,
- id: S,
-}
-
-#[cfg(feature = "std")]
-impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> {
- type Item = (S, State<'a, S>);
-
- fn next(&mut self) -> Option<(S, State<'a, S>)> {
- if self.id.to_usize() >= self.dfa.trans().len() {
- return None;
- }
- let id = self.id;
- let state = self.dfa.state(id);
- self.id = S::from_usize(self.id.to_usize() + state.bytes());
- Some((id, state))
- }
-}
-
-/// A representation of a sparse DFA state that can be cheaply materialized
-/// from a state identifier.
-#[derive(Clone)]
-struct State<'a, S: StateID = usize> {
- /// The state identifier representation used by the DFA from which this
- /// state was extracted. Since our transition table is compacted in a
- /// &[u8], we don't actually use the state ID type parameter explicitly
- /// anywhere, so we fake it. This prevents callers from using an incorrect
- /// state ID representation to read from this state.
- _state_id_repr: PhantomData<S>,
- /// The number of transitions in this state.
- ntrans: usize,
- /// Pairs of input ranges, where there is one pair for each transition.
- /// Each pair specifies an inclusive start and end byte range for the
- /// corresponding transition.
- input_ranges: &'a [u8],
- /// Transitions to the next state. This slice contains native endian
- /// encoded state identifiers, with `S` as the representation. Thus, there
- /// are `ntrans * size_of::<S>()` bytes in this slice.
- next: &'a [u8],
-}
-
-impl<'a, S: StateID> State<'a, S> {
- /// Searches for the next transition given an input byte. If no such
- /// transition could be found, then a dead state is returned.
- fn next(&self, input: u8) -> S {
- // This straight linear search was observed to be much better than
- // binary search on ASCII haystacks, likely because a binary search
- // visits the ASCII case last but a linear search sees it first. A
- // binary search does do a little better on non-ASCII haystacks, but
- // not by much. There might be a better trade-off lurking here.
- for i in 0..self.ntrans {
- let (start, end) = self.range(i);
- if start <= input && input <= end {
- return self.next_at(i);
- }
- // We could bail early with an extra branch: if input < b1, then
- // we know we'll never find a matching transition. Interestingly,
- // this extra branch seems to not help performance, or will even
- // hurt it. It's likely very dependent on the DFA itself and what
- // is being searched.
- }
- dead_id()
- }
-
- /// Returns the inclusive input byte range for the ith transition in this
- /// state.
- fn range(&self, i: usize) -> (u8, u8) {
- (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
- }
-
- /// Returns the next state for the ith transition in this state.
- fn next_at(&self, i: usize) -> S {
- S::read_bytes(&self.next[i * size_of::<S>()..])
- }
-
- /// Return the total number of bytes that this state consumes in its
- /// encoded form.
- #[cfg(feature = "std")]
- fn bytes(&self) -> usize {
- 2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>())
- }
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> fmt::Debug for State<'a, S> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let mut transitions = vec![];
- for i in 0..self.ntrans {
- let next = self.next_at(i);
- if next == dead_id() {
- continue;
- }
-
- let (start, end) = self.range(i);
- if start == end {
- transitions.push(format!(
- "{} => {}",
- escape(start),
- next.to_usize()
- ));
- } else {
- transitions.push(format!(
- "{}-{} => {}",
- escape(start),
- escape(end),
- next.to_usize(),
- ));
- }
- }
- write!(f, "{}", transitions.join(", "))
- }
-}
-
-/// A representation of a mutable sparse DFA state that can be cheaply
-/// materialized from a state identifier.
-#[cfg(feature = "std")]
-struct StateMut<'a, S: StateID = usize> {
- /// The state identifier representation used by the DFA from which this
- /// state was extracted. Since our transition table is compacted in a
- /// &[u8], we don't actually use the state ID type parameter explicitly
- /// anywhere, so we fake it. This prevents callers from using an incorrect
- /// state ID representation to read from this state.
- _state_id_repr: PhantomData<S>,
- /// The number of transitions in this state.
- ntrans: usize,
- /// Pairs of input ranges, where there is one pair for each transition.
- /// Each pair specifies an inclusive start and end byte range for the
- /// corresponding transition.
- input_ranges: &'a mut [u8],
- /// Transitions to the next state. This slice contains native endian
- /// encoded state identifiers, with `S` as the representation. Thus, there
- /// are `ntrans * size_of::<S>()` bytes in this slice.
- next: &'a mut [u8],
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> StateMut<'a, S> {
- /// Sets the ith transition to the given state.
- fn set_next_at(&mut self, i: usize, next: S) {
- next.write_bytes(&mut self.next[i * size_of::<S>()..]);
- }
-}
-
-#[cfg(feature = "std")]
-impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- let state = State {
- _state_id_repr: self._state_id_repr,
- ntrans: self.ntrans,
- input_ranges: self.input_ranges,
- next: self.next,
- };
- fmt::Debug::fmt(&state, f)
- }
-}
-
-/// Return the given byte as its escaped string form.
-#[cfg(feature = "std")]
-fn escape(b: u8) -> String {
- use std::ascii;
-
- String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
-}
-
-/// A binary search routine specialized specifically to a sparse DFA state's
-/// transitions. Specifically, the transitions are defined as a set of pairs
-/// of input bytes that delineate an inclusive range of bytes. If the input
-/// byte is in the range, then the corresponding transition is a match.
-///
-/// This binary search accepts a slice of these pairs and returns the position
-/// of the matching pair (the ith transition), or None if no matching pair
-/// could be found.
-///
-/// Note that this routine is not currently used since it was observed to
-/// either decrease performance when searching ASCII, or did not provide enough
-/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
-/// for posterity in case we can find a way to use it.
-///
-/// In theory, we could use the standard library's search routine if we could
-/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
-/// guaranteed to be safe and is thus UB (since I don't think the in-memory
-/// representation of `(u8, u8)` has been nailed down).
-#[inline(always)]
-#[allow(dead_code)]
-fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
- debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
- debug_assert!(ranges.len() <= 512, "ranges should be short");
-
- let (mut left, mut right) = (0, ranges.len() / 2);
- while left < right {
- let mid = (left + right) / 2;
- let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
- if needle < b1 {
- right = mid;
- } else if needle > b2 {
- left = mid + 1;
- } else {
- return Some(mid);
- }
- }
- None
-}
diff --git a/vendor/regex-automata/src/sparse_set.rs b/vendor/regex-automata/src/sparse_set.rs
deleted file mode 100644
index 56743b033..000000000
--- a/vendor/regex-automata/src/sparse_set.rs
+++ /dev/null
@@ -1,60 +0,0 @@
-use std::slice;
-
-/// A sparse set used for representing ordered NFA states.
-///
-/// This supports constant time addition and membership testing. Clearing an
-/// entire set can also be done in constant time. Iteration yields elements
-/// in the order in which they were inserted.
-///
-/// The data structure is based on: https://research.swtch.com/sparse
-/// Note though that we don't actually use uninitialized memory. We generally
-/// reuse sparse sets, so the initial allocation cost is bearable. However, its
-/// other properties listed above are extremely useful.
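-///
-/// A sketch of intended usage (this type is internal to the crate, so the
-/// example is not run as a doc test):
-///
-/// ```ignore
-/// let mut set = SparseSet::new(10);
-/// set.insert(3);
-/// set.insert(7);
-/// // Constant-time membership tests; iteration follows insertion order.
-/// assert!(set.contains(3) && set.contains(7));
-/// assert_eq!(2, set.len());
-/// set.clear();
-/// assert_eq!(0, set.len());
-/// ```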
-#[derive(Clone, Debug)]
-pub struct SparseSet {
- /// Dense contains the instruction pointers in the order in which they
- /// were inserted.
- dense: Vec<usize>,
- /// Sparse maps instruction pointers to their location in dense.
- ///
- /// An instruction pointer is in the set if and only if
- /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
- sparse: Box<[usize]>,
-}
-
-impl SparseSet {
- pub fn new(size: usize) -> SparseSet {
- SparseSet {
- dense: Vec::with_capacity(size),
- sparse: vec![0; size].into_boxed_slice(),
- }
- }
-
- pub fn len(&self) -> usize {
- self.dense.len()
- }
-
- pub fn insert(&mut self, value: usize) {
- let i = self.len();
- assert!(i < self.dense.capacity());
- self.dense.push(value);
- self.sparse[value] = i;
- }
-
- pub fn contains(&self, value: usize) -> bool {
- let i = self.sparse[value];
- self.dense.get(i) == Some(&value)
- }
-
- pub fn clear(&mut self) {
- self.dense.clear();
- }
-}
-
-impl<'a> IntoIterator for &'a SparseSet {
- type Item = &'a usize;
- type IntoIter = slice::Iter<'a, usize>;
- fn into_iter(self) -> Self::IntoIter {
- self.dense.iter()
- }
-}
diff --git a/vendor/regex-automata/src/state_id.rs b/vendor/regex-automata/src/state_id.rs
deleted file mode 100644
index c9bac1941..000000000
--- a/vendor/regex-automata/src/state_id.rs
+++ /dev/null
@@ -1,291 +0,0 @@
-use core::fmt::Debug;
-use core::hash::Hash;
-use core::mem::size_of;
-
-use byteorder::{ByteOrder, NativeEndian};
-
-#[cfg(feature = "std")]
-pub use self::std::*;
-
-#[cfg(feature = "std")]
-mod std {
- use byteorder::ByteOrder;
- use core::mem::size_of;
- use error::{Error, Result};
-
- use super::StateID;
-
- /// Check that the premultiplication of the given state identifier can
- /// fit into the representation indicated by `S`. If it cannot, or if it
- /// overflows `usize` itself, then an error is returned.
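- ///
- /// For example, premultiplying a last state of 300 by an alphabet
- /// length of 256 yields 76800, which exceeds the maximum `u16`
- /// identifier (65535), so `S = u16` would be rejected here.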
- pub fn premultiply_overflow_error<S: StateID>(
- last_state: S,
- alphabet_len: usize,
- ) -> Result<()> {
- let requested = match last_state.to_usize().checked_mul(alphabet_len) {
- Some(requested) => requested,
- None => return Err(Error::premultiply_overflow(0, 0)),
- };
- if requested > S::max_id() {
- return Err(Error::premultiply_overflow(S::max_id(), requested));
- }
- Ok(())
- }
-
- /// Allocate the next sequential identifier for a fresh state given
- /// the previously constructed state identified by `current`. If the
- /// next sequential identifier would overflow `usize` or the chosen
- /// representation indicated by `S`, then an error is returned.
- pub fn next_state_id<S: StateID>(current: S) -> Result<S> {
- let next = match current.to_usize().checked_add(1) {
- Some(next) => next,
- None => return Err(Error::state_id_overflow(::std::usize::MAX)),
- };
- if next > S::max_id() {
- return Err(Error::state_id_overflow(S::max_id()));
- }
- Ok(S::from_usize(next))
- }
-
- /// Convert the given `usize` to the chosen state identifier
- /// representation. If the given value cannot fit in the chosen
- /// representation, then an error is returned.
- pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
- if value > S::max_id() {
- Err(Error::state_id_overflow(S::max_id()))
- } else {
- Ok(S::from_usize(value))
- }
- }
-
- /// Write the given identifier to the given slice of bytes using the
- /// specified endianness. The given slice must have length at least
- /// `size_of::<S>()`.
- ///
- /// The given state identifier representation must have size 1, 2, 4 or 8.
- pub fn write_state_id_bytes<E: ByteOrder, S: StateID>(
- slice: &mut [u8],
- id: S,
- ) {
- assert!(
- 1 == size_of::<S>()
- || 2 == size_of::<S>()
- || 4 == size_of::<S>()
- || 8 == size_of::<S>()
- );
-
- match size_of::<S>() {
- 1 => slice[0] = id.to_usize() as u8,
- 2 => E::write_u16(slice, id.to_usize() as u16),
- 4 => E::write_u32(slice, id.to_usize() as u32),
- 8 => E::write_u64(slice, id.to_usize() as u64),
- _ => unreachable!(),
- }
- }
-}
-
-/// Return the unique identifier for a DFA's dead state in the chosen
-/// representation indicated by `S`.
-pub fn dead_id<S: StateID>() -> S {
- S::from_usize(0)
-}
-
-/// A trait describing the representation of a DFA's state identifier.
-///
-/// The purpose of this trait is to safely express both the possible state
-/// identifier representations that can be used in a DFA and to convert between
-/// state identifier representations and types that can be used to efficiently
-/// index memory (such as `usize`).
-///
-/// In general, one should not need to implement this trait explicitly. In
-/// particular, this crate provides implementations for `u8`, `u16`, `u32`,
-/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can
-/// represent all corresponding values in a `usize`.)
-///
-/// # Safety
-///
-/// This trait is unsafe because the correctness of its implementations may be
-/// relied upon by other unsafe code. For example, one possible way to
-/// implement this trait incorrectly would be to return a maximum identifier
-/// in `max_id` that is greater than the real maximum identifier. This will
-/// likely result in wrap-on-overflow semantics in release mode, which can in
-/// turn produce incorrect state identifiers. Those state identifiers may then
-/// in turn access out-of-bounds memory in a DFA's search routine, where bounds
-/// checks are explicitly elided for performance reasons.
-pub unsafe trait StateID:
- Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord
-{
- /// Convert from a `usize` to this implementation's representation.
- ///
- /// Implementors may assume that `n <= Self::max_id`. That is, implementors
- /// do not need to check whether `n` can fit inside this implementation's
- /// representation.
- fn from_usize(n: usize) -> Self;
-
- /// Convert this implementation's representation to a `usize`.
- ///
- /// Implementors must not return a `usize` value greater than
- /// `Self::max_id` and must not permit overflow when converting between the
- /// implementor's representation and `usize`. In general, the preferred
- /// way for implementors to achieve this is to simply not provide
- /// implementations of `StateID` that cannot fit into the target platform's
- /// `usize`.
- fn to_usize(self) -> usize;
-
- /// Return the maximum state identifier supported by this representation.
- ///
- /// Implementors must return a correct bound. Doing otherwise may result
- /// in memory unsafety.
- fn max_id() -> usize;
-
- /// Read a single state identifier from the given slice of bytes in native
- /// endian format.
- ///
- /// Implementors may assume that the given slice has length at least
- /// `size_of::<Self>()`.
- fn read_bytes(slice: &[u8]) -> Self;
-
- /// Write this state identifier to the given slice of bytes in native
- /// endian format.
- ///
- /// Implementors may assume that the given slice has length at least
- /// `size_of::<Self>()`.
- fn write_bytes(self, slice: &mut [u8]);
-}
-
-unsafe impl StateID for usize {
- #[inline]
- fn from_usize(n: usize) -> usize {
- n
- }
-
- #[inline]
- fn to_usize(self) -> usize {
- self
- }
-
- #[inline]
- fn max_id() -> usize {
- ::core::usize::MAX
- }
-
- #[inline]
- fn read_bytes(slice: &[u8]) -> Self {
- NativeEndian::read_uint(slice, size_of::<usize>()) as usize
- }
-
- #[inline]
- fn write_bytes(self, slice: &mut [u8]) {
- NativeEndian::write_uint(slice, self as u64, size_of::<usize>())
- }
-}
-
-unsafe impl StateID for u8 {
- #[inline]
- fn from_usize(n: usize) -> u8 {
- n as u8
- }
-
- #[inline]
- fn to_usize(self) -> usize {
- self as usize
- }
-
- #[inline]
- fn max_id() -> usize {
- ::core::u8::MAX as usize
- }
-
- #[inline]
- fn read_bytes(slice: &[u8]) -> Self {
- slice[0]
- }
-
- #[inline]
- fn write_bytes(self, slice: &mut [u8]) {
- slice[0] = self;
- }
-}
-
-unsafe impl StateID for u16 {
- #[inline]
- fn from_usize(n: usize) -> u16 {
- n as u16
- }
-
- #[inline]
- fn to_usize(self) -> usize {
- self as usize
- }
-
- #[inline]
- fn max_id() -> usize {
- ::core::u16::MAX as usize
- }
-
- #[inline]
- fn read_bytes(slice: &[u8]) -> Self {
- NativeEndian::read_u16(slice)
- }
-
- #[inline]
- fn write_bytes(self, slice: &mut [u8]) {
- NativeEndian::write_u16(slice, self)
- }
-}
-
-#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
-unsafe impl StateID for u32 {
- #[inline]
- fn from_usize(n: usize) -> u32 {
- n as u32
- }
-
- #[inline]
- fn to_usize(self) -> usize {
- self as usize
- }
-
- #[inline]
- fn max_id() -> usize {
- ::core::u32::MAX as usize
- }
-
- #[inline]
- fn read_bytes(slice: &[u8]) -> Self {
- NativeEndian::read_u32(slice)
- }
-
- #[inline]
- fn write_bytes(self, slice: &mut [u8]) {
- NativeEndian::write_u32(slice, self)
- }
-}
-
-#[cfg(target_pointer_width = "64")]
-unsafe impl StateID for u64 {
- #[inline]
- fn from_usize(n: usize) -> u64 {
- n as u64
- }
-
- #[inline]
- fn to_usize(self) -> usize {
- self as usize
- }
-
- #[inline]
- fn max_id() -> usize {
- ::core::u64::MAX as usize
- }
-
- #[inline]
- fn read_bytes(slice: &[u8]) -> Self {
- NativeEndian::read_u64(slice)
- }
-
- #[inline]
- fn write_bytes(self, slice: &mut [u8]) {
- NativeEndian::write_u64(slice, self)
- }
-}
diff --git a/vendor/regex-automata/src/transducer.rs b/vendor/regex-automata/src/transducer.rs
deleted file mode 100644
index 679c75726..000000000
--- a/vendor/regex-automata/src/transducer.rs
+++ /dev/null
@@ -1,107 +0,0 @@
-use fst::Automaton;
-
-use crate::{StateID, DFA};
-
-macro_rules! imp {
- ($ty:ty, $id:ty) => {
- impl<T: AsRef<[$id]>, S: StateID> Automaton for $ty {
- type State = S;
-
- #[inline]
- fn start(&self) -> S {
- self.start_state()
- }
-
- #[inline]
- fn is_match(&self, state: &S) -> bool {
- self.is_match_state(*state)
- }
-
- #[inline]
- fn accept(&self, state: &S, byte: u8) -> S {
- self.next_state(*state, byte)
- }
-
- #[inline]
- fn can_match(&self, state: &S) -> bool {
- !self.is_dead_state(*state)
- }
- }
- };
-}
-
-imp!(crate::dense::DenseDFA<T, S>, S);
-imp!(crate::dense::Standard<T, S>, S);
-imp!(crate::dense::ByteClass<T, S>, S);
-imp!(crate::dense::Premultiplied<T, S>, S);
-imp!(crate::dense::PremultipliedByteClass<T, S>, S);
-imp!(crate::sparse::SparseDFA<T, S>, u8);
-imp!(crate::sparse::Standard<T, S>, u8);
-imp!(crate::sparse::ByteClass<T, S>, u8);
-
-#[cfg(test)]
-mod tests {
- use bstr::BString;
- use fst::{Automaton, IntoStreamer, Set, Streamer};
-
- use crate::dense::{self, DenseDFA};
- use crate::sparse::SparseDFA;
-
- fn search<A: Automaton, D: AsRef<[u8]>>(
- set: &Set<D>,
- aut: A,
- ) -> Vec<BString> {
- let mut stream = set.search(aut).into_stream();
-
- let mut results = vec![];
- while let Some(key) = stream.next() {
- results.push(BString::from(key));
- }
- results
- }
-
- #[test]
- fn dense_anywhere() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = DenseDFA::new("ba.*").unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
- }
-
- #[test]
- fn dense_anchored() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = dense::Builder::new().anchored(true).build("ba.*").unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz"]);
- }
-
- #[test]
- fn sparse_anywhere() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = SparseDFA::new("ba.*").unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
- }
-
- #[test]
- fn sparse_anchored() {
- let set =
- Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
- .unwrap();
- let dfa = dense::Builder::new()
- .anchored(true)
- .build("ba.*")
- .unwrap()
- .to_sparse()
- .unwrap();
- let got = search(&set, &dfa);
- assert_eq!(got, vec!["bar", "baz"]);
- }
-}
diff --git a/vendor/regex-automata/src/util/alphabet.rs b/vendor/regex-automata/src/util/alphabet.rs
new file mode 100644
index 000000000..0bc1ece58
--- /dev/null
+++ b/vendor/regex-automata/src/util/alphabet.rs
@@ -0,0 +1,790 @@
+use core::convert::TryFrom;
+
+use crate::util::{
+ bytes::{DeserializeError, SerializeError},
+ DebugByte,
+};
+
+/// Unit represents a single unit of input for DFA-based regex engines.
+///
+/// **NOTE:** It is not expected for consumers of this crate to need to use
+/// this type unless they are implementing their own DFA. And even then, it's
+/// not required: implementors may use other techniques to handle input.
+///
+/// Typically, a single unit of input for a DFA would be a single byte.
+/// However, for the DFAs in this crate, matches are delayed by a single byte
+/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once
+/// we have consumed the haystack, we must run the DFA through one additional
+/// transition using an input that indicates the haystack has ended.
+///
+/// Since all 256 possible values of a `u8` *may* be valid inputs to a DFA,
+/// there is no way to represent a sentinel with a `u8` alone. Thus, this
+/// type explicitly adds
+/// room for a sentinel value.
+///
+/// The sentinel EOI value is always its own equivalence class and is
+/// ultimately represented by adding 1 to the maximum equivalence class value.
+/// So for example, the regex `^[a-z]+$` might be split into the following
+/// equivalence classes:
+///
+/// ```text
+/// 0 => [\x00-`]
+/// 1 => [a-z]
+/// 2 => [{-\xFF]
+/// 3 => [EOI]
+/// ```
+///
+/// Where EOI is the special sentinel value that is always in its own
+/// singleton equivalence class.
+#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+pub enum Unit {
+ U8(u8),
+ EOI(u16),
+}
+
+impl Unit {
+ /// Create a new input unit from a byte value.
+ ///
+ /// All possible byte values are legal. However, when creating an input
+ /// unit for a specific DFA, one should be careful to only construct input
+ /// units that are in that DFA's alphabet. Namely, one way to compact a
+    /// DFA's in-memory representation is to collapse its transitions from
+    /// the set of all possible byte values into a smaller set of
+    /// equivalence classes. If a
+ /// DFA uses equivalence classes instead of byte values, then the byte
+ /// given here should be the equivalence class.
+ pub fn u8(byte: u8) -> Unit {
+ Unit::U8(byte)
+ }
+
+ pub fn eoi(num_byte_equiv_classes: usize) -> Unit {
+ assert!(
+ num_byte_equiv_classes <= 256,
+ "max number of byte-based equivalent classes is 256, but got {}",
+ num_byte_equiv_classes,
+ );
+ Unit::EOI(u16::try_from(num_byte_equiv_classes).unwrap())
+ }
+
+ pub fn as_u8(self) -> Option<u8> {
+ match self {
+ Unit::U8(b) => Some(b),
+ Unit::EOI(_) => None,
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ pub fn as_eoi(self) -> Option<usize> {
+ match self {
+ Unit::U8(_) => None,
+ Unit::EOI(eoi) => Some(eoi as usize),
+ }
+ }
+
+ pub fn as_usize(self) -> usize {
+ match self {
+ Unit::U8(b) => b as usize,
+ Unit::EOI(eoi) => eoi as usize,
+ }
+ }
+
+ pub fn is_eoi(&self) -> bool {
+ match *self {
+ Unit::EOI(_) => true,
+ _ => false,
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ pub fn is_word_byte(&self) -> bool {
+ self.as_u8().map_or(false, crate::util::is_word_byte)
+ }
+}
+
+impl core::fmt::Debug for Unit {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ match *self {
+ Unit::U8(b) => write!(f, "{:?}", DebugByte(b)),
+ Unit::EOI(_) => write!(f, "EOI"),
+ }
+ }
+}
+
+/// A representation of byte-oriented equivalence classes.
+///
+/// This is used in a DFA to reduce the size of the transition table. This can
+/// have a particularly large impact not only on the total size of a dense DFA,
+/// but also on compile times.
+#[derive(Clone, Copy)]
+pub struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+ /// Creates a new set of equivalence classes where all bytes are mapped to
+ /// the same class.
+ pub fn empty() -> ByteClasses {
+ ByteClasses([0; 256])
+ }
+
+ /// Creates a new set of equivalence classes where each byte belongs to
+ /// its own equivalence class.
+ #[cfg(feature = "alloc")]
+ pub fn singletons() -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ for i in 0..256 {
+ classes.set(i as u8, i as u8);
+ }
+ classes
+ }
+
+ /// Deserializes a byte class map from the given slice. If the slice is of
+ /// insufficient length or otherwise contains an impossible mapping, then
+ /// an error is returned. Upon success, the number of bytes read along with
+ /// the map are returned. The number of bytes read is always a multiple of
+ /// 8.
+ pub fn from_bytes(
+ slice: &[u8],
+ ) -> Result<(ByteClasses, usize), DeserializeError> {
+ if slice.len() < 256 {
+ return Err(DeserializeError::buffer_too_small("byte class map"));
+ }
+ let mut classes = ByteClasses::empty();
+ for (b, &class) in slice[..256].iter().enumerate() {
+ classes.set(b as u8, class);
+ }
+ for b in classes.iter() {
+ if b.as_usize() >= classes.alphabet_len() {
+ return Err(DeserializeError::generic(
+ "found equivalence class greater than alphabet len",
+ ));
+ }
+ }
+ Ok((classes, 256))
+ }
+
+    /// Writes this byte class map to the given byte buffer. If the given
+ /// buffer is too small, then an error is returned. Upon success, the total
+ /// number of bytes written is returned. The number of bytes written is
+ /// guaranteed to be a multiple of 8.
+ pub fn write_to(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("byte class map"));
+ }
+ for b in 0..=255 {
+ dst[0] = self.get(b);
+ dst = &mut dst[1..];
+ }
+ Ok(nwrite)
+ }
+
+ /// Returns the total number of bytes written by `write_to`.
+ pub fn write_to_len(&self) -> usize {
+ 256
+ }
+
+ /// Set the equivalence class for the given byte.
+ #[inline]
+ pub fn set(&mut self, byte: u8, class: u8) {
+ self.0[byte as usize] = class;
+ }
+
+ /// Get the equivalence class for the given byte.
+ #[inline]
+ pub fn get(&self, byte: u8) -> u8 {
+ self.0[byte as usize]
+ }
+
+ /// Get the equivalence class for the given byte while forcefully
+ /// eliding bounds checks.
+ #[inline]
+ pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
+ *self.0.get_unchecked(byte as usize)
+ }
+
+ /// Get the equivalence class for the given input unit and return the
+ /// class as a `usize`.
+ #[inline]
+ pub fn get_by_unit(&self, unit: Unit) -> usize {
+ match unit {
+ Unit::U8(b) => usize::try_from(self.get(b)).unwrap(),
+ Unit::EOI(b) => usize::try_from(b).unwrap(),
+ }
+ }
+
+ #[inline]
+ pub fn eoi(&self) -> Unit {
+ Unit::eoi(self.alphabet_len().checked_sub(1).unwrap())
+ }
+
+ /// Return the total number of elements in the alphabet represented by
+ /// these equivalence classes. Equivalently, this returns the total number
+ /// of equivalence classes.
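+    ///
+    /// For example, if the largest byte class value is 2, then there are
+    /// three byte-based classes plus the implicit EOI class, so this returns
+    /// 4.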
+ #[inline]
+ pub fn alphabet_len(&self) -> usize {
+        // Add one since the number of equivalence classes is one more than
+        // the largest class value. Then add another for the final EOI class
+ // that isn't explicitly represented.
+ self.0[255] as usize + 1 + 1
+ }
+
+ /// Returns the stride, as a base-2 exponent, required for these
+ /// equivalence classes.
+ ///
+ /// The stride is always the smallest power of 2 that is greater than or
+ /// equal to the alphabet length. This is done so that converting between
+ /// state IDs and indices can be done with shifts alone, which is much
+ /// faster than integer division.
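+    ///
+    /// For example, an alphabet length of 6 yields a stride of 8 and a
+    /// `stride2` of 3, so the row for state index `i` in a dense transition
+    /// table can be found with `i << 3` rather than `i * 6`. (This is a
+    /// sketch of the arithmetic; the actual table layout lives in the dense
+    /// DFA implementation.)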
+ #[cfg(feature = "alloc")]
+ pub fn stride2(&self) -> usize {
+ self.alphabet_len().next_power_of_two().trailing_zeros() as usize
+ }
+
+    /// Returns true if and only if every byte in this map maps to its own
+ /// equivalence class. Equivalently, there are 257 equivalence classes
+ /// and each class contains exactly one byte (plus the special EOI class).
+ #[inline]
+ pub fn is_singleton(&self) -> bool {
+ self.alphabet_len() == 257
+ }
+
+ /// Returns an iterator over all equivalence classes in this set.
+ pub fn iter(&self) -> ByteClassIter<'_> {
+ ByteClassIter { classes: self, i: 0 }
+ }
+
+ /// Returns an iterator over a sequence of representative bytes from each
+ /// equivalence class. Namely, this yields exactly N items, where N is
+    /// equal to the number of equivalence classes. Each item is an
+ /// arbitrary byte drawn from each equivalence class.
+ ///
+ /// This is useful when one is determinizing an NFA and the NFA's alphabet
+ /// hasn't been converted to equivalence classes yet. Picking an arbitrary
+ /// byte from each equivalence class then permits a full exploration of
+ /// the NFA instead of using every possible byte value.
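+    ///
+    /// For example, with the equivalence classes shown in the `Unit`
+    /// documentation (`\x00-\x60`, `a-z` and `{-\xFF`), this yields the
+    /// first byte of each class (`\x00`, `a` and `{`), followed by the
+    /// special EOI unit.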
+ #[cfg(feature = "alloc")]
+ pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
+ ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
+ }
+
+ /// Returns an iterator of the bytes in the given equivalence class.
+ pub fn elements(&self, class: Unit) -> ByteClassElements {
+ ByteClassElements { classes: self, class, byte: 0 }
+ }
+
+ /// Returns an iterator of byte ranges in the given equivalence class.
+ ///
+ /// That is, a sequence of contiguous ranges are returned. Typically, every
+ /// class maps to a single contiguous range.
+ fn element_ranges(&self, class: Unit) -> ByteClassElementRanges {
+ ByteClassElementRanges { elements: self.elements(class), range: None }
+ }
+}
+
+impl core::fmt::Debug for ByteClasses {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ if self.is_singleton() {
+ write!(f, "ByteClasses({{singletons}})")
+ } else {
+ write!(f, "ByteClasses(")?;
+ for (i, class) in self.iter().enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?} => [", class.as_usize())?;
+ for (start, end) in self.element_ranges(class) {
+ if start == end {
+ write!(f, "{:?}", start)?;
+ } else {
+ write!(f, "{:?}-{:?}", start, end)?;
+ }
+ }
+ write!(f, "]")?;
+ }
+ write!(f, ")")
+ }
+ }
+}
+
+/// An iterator over each equivalence class.
+#[derive(Debug)]
+pub struct ByteClassIter<'a> {
+ classes: &'a ByteClasses,
+ i: usize,
+}
+
+impl<'a> Iterator for ByteClassIter<'a> {
+ type Item = Unit;
+
+ fn next(&mut self) -> Option<Unit> {
+ if self.i + 1 == self.classes.alphabet_len() {
+ self.i += 1;
+ Some(self.classes.eoi())
+ } else if self.i < self.classes.alphabet_len() {
+ let class = self.i as u8;
+ self.i += 1;
+ Some(Unit::u8(class))
+ } else {
+ None
+ }
+ }
+}
+
+/// An iterator over representative bytes from each equivalence class.
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+pub struct ByteClassRepresentatives<'a> {
+ classes: &'a ByteClasses,
+ byte: usize,
+ last_class: Option<u8>,
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for ByteClassRepresentatives<'a> {
+ type Item = Unit;
+
+ fn next(&mut self) -> Option<Unit> {
+ while self.byte < 256 {
+ let byte = self.byte as u8;
+ let class = self.classes.get(byte);
+ self.byte += 1;
+
+ if self.last_class != Some(class) {
+ self.last_class = Some(class);
+ return Some(Unit::u8(byte));
+ }
+ }
+ if self.byte == 256 {
+ self.byte += 1;
+ return Some(self.classes.eoi());
+ }
+ None
+ }
+}
+
+/// An iterator over all elements in an equivalence class.
+#[derive(Debug)]
+pub struct ByteClassElements<'a> {
+ classes: &'a ByteClasses,
+ class: Unit,
+ byte: usize,
+}
+
+impl<'a> Iterator for ByteClassElements<'a> {
+ type Item = Unit;
+
+ fn next(&mut self) -> Option<Unit> {
+ while self.byte < 256 {
+ let byte = self.byte as u8;
+ self.byte += 1;
+ if self.class.as_u8() == Some(self.classes.get(byte)) {
+ return Some(Unit::u8(byte));
+ }
+ }
+ if self.byte < 257 {
+ self.byte += 1;
+ if self.class.is_eoi() {
+ return Some(Unit::eoi(256));
+ }
+ }
+ None
+ }
+}
+
+/// An iterator over all elements in an equivalence class expressed as a
+/// sequence of contiguous ranges.
+#[derive(Debug)]
+pub struct ByteClassElementRanges<'a> {
+ elements: ByteClassElements<'a>,
+ range: Option<(Unit, Unit)>,
+}
+
+impl<'a> Iterator for ByteClassElementRanges<'a> {
+ type Item = (Unit, Unit);
+
+ fn next(&mut self) -> Option<(Unit, Unit)> {
+ loop {
+ let element = match self.elements.next() {
+ None => return self.range.take(),
+ Some(element) => element,
+ };
+ match self.range.take() {
+ None => {
+ self.range = Some((element, element));
+ }
+ Some((start, end)) => {
+ if end.as_usize() + 1 != element.as_usize()
+ || element.is_eoi()
+ {
+ self.range = Some((element, element));
+ return Some((start, end));
+ }
+ self.range = Some((start, element));
+ }
+ }
+ }
+ }
+}
+
+/// A byte class set keeps track of an *approximation* of equivalence classes
+/// of bytes during NFA construction. That is, no byte in an equivalence
+/// class can discriminate between a match and a non-match.
+///
+/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
+/// same equivalence class because it never matters whether an `a` or a `b` is
+/// seen, and no combination of `a`s and `b`s in the text can discriminate a
+/// match.
+///
+/// Note though that this does not compute the minimal set of equivalence
+/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
+/// same equivalence class for the same reason that `a` and `b` are in the
+/// same equivalence class in the aforementioned regex. However, in this
+/// implementation, `a` and `c` are put into distinct equivalence classes. The
+/// reason for this is implementation complexity. In the future, we should
+/// endeavor to compute the minimal equivalence classes since they can have a
+/// rather large impact on the size of the DFA. (Doing this will likely require
+/// rethinking how equivalence classes are computed, including changing the
+/// representation here, which is only able to group contiguous bytes into the
+/// same equivalence class.)
+#[derive(Clone, Debug)]
+pub struct ByteClassSet(ByteSet);
+
+impl ByteClassSet {
+ /// Create a new set of byte classes where all bytes are part of the same
+ /// equivalence class.
+ #[cfg(feature = "alloc")]
+ pub fn empty() -> Self {
+ ByteClassSet(ByteSet::empty())
+ }
+
+    /// Indicate that the given range of bytes (inclusive) can discriminate
+    /// a match between it and all other bytes outside of the range.
+ #[cfg(feature = "alloc")]
+ pub fn set_range(&mut self, start: u8, end: u8) {
+ debug_assert!(start <= end);
+ if start > 0 {
+ self.0.add(start - 1);
+ }
+ self.0.add(end);
+ }
+
+    /// Add the contiguous ranges in the given set to this byte class set.
+ #[cfg(feature = "alloc")]
+ pub fn add_set(&mut self, set: &ByteSet) {
+ for (start, end) in set.iter_ranges() {
+ self.set_range(start, end);
+ }
+ }
+
+ /// Convert this boolean set to a map that maps all byte values to their
+ /// corresponding equivalence class. The last mapping indicates the largest
+ /// equivalence class identifier (which is never bigger than 255).
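+    ///
+    /// For example, after `set_range(b'a', b'z')` on an otherwise empty set,
+    /// the boolean set contains `b'a' - 1` and `b'z'`, so this produces
+    /// three classes: 0 for `\x00-\x60`, 1 for `a-z` and 2 for `{-\xFF`
+    /// (mirroring the `byte_classes` test below).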
+ #[cfg(feature = "alloc")]
+ pub fn byte_classes(&self) -> ByteClasses {
+ let mut classes = ByteClasses::empty();
+ let mut class = 0u8;
+ let mut b = 0u8;
+ loop {
+ classes.set(b, class);
+ if b == 255 {
+ break;
+ }
+ if self.0.contains(b) {
+ class = class.checked_add(1).unwrap();
+ }
+ b = b.checked_add(1).unwrap();
+ }
+ classes
+ }
+}
+
+/// A simple set of bytes that is reasonably cheap to copy and allocation free.
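+///
+/// The set is backed by two `u128` words: byte `b` occupies bit `b % 128`
+/// of word `b / 128`, so, for example, byte 200 maps to bit 72 of the
+/// second word.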
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub struct ByteSet {
+ bits: BitSet,
+}
+
+/// The representation of a byte set. Split out so that we can define a
+/// convenient Debug impl for it while keeping "ByteSet" in the output.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+struct BitSet([u128; 2]);
+
+impl ByteSet {
+ /// Create an empty set of bytes.
+ #[cfg(feature = "alloc")]
+ pub fn empty() -> ByteSet {
+ ByteSet { bits: BitSet([0; 2]) }
+ }
+
+ /// Add a byte to this set.
+ ///
+ /// If the given byte already belongs to this set, then this is a no-op.
+ #[cfg(feature = "alloc")]
+ pub fn add(&mut self, byte: u8) {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[bucket as usize] |= 1 << bit;
+ }
+
+ /// Add an inclusive range of bytes.
+ #[cfg(feature = "alloc")]
+ pub fn add_all(&mut self, start: u8, end: u8) {
+ for b in start..=end {
+ self.add(b);
+ }
+ }
+
+ /// Remove a byte from this set.
+ ///
+ /// If the given byte is not in this set, then this is a no-op.
+ #[cfg(feature = "alloc")]
+ pub fn remove(&mut self, byte: u8) {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[bucket as usize] &= !(1 << bit);
+ }
+
+ /// Remove an inclusive range of bytes.
+ #[cfg(feature = "alloc")]
+ pub fn remove_all(&mut self, start: u8, end: u8) {
+ for b in start..=end {
+ self.remove(b);
+ }
+ }
+
+ /// Return true if and only if the given byte is in this set.
+ pub fn contains(&self, byte: u8) -> bool {
+ let bucket = byte / 128;
+ let bit = byte % 128;
+ self.bits.0[bucket as usize] & (1 << bit) > 0
+ }
+
+ /// Return true if and only if the given inclusive range of bytes is in
+ /// this set.
+ #[cfg(feature = "alloc")]
+ pub fn contains_range(&self, start: u8, end: u8) -> bool {
+ (start..=end).all(|b| self.contains(b))
+ }
+
+ /// Returns an iterator over all bytes in this set.
+ #[cfg(feature = "alloc")]
+ pub fn iter(&self) -> ByteSetIter {
+ ByteSetIter { set: self, b: 0 }
+ }
+
+ /// Returns an iterator over all contiguous ranges of bytes in this set.
+ #[cfg(feature = "alloc")]
+ pub fn iter_ranges(&self) -> ByteSetRangeIter {
+ ByteSetRangeIter { set: self, b: 0 }
+ }
+
+ /// Return the number of bytes in this set.
+ #[cfg(feature = "alloc")]
+ pub fn len(&self) -> usize {
+ (self.bits.0[0].count_ones() + self.bits.0[1].count_ones()) as usize
+ }
+
+ /// Return true if and only if this set is empty.
+ #[cfg(feature = "alloc")]
+ pub fn is_empty(&self) -> bool {
+ self.bits.0 == [0, 0]
+ }
+}
+
+impl core::fmt::Debug for BitSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut fmtd = f.debug_set();
+ for b in (0..256).map(|b| b as u8) {
+ if (ByteSet { bits: *self }).contains(b) {
+ fmtd.entry(&b);
+ }
+ }
+ fmtd.finish()
+ }
+}
+
+#[derive(Debug)]
+pub struct ByteSetIter<'a> {
+ set: &'a ByteSet,
+ b: usize,
+}
+
+impl<'a> Iterator for ByteSetIter<'a> {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ while self.b <= 255 {
+ let b = self.b as u8;
+ self.b += 1;
+ if self.set.contains(b) {
+ return Some(b);
+ }
+ }
+ None
+ }
+}
+
+#[derive(Debug)]
+pub struct ByteSetRangeIter<'a> {
+ set: &'a ByteSet,
+ b: usize,
+}
+
+impl<'a> Iterator for ByteSetRangeIter<'a> {
+ type Item = (u8, u8);
+
+ fn next(&mut self) -> Option<(u8, u8)> {
+ while self.b <= 255 {
+ let start = self.b as u8;
+ self.b += 1;
+ if !self.set.contains(start) {
+ continue;
+ }
+
+ let mut end = start;
+ while self.b <= 255 && self.set.contains(self.b as u8) {
+ end = self.b as u8;
+ self.b += 1;
+ }
+ return Some((start, end));
+ }
+ None
+ }
+}
+
+#[cfg(test)]
+#[cfg(feature = "alloc")]
+mod tests {
+ use alloc::{vec, vec::Vec};
+
+ use super::*;
+
+ #[test]
+ fn byte_classes() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'a', b'z');
+
+ let classes = set.byte_classes();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(b'a' - 1), 0);
+ assert_eq!(classes.get(b'a'), 1);
+ assert_eq!(classes.get(b'm'), 1);
+ assert_eq!(classes.get(b'z'), 1);
+ assert_eq!(classes.get(b'z' + 1), 2);
+ assert_eq!(classes.get(254), 2);
+ assert_eq!(classes.get(255), 2);
+
+ let mut set = ByteClassSet::empty();
+ set.set_range(0, 2);
+ set.set_range(4, 6);
+ let classes = set.byte_classes();
+ assert_eq!(classes.get(0), 0);
+ assert_eq!(classes.get(1), 0);
+ assert_eq!(classes.get(2), 0);
+ assert_eq!(classes.get(3), 1);
+ assert_eq!(classes.get(4), 2);
+ assert_eq!(classes.get(5), 2);
+ assert_eq!(classes.get(6), 2);
+ assert_eq!(classes.get(7), 3);
+ assert_eq!(classes.get(255), 3);
+ }
+
+ #[test]
+ fn full_byte_classes() {
+ let mut set = ByteClassSet::empty();
+ for i in 0..256u16 {
+ set.set_range(i as u8, i as u8);
+ }
+ assert_eq!(set.byte_classes().alphabet_len(), 257);
+ }
+
+ #[test]
+ fn elements_typical() {
+ let mut set = ByteClassSet::empty();
+ set.set_range(b'b', b'd');
+ set.set_range(b'g', b'm');
+ set.set_range(b'z', b'z');
+ let classes = set.byte_classes();
+ // class 0: \x00-a
+ // class 1: b-d
+ // class 2: e-f
+ // class 3: g-m
+ // class 4: n-y
+ // class 5: z-z
+ // class 6: \x7B-\xFF
+ // class 7: EOI
+ assert_eq!(classes.alphabet_len(), 8);
+
+ let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 98);
+ assert_eq!(elements[0], Unit::u8(b'\x00'));
+ assert_eq!(elements[97], Unit::u8(b'a'));
+
+ let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>();
+ assert_eq!(
+ elements,
+ vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')],
+ );
+
+ let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],);
+
+ let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>();
+ assert_eq!(
+ elements,
+ vec![
+ Unit::u8(b'g'),
+ Unit::u8(b'h'),
+ Unit::u8(b'i'),
+ Unit::u8(b'j'),
+ Unit::u8(b'k'),
+ Unit::u8(b'l'),
+ Unit::u8(b'm'),
+ ],
+ );
+
+ let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 12);
+ assert_eq!(elements[0], Unit::u8(b'n'));
+ assert_eq!(elements[11], Unit::u8(b'y'));
+
+ let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::u8(b'z')]);
+
+ let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 133);
+ assert_eq!(elements[0], Unit::u8(b'\x7B'));
+ assert_eq!(elements[132], Unit::u8(b'\xFF'));
+
+ let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::eoi(256)]);
+ }
+
+ #[test]
+ fn elements_singletons() {
+ let classes = ByteClasses::singletons();
+ assert_eq!(classes.alphabet_len(), 257);
+
+ let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::u8(b'a')]);
+
+ let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::eoi(256)]);
+ }
+
+ #[test]
+ fn elements_empty() {
+ let classes = ByteClasses::empty();
+ assert_eq!(classes.alphabet_len(), 2);
+
+ let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
+ assert_eq!(elements.len(), 256);
+ assert_eq!(elements[0], Unit::u8(b'\x00'));
+ assert_eq!(elements[255], Unit::u8(b'\xFF'));
+
+ let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>();
+ assert_eq!(elements, vec![Unit::eoi(256)]);
+ }
+}
diff --git a/vendor/regex-automata/src/util/bytes.rs b/vendor/regex-automata/src/util/bytes.rs
new file mode 100644
index 000000000..5877bb149
--- /dev/null
+++ b/vendor/regex-automata/src/util/bytes.rs
@@ -0,0 +1,950 @@
+/*
+A collection of helper functions, types and traits for serializing automata.
+
+This crate defines its own bespoke serialization mechanism for some structures
+provided in the public API, namely, DFAs. A bespoke mechanism was developed
+primarily because structures like automata demand a specific binary format.
+Attempting to encode their rich structure in an existing serialization
+format is just not feasible. Moreover, the format for each structure is
+generally designed such that deserialization is cheap. More specifically, that
+deserialization can be done in constant time. (The idea being that you can
+embed it into your binary or mmap it, and then use it immediately.)
+
+In order to achieve this, most of the structures in this crate use an in-memory
+representation that very closely corresponds to its binary serialized form.
+This pervades and complicates everything, and in some cases, requires dealing
+with alignment and reasoning about safety.
+
+This technique does have major advantages. In particular, it permits doing
+the potentially costly work of compiling a finite state machine in an offline
+manner, and then loading it at runtime not only without having to re-compile
+the regex, but even without the code required to do the compilation. This, for
+example, permits one to use a pre-compiled DFA not only in environments without
+Rust's standard library, but also in environments without a heap.
+
+In the code below, whenever we insert some kind of padding, it's to enforce a
+4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID type
+supported. (In a previous version of this library, DFAs were generic over the
+state ID representation.)
+
+Also, serialization generally requires the caller to specify endianness,
+whereas deserialization always assumes native endianness (otherwise cheap
+deserialization would be impossible). This implies that serializing a structure
+generally requires serializing both its big-endian and little-endian variants,
+and then loading the correct one based on the target's endianness.
+*/
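+
+// A minimal sketch of the loading pattern described above, assuming two
+// hypothetical files produced offline by this crate's serialization APIs
+// (one per byte order):
+//
+//     #[cfg(target_endian = "big")]
+//     static DFA_BYTES: &[u8] = include_bytes!("dfa.bigendian.bin");
+//     #[cfg(target_endian = "little")]
+//     static DFA_BYTES: &[u8] = include_bytes!("dfa.littleendian.bin");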
+
+use core::{
+ cmp,
+ convert::{TryFrom, TryInto},
+ mem::size_of,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::{vec, vec::Vec};
+
+use crate::util::id::{PatternID, PatternIDError, StateID, StateIDError};
+
+/// An error that occurs when serializing an object from this crate.
+///
+/// Serialization, as used in this crate, universally refers to the process
+/// of transforming a structure (like a DFA) into a custom binary format
+/// represented by `&[u8]`. To this end, serialization is generally infallible.
+/// However, it can fail when caller-provided buffer sizes are too small. When
+/// that occurs, a serialization error is reported.
+///
+/// A `SerializeError` provides no introspection capabilities. Its only
+/// supported operation is conversion to a human readable error message.
+///
+/// This error type implements the `std::error::Error` trait only when the
+/// `std` feature is enabled. Otherwise, this type is defined in all
+/// configurations.
+#[derive(Debug)]
+pub struct SerializeError {
+ /// The name of the thing that a buffer is too small for.
+ ///
+ /// Currently, the only kind of serialization error is one that is
+ /// committed by a caller: providing a destination buffer that is too
+ /// small to fit the serialized object. This makes sense conceptually,
+ /// since every valid inhabitant of a type should be serializable.
+ ///
+ /// This is somewhat exposed in the public API of this crate. For example,
+ /// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are
+ /// guaranteed to never panic or error. This is only possible because the
+ /// implementation guarantees that it will allocate a `Vec<u8>` that is
+ /// big enough.
+ ///
+ /// In summary, if a new serialization error kind needs to be added, then
+ /// it will need careful consideration.
+ what: &'static str,
+}
+
+impl SerializeError {
+ pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError {
+ SerializeError { what }
+ }
+}
+
+impl core::fmt::Display for SerializeError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "destination buffer is too small to write {}", self.what)
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for SerializeError {}
+
+/// An error that occurs when deserializing an object defined in this crate.
+///
+/// Serialization, as used in this crate, universally refers to the process
+/// of transforming a structure (like a DFA) into a custom binary format
+/// represented by `&[u8]`. Deserialization, then, refers to the process of
+/// cheaply converting this binary format back to the object's in-memory
+/// representation as defined in this crate. To the extent possible,
+/// deserialization will report this error whenever this process fails.
+///
+/// A `DeserializeError` provides no introspection capabilities. Its only
+/// supported operation is conversion to a human readable error message.
+///
+/// This error type implements the `std::error::Error` trait only when the
+/// `std` feature is enabled. Otherwise, this type is defined in all
+/// configurations.
+#[derive(Debug)]
+pub struct DeserializeError(DeserializeErrorKind);
+
+#[derive(Debug)]
+enum DeserializeErrorKind {
+ Generic { msg: &'static str },
+ BufferTooSmall { what: &'static str },
+ InvalidUsize { what: &'static str },
+ InvalidVarint { what: &'static str },
+ VersionMismatch { expected: u32, found: u32 },
+ EndianMismatch { expected: u32, found: u32 },
+ AlignmentMismatch { alignment: usize, address: usize },
+ LabelMismatch { expected: &'static str },
+ ArithmeticOverflow { what: &'static str },
+ PatternID { err: PatternIDError, what: &'static str },
+ StateID { err: StateIDError, what: &'static str },
+}
+
+impl DeserializeError {
+ pub(crate) fn generic(msg: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::Generic { msg })
+ }
+
+ pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::BufferTooSmall { what })
+ }
+
+ pub(crate) fn invalid_usize(what: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::InvalidUsize { what })
+ }
+
+ fn invalid_varint(what: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::InvalidVarint { what })
+ }
+
+ fn version_mismatch(expected: u32, found: u32) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::VersionMismatch {
+ expected,
+ found,
+ })
+ }
+
+ fn endian_mismatch(expected: u32, found: u32) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::EndianMismatch {
+ expected,
+ found,
+ })
+ }
+
+ fn alignment_mismatch(
+ alignment: usize,
+ address: usize,
+ ) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::AlignmentMismatch {
+ alignment,
+ address,
+ })
+ }
+
+ fn label_mismatch(expected: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::LabelMismatch { expected })
+ }
+
+ fn arithmetic_overflow(what: &'static str) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what })
+ }
+
+ pub(crate) fn pattern_id_error(
+ err: PatternIDError,
+ what: &'static str,
+ ) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::PatternID { err, what })
+ }
+
+ pub(crate) fn state_id_error(
+ err: StateIDError,
+ what: &'static str,
+ ) -> DeserializeError {
+ DeserializeError(DeserializeErrorKind::StateID { err, what })
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for DeserializeError {}
+
+impl core::fmt::Display for DeserializeError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ use self::DeserializeErrorKind::*;
+
+ match self.0 {
+ Generic { msg } => write!(f, "{}", msg),
+ BufferTooSmall { what } => {
+ write!(f, "buffer is too small to read {}", what)
+ }
+ InvalidUsize { what } => {
+ write!(f, "{} is too big to fit in a usize", what)
+ }
+ InvalidVarint { what } => {
+ write!(f, "could not decode valid varint for {}", what)
+ }
+ VersionMismatch { expected, found } => write!(
+ f,
+ "unsupported version: \
+ expected version {} but found version {}",
+ expected, found,
+ ),
+ EndianMismatch { expected, found } => write!(
+ f,
+ "endianness mismatch: expected 0x{:X} but got 0x{:X}. \
+ (Are you trying to load an object serialized with a \
+ different endianness?)",
+ expected, found,
+ ),
+ AlignmentMismatch { alignment, address } => write!(
+ f,
+ "alignment mismatch: slice starts at address \
+ 0x{:X}, which is not aligned to a {} byte boundary",
+ address, alignment,
+ ),
+ LabelMismatch { expected } => write!(
+ f,
+ "label mismatch: start of serialized object should \
+ contain a NUL terminated {:?} label, but a different \
+ label was found",
+ expected,
+ ),
+ ArithmeticOverflow { what } => {
+ write!(f, "arithmetic overflow for {}", what)
+ }
+ PatternID { ref err, what } => {
+ write!(f, "failed to read pattern ID for {}: {}", what, err)
+ }
+ StateID { ref err, what } => {
+ write!(f, "failed to read state ID for {}: {}", what, err)
+ }
+ }
+ }
+}
+
+/// Checks that the given slice has an alignment that matches `T`.
+///
+/// This is useful for checking that a slice has an appropriate alignment
+/// before casting it to a &[T]. Note though that alignment is not itself
+/// sufficient to perform the cast for any `T`.
+pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> {
+ let alignment = core::mem::align_of::<T>();
+ let address = slice.as_ptr() as usize;
+ if address % alignment == 0 {
+ return Ok(());
+ }
+ Err(DeserializeError::alignment_mismatch(alignment, address))
+}
+
+/// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning
+/// of the given slice. All padding bytes must be NUL bytes.
+///
+/// This is useful because it can be theoretically necessary to pad the
+/// beginning of a serialized object with NUL bytes to ensure that it starts
+/// at a correctly aligned address. These padding bytes should come immediately
+/// before the label.
+///
+/// This returns the number of bytes read from the given slice.
+pub fn skip_initial_padding(slice: &[u8]) -> usize {
+ let mut nread = 0;
+ while nread < 7 && nread < slice.len() && slice[nread] == 0 {
+ nread += 1;
+ }
+ nread
+}
+
+/// Allocate a byte buffer of the given size, along with some initial padding
+/// such that `buf[padding..]` has the same alignment as `T`, where the
+/// alignment of `T` must be at most `8`. In particular, callers should treat
+/// the first N bytes (second return value) as padding bytes that must not be
+/// overwritten. In all cases, the following identity holds:
+///
+/// ```ignore
+/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE);
+/// assert_eq!(SIZE, buf[padding..].len());
+/// ```
+///
+/// In practice, padding is often zero.
+///
+/// The requirement for `8` as a maximum here is somewhat arbitrary. In
+/// practice, we never need anything bigger in this crate, and so this function
+/// does some sanity asserts under the assumption of a max alignment of `8`.
+#[cfg(feature = "alloc")]
+pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
+ // FIXME: This is a kludge because there's no easy way to allocate a
+ // Vec<u8> with an alignment guaranteed to be greater than 1. We could
+ // create a Vec<u32>, but this cannot be safely transmuted to a Vec<u8>
+ // without concern, since reallocing or dropping the Vec<u8> is UB
+ // (different alignment than the initial allocation). We could define a
+ // wrapper type to manage this for us, but it seems like more machinery
+ // than it's worth.
+ let mut buf = vec![0; size];
+ let align = core::mem::align_of::<T>();
+ let address = buf.as_ptr() as usize;
+ if address % align == 0 {
+ return (buf, 0);
+ }
+ // It's not quite clear how to robustly test this code, since the allocator
+ // in my environment appears to always return addresses aligned to at
+ // least 8 bytes, even when the alignment requirement is smaller. A feeble
+ // attempt at ensuring correctness is provided with asserts.
+ let padding = ((address & !0b111).checked_add(8).unwrap())
+ .checked_sub(address)
+ .unwrap();
+ assert!(padding <= 7, "padding of {} is bigger than 7", padding);
+ buf.extend(core::iter::repeat(0).take(padding));
+ assert_eq!(size + padding, buf.len());
+ assert_eq!(
+ 0,
+ buf[padding..].as_ptr() as usize % align,
+ "expected end of initial padding to be aligned to {}",
+ align,
+ );
+ (buf, padding)
+}
+
+/// Reads a NUL terminated label starting at the beginning of the given slice.
+///
+/// If a NUL terminated label could not be found, then an error is returned.
+/// Similarly, if a label is found but doesn't match the expected label, then
+/// an error is returned.
+///
+/// Upon success, the total number of bytes read (including padding bytes) is
+/// returned.
+pub fn read_label(
+ slice: &[u8],
+ expected_label: &'static str,
+) -> Result<usize, DeserializeError> {
+ // Set an upper bound on how many bytes we scan for a NUL. Since no label
+ // in this crate is longer than 256 bytes, if we can't find one within that
+ // range, then we have corrupted data.
+ let first_nul =
+ slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0);
+ let first_nul = match first_nul {
+ Some(first_nul) => first_nul,
+ None => {
+ return Err(DeserializeError::generic(
+ "could not find NUL terminated label \
+ at start of serialized object",
+ ));
+ }
+ };
+ let len = first_nul + padding_len(first_nul);
+ if slice.len() < len {
+ return Err(DeserializeError::generic(
+ "could not find properly sized label at start of serialized object"
+ ));
+ }
+ if expected_label.as_bytes() != &slice[..first_nul] {
+ return Err(DeserializeError::label_mismatch(expected_label));
+ }
+ Ok(len)
+}
+
+/// Writes the given label to the buffer as a NUL terminated string. The label
+/// given must not contain NUL, otherwise this will panic. Similarly, the label
+/// must not be longer than 255 bytes, otherwise this will panic.
+///
+/// Additional NUL bytes are written as necessary to ensure that the number of
+/// bytes written is always a multiple of 4.
+///
+/// Upon success, the total number of bytes written (including padding) is
+/// returned.
+pub fn write_label(
+ label: &str,
+ dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+ let nwrite = write_label_len(label);
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("label"));
+ }
+ dst[..label.len()].copy_from_slice(label.as_bytes());
+ for i in 0..(nwrite - label.len()) {
+ dst[label.len() + i] = 0;
+ }
+ assert_eq!(nwrite % 4, 0);
+ Ok(nwrite)
+}
+
+/// Returns the total number of bytes (including padding) that would be written
+/// for the given label. This panics if the given label contains a NUL byte or
+/// is longer than 255 bytes. (The size restriction exists so that searching
+/// for a label during deserialization can be done in small bounded space.)
+pub fn write_label_len(label: &str) -> usize {
+ if label.len() > 255 {
+ panic!("label must not be longer than 255 bytes");
+ }
+ if label.as_bytes().iter().position(|&b| b == 0).is_some() {
+ panic!("label must not contain NUL bytes");
+ }
+ let label_len = label.len() + 1; // +1 for the NUL terminator
+ label_len + padding_len(label_len)
+}
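+
+// A worked example, assuming `padding_len` rounds a length up to the next
+// multiple of 4 as described above: the hypothetical label "hello" takes
+// 5 bytes + 1 NUL = 6 bytes, plus 2 padding NUL bytes, so
+// `write_label_len("hello")` would return 8.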
+
+/// Reads the endianness check from the beginning of the given slice and
+/// confirms that the endianness of the serialized object matches the expected
+/// endianness. If the slice is too small or if the endianness check fails,
+/// this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> {
+ let (n, nr) = try_read_u32(slice, "endianness check")?;
+ assert_eq!(nr, write_endianness_check_len());
+ if n != 0xFEFF {
+ return Err(DeserializeError::endian_mismatch(0xFEFF, n));
+ }
+ Ok(nr)
+}
+
+/// Writes 0xFEFF as an integer using the given endianness.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure the proper
+/// endianness is used.
+///
+/// Upon success, the total number of bytes written is returned.
+pub fn write_endianness_check<E: Endian>(
+ dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+ let nwrite = write_endianness_check_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("endianness check"));
+ }
+ E::write_u32(0xFEFF, dst);
+ Ok(nwrite)
+}
+
+/// Returns the number of bytes written by the endianness check.
+pub fn write_endianness_check_len() -> usize {
+ size_of::<u32>()
+}
+
+/// Reads a version number from the beginning of the given slice and confirms
+/// that it matches the expected version number given. If the slice is too
+/// small or if the version numbers aren't equivalent, this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+///
+/// N.B. Currently, we require that the version number is exactly equivalent.
+/// In the future, if we bump the version number without a semver bump, then
+/// we'll need to relax this a bit and support older versions.
+pub fn read_version(
+ slice: &[u8],
+ expected_version: u32,
+) -> Result<usize, DeserializeError> {
+ let (n, nr) = try_read_u32(slice, "version")?;
+ assert_eq!(nr, write_version_len());
+ if n != expected_version {
+ return Err(DeserializeError::version_mismatch(expected_version, n));
+ }
+ Ok(nr)
+}
+
+/// Writes the given version number to the beginning of the given slice.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure that the library
+/// code supports the format of the serialized object.
+///
+/// Upon success, the total number of bytes written is returned.
+pub fn write_version<E: Endian>(
+ version: u32,
+ dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+ let nwrite = write_version_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("version number"));
+ }
+ E::write_u32(version, dst);
+ Ok(nwrite)
+}
+
+/// Returns the number of bytes written by writing the version number.
+pub fn write_version_len() -> usize {
+ size_of::<u32>()
+}
+
+/// Reads a pattern ID from the given slice. If the slice has insufficient
+/// length, then this panics. If the deserialized integer exceeds the pattern
+/// ID limit for the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn read_pattern_id(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(PatternID, usize), DeserializeError> {
+ let bytes: [u8; PatternID::SIZE] =
+ slice[..PatternID::SIZE].try_into().unwrap();
+ let pid = PatternID::from_ne_bytes(bytes)
+ .map_err(|err| DeserializeError::pattern_id_error(err, what))?;
+ Ok((pid, PatternID::SIZE))
+}
+
+/// Reads a pattern ID from the given slice. If the slice has insufficient
+/// length, then this panics. Otherwise, the deserialized integer is assumed
+/// to be a valid pattern ID.
+///
+/// This also returns the number of bytes read.
+pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
+ let pid = PatternID::from_ne_bytes_unchecked(
+ slice[..PatternID::SIZE].try_into().unwrap(),
+ );
+ (pid, PatternID::SIZE)
+}
+
+/// Write the given pattern ID to the beginning of the given slice of bytes
+/// using the specified endianness. The given slice must have length at least
+/// `PatternID::SIZE`, or else this panics. Upon success, the total number of
+/// bytes written is returned.
+pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize {
+ E::write_u32(pid.as_u32(), dst);
+ PatternID::SIZE
+}
+
+/// Attempts to read a state ID from the given slice. If the slice has an
+/// insufficient number of bytes or if the state ID exceeds the limit for
+/// the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_state_id(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(StateID, usize), DeserializeError> {
+ if slice.len() < StateID::SIZE {
+ return Err(DeserializeError::buffer_too_small(what));
+ }
+ read_state_id(slice, what)
+}
+
+/// Reads a state ID from the given slice. If the slice has insufficient
+/// length, then this panics. If the deserialized integer exceeds the state ID
+/// limit for the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn read_state_id(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(StateID, usize), DeserializeError> {
+ let bytes: [u8; StateID::SIZE] =
+ slice[..StateID::SIZE].try_into().unwrap();
+ let sid = StateID::from_ne_bytes(bytes)
+ .map_err(|err| DeserializeError::state_id_error(err, what))?;
+ Ok((sid, StateID::SIZE))
+}
+
+/// Reads a state ID from the given slice. If the slice has insufficient
+/// length, then this panics. Otherwise, the deserialized integer is assumed
+/// to be a valid state ID.
+///
+/// This also returns the number of bytes read.
+pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) {
+ let sid = StateID::from_ne_bytes_unchecked(
+ slice[..StateID::SIZE].try_into().unwrap(),
+ );
+ (sid, StateID::SIZE)
+}
+
+/// Write the given state ID to the beginning of the given slice of bytes
+/// using the specified endianness. The given slice must have length at least
+/// `StateID::SIZE`, or else this panics. Upon success, the total number of
+/// bytes written is returned.
+pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize {
+ E::write_u32(sid.as_u32(), dst);
+ StateID::SIZE
+}
+
+/// Try to read a u16 as a usize from the beginning of the given slice in
+/// native endian format. If the slice has fewer than 2 bytes or if the
+/// deserialized number cannot be represented by usize, then this returns an
+/// error. The error message will include the `what` description of what is
+/// being deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u16_as_usize(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+ try_read_u16(slice, what).and_then(|(n, nr)| {
+ usize::try_from(n)
+ .map(|n| (n, nr))
+ .map_err(|_| DeserializeError::invalid_usize(what))
+ })
+}
+
+/// Try to read a u32 as a usize from the beginning of the given slice in
+/// native endian format. If the slice has fewer than 4 bytes or if the
+/// deserialized number cannot be represented by usize, then this returns an
+/// error. The error message will include the `what` description of what is
+/// being deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u32_as_usize(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+ try_read_u32(slice, what).and_then(|(n, nr)| {
+ usize::try_from(n)
+ .map(|n| (n, nr))
+ .map_err(|_| DeserializeError::invalid_usize(what))
+ })
+}
+
+/// Try to read a u16 from the beginning of the given slice in native endian
+/// format. If the slice has fewer than 2 bytes, then this returns an error.
+/// The error message will include the `what` description of what is being
+/// deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u16(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(u16, usize), DeserializeError> {
+ if slice.len() < size_of::<u16>() {
+ return Err(DeserializeError::buffer_too_small(what));
+ }
+ Ok((read_u16(slice), size_of::<u16>()))
+}
+
+/// Try to read a u32 from the beginning of the given slice in native endian
+/// format. If the slice has fewer than 4 bytes, then this returns an error.
+/// The error message will include the `what` description of what is being
+/// deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u32(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(u32, usize), DeserializeError> {
+ if slice.len() < size_of::<u32>() {
+ return Err(DeserializeError::buffer_too_small(what));
+ }
+ Ok((read_u32(slice), size_of::<u32>()))
+}
+
+/// Read a u16 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 2 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u16(slice: &[u8]) -> u16 {
+ let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap();
+ u16::from_ne_bytes(bytes)
+}
+
+/// Read a u32 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 4 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u32(slice: &[u8]) -> u32 {
+ let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap();
+ u32::from_ne_bytes(bytes)
+}
+
+/// Read a u64 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 8 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u64(slice: &[u8]) -> u64 {
+ let bytes: [u8; 8] = slice[..size_of::<u64>()].try_into().unwrap();
+ u64::from_ne_bytes(bytes)
+}
+
+/// Write a variable sized integer and return the total number of bytes
+/// written. If the slice was not big enough to contain the bytes, then this
+/// returns an error including the "what" description in it. This does no
+/// padding.
+///
+/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
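+///
+/// As a worked example (illustrative only): encoding 300 writes two bytes.
+/// 300 is 0b1_0010_1100, so the low 7 bits are emitted first with the
+/// continuation bit set (0b1010_1100), followed by the remaining high bits
+/// (0b0000_0010).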
+#[allow(dead_code)]
+pub fn write_varu64(
+ mut n: u64,
+ what: &'static str,
+ dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+ let mut i = 0;
+ while n >= 0b1000_0000 {
+ if i >= dst.len() {
+ return Err(SerializeError::buffer_too_small(what));
+ }
+ dst[i] = (n as u8) | 0b1000_0000;
+ n >>= 7;
+ i += 1;
+ }
+ if i >= dst.len() {
+ return Err(SerializeError::buffer_too_small(what));
+ }
+ dst[i] = n as u8;
+ Ok(i + 1)
+}
+
+/// Returns the total number of bytes that would be written to encode n as a
+/// variable sized integer.
+///
+/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
+#[allow(dead_code)]
+pub fn write_varu64_len(mut n: u64) -> usize {
+ let mut i = 0;
+ while n >= 0b1000_0000 {
+ n >>= 7;
+ i += 1;
+ }
+ i + 1
+}
+
+/// Like read_varu64, but attempts to cast the result to usize. If the integer
+/// cannot fit into a usize, then an error is returned.
+#[allow(dead_code)]
+pub fn read_varu64_as_usize(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+ let (n, nread) = read_varu64(slice, what)?;
+ let n = usize::try_from(n)
+ .map_err(|_| DeserializeError::invalid_usize(what))?;
+ Ok((n, nread))
+}
+
+/// Reads a variable sized integer from the beginning of slice, and returns the
+/// integer along with the total number of bytes read. If a valid variable
+/// sized integer could not be found, then an error is returned that includes
+/// the "what" description in it.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
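+///
+/// For example (illustrative only): the two-byte encoding
+/// [0b1010_1100, 0b0000_0010] decodes to (300, 2), i.e., the value 300
+/// with two bytes consumed.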
+#[allow(dead_code)]
+pub fn read_varu64(
+ slice: &[u8],
+ what: &'static str,
+) -> Result<(u64, usize), DeserializeError> {
+ let mut n: u64 = 0;
+ let mut shift: u32 = 0;
+ // The biggest possible value is u64::MAX, which needs all 64 bits which
+ // requires 10 bytes (because 7 * 9 = 63 < 64). We use a limit to avoid reading
+ // an unnecessary number of bytes.
+ let limit = cmp::min(slice.len(), 10);
+ for (i, &b) in slice[..limit].iter().enumerate() {
+ if b < 0b1000_0000 {
+ return match (b as u64).checked_shl(shift) {
+ None => Err(DeserializeError::invalid_varint(what)),
+ Some(b) => Ok((n | b, i + 1)),
+ };
+ }
+ match ((b as u64) & 0b0111_1111).checked_shl(shift) {
+ None => return Err(DeserializeError::invalid_varint(what)),
+ Some(b) => n |= b,
+ }
+ shift += 7;
+ }
+ Err(DeserializeError::invalid_varint(what))
+}
+
+/// Checks that the given slice has some minimal length. If it's smaller than
+/// the bound given, then a "buffer too small" error is returned with `what`
+/// describing what the buffer represents.
+pub fn check_slice_len<T>(
+ slice: &[T],
+ at_least_len: usize,
+ what: &'static str,
+) -> Result<(), DeserializeError> {
+ if slice.len() < at_least_len {
+ return Err(DeserializeError::buffer_too_small(what));
+ }
+ Ok(())
+}
+
+/// Multiply the given numbers, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn mul(
+ a: usize,
+ b: usize,
+ what: &'static str,
+) -> Result<usize, DeserializeError> {
+ match a.checked_mul(b) {
+ Some(c) => Ok(c),
+ None => Err(DeserializeError::arithmetic_overflow(what)),
+ }
+}
+
+/// Add the given numbers, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn add(
+ a: usize,
+ b: usize,
+ what: &'static str,
+) -> Result<usize, DeserializeError> {
+ match a.checked_add(b) {
+ Some(c) => Ok(c),
+ None => Err(DeserializeError::arithmetic_overflow(what)),
+ }
+}
+
+/// Shift `a` left by `b`, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn shl(
+ a: usize,
+ b: usize,
+ what: &'static str,
+) -> Result<usize, DeserializeError> {
+ let amount = u32::try_from(b)
+ .map_err(|_| DeserializeError::arithmetic_overflow(what))?;
+ match a.checked_shl(amount) {
+ Some(c) => Ok(c),
+ None => Err(DeserializeError::arithmetic_overflow(what)),
+ }
+}
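+
+// As an illustrative (hypothetical) use of the checked helpers above when
+// validating sizes derived from untrusted input:
+//
+//   let table_bytes =
+//       add(mul(state_count, stride, "transition table size")?, header_len,
+//           "total table size")?;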
+
+/// A simple trait for writing code generic over endianness.
+///
+/// This is similar to what byteorder provides, but we only need a very small
+/// subset.
+pub trait Endian {
+ /// Writes a u16 to the given destination buffer in a particular
+ /// endianness. If the destination buffer has a length smaller than 2, then
+ /// this panics.
+ fn write_u16(n: u16, dst: &mut [u8]);
+
+ /// Writes a u32 to the given destination buffer in a particular
+ /// endianness. If the destination buffer has a length smaller than 4, then
+ /// this panics.
+ fn write_u32(n: u32, dst: &mut [u8]);
+
+ /// Writes a u64 to the given destination buffer in a particular
+ /// endianness. If the destination buffer has a length smaller than 8, then
+ /// this panics.
+ fn write_u64(n: u64, dst: &mut [u8]);
+}
+
+/// Little endian writing.
+pub enum LE {}
+/// Big endian writing.
+pub enum BE {}
+
+#[cfg(target_endian = "little")]
+pub type NE = LE;
+#[cfg(target_endian = "big")]
+pub type NE = BE;
+
+impl Endian for LE {
+ fn write_u16(n: u16, dst: &mut [u8]) {
+ dst[..2].copy_from_slice(&n.to_le_bytes());
+ }
+
+ fn write_u32(n: u32, dst: &mut [u8]) {
+ dst[..4].copy_from_slice(&n.to_le_bytes());
+ }
+
+ fn write_u64(n: u64, dst: &mut [u8]) {
+ dst[..8].copy_from_slice(&n.to_le_bytes());
+ }
+}
+
+impl Endian for BE {
+ fn write_u16(n: u16, dst: &mut [u8]) {
+ dst[..2].copy_from_slice(&n.to_be_bytes());
+ }
+
+ fn write_u32(n: u32, dst: &mut [u8]) {
+ dst[..4].copy_from_slice(&n.to_be_bytes());
+ }
+
+ fn write_u64(n: u64, dst: &mut [u8]) {
+ dst[..8].copy_from_slice(&n.to_be_bytes());
+ }
+}
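+
+// For example (illustrative only): writing 0x01020304u32 with BE yields
+// big-endian bytes, LE the reverse, and NE matches the current target:
+//
+//   let mut buf = [0u8; 4];
+//   BE::write_u32(0x01020304, &mut buf);
+//   assert_eq!(buf, [0x01, 0x02, 0x03, 0x04]);
+//   LE::write_u32(0x01020304, &mut buf);
+//   assert_eq!(buf, [0x04, 0x03, 0x02, 0x01]);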
+
+/// Returns the number of additional bytes required to add to the given length
+/// in order to make the total length a multiple of 4. The return value is
+/// always less than 4.
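+///
+/// For example, padding_len(9) is 3: 9 & 0b11 == 1 and (4 - 1) & 0b11 == 3.
+/// Lengths that are already a multiple of 4 (including 0) need no padding.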
+pub fn padding_len(non_padding_len: usize) -> usize {
+ (4 - (non_padding_len & 0b11)) & 0b11
+}
+
+#[cfg(all(test, feature = "alloc"))]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn labels() {
+ let mut buf = [0; 1024];
+
+ let nwrite = write_label("fooba", &mut buf).unwrap();
+ assert_eq!(nwrite, 8);
+ assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00");
+
+ let nread = read_label(&buf, "fooba").unwrap();
+ assert_eq!(nread, 8);
+ }
+
+ #[test]
+ #[should_panic]
+ fn bad_label_interior_nul() {
+ // interior NULs are not allowed
+ write_label("foo\x00bar", &mut [0; 1024]).unwrap();
+ }
+
+ #[test]
+ fn bad_label_almost_too_long() {
+ // ok
+ write_label(&"z".repeat(255), &mut [0; 1024]).unwrap();
+ }
+
+ #[test]
+ #[should_panic]
+ fn bad_label_too_long() {
+ // labels longer than 255 bytes are banned
+ write_label(&"z".repeat(256), &mut [0; 1024]).unwrap();
+ }
+
+ #[test]
+ fn padding() {
+ assert_eq!(0, padding_len(8));
+ assert_eq!(3, padding_len(9));
+ assert_eq!(2, padding_len(10));
+ assert_eq!(1, padding_len(11));
+ assert_eq!(0, padding_len(12));
+ assert_eq!(3, padding_len(13));
+ assert_eq!(2, padding_len(14));
+ assert_eq!(1, padding_len(15));
+ assert_eq!(0, padding_len(16));
+ }
+}
diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs
new file mode 100644
index 000000000..b384de8e1
--- /dev/null
+++ b/vendor/regex-automata/src/util/determinize/mod.rs
@@ -0,0 +1,493 @@
+/*!
+This module contains types and routines for implementing determinization.
+
+In this crate, there are at least two places where we implement
+determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
+lazily compiled DFAs in the `hybrid` module. The stuff in this module
+corresponds to the things that are in common between these implementations.
+
+There are three broad things that our implementations of determinization have
+in common, as defined by this module:
+
+* The classification of start states. That is, whether we're dealing with
+word boundaries, line boundaries, etc., is all the same. This also includes
+the look-behind assertions that are satisfied by each starting state
+classification.
+
+* The representation of DFA states as sets of NFA states, including
+convenience types for building these DFA states that are amenable to reusing
+allocations.
+
+* Routines for the "classical" parts of determinization: computing the
+epsilon closure, tracking match states (with corresponding pattern IDs, since
+we support multi-pattern finite automata) and, of course, computing the
+transition function between states for units of input.
+
+I did consider a couple of alternatives to this particular form of code reuse:
+
+1. Don't do any code reuse. The problem here is that we *really* want both
+forms of determinization to do exactly identical things when it comes to
+their handling of NFA states. While our tests generally ensure this, the code
+is tricky and large enough where not reusing code is a pretty big bummer.
+
+2. Implement all of determinization once and make it generic over fully
+compiled DFAs and lazily compiled DFAs. While I didn't actually try this
+approach, my instinct is that it would be more complex than is needed here.
+And the interface required would be pretty hairy. Instead, I think splitting
+it into logical sub-components works better.
+*/
+
+use alloc::vec::Vec;
+
+pub(crate) use self::state::{
+ State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
+};
+
+use crate::{
+ nfa::thompson::{self, Look, LookSet},
+ util::{
+ alphabet,
+ id::StateID,
+ matchtypes::MatchKind,
+ sparse_set::{SparseSet, SparseSets},
+ start::Start,
+ },
+};
+
+mod state;
+
+/// Compute the set of all reachable NFA states, including the full epsilon
+/// closure, from a DFA state for a single unit of input. The set of reachable
+/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
+/// also includes any look-behind assertions satisfied by `unit`, in addition
+/// to whether it is a match state. For multi-pattern DFAs, the builder will
+/// also include the pattern IDs that match (in the order seen).
+///
+/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
+/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
+/// must have capacity equivalent to `nfa.len()`.
+///
+/// `match_kind` should correspond to the match semantics implemented by the
+/// DFA being built. Generally speaking, for leftmost-first match semantics,
+/// states that appear after the first NFA match state will not be included in
+/// the `StateBuilderNFA` returned since they are impossible to visit.
+///
+/// `sparses` is used as scratch space for NFA traversal. Other than their
+/// capacity requirements (detailed above), there are no requirements on what's
+/// contained within them (if anything). Similarly, what's inside of them once
+/// this routine returns is unspecified.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+///
+/// `state` corresponds to the current DFA state on which one wants to compute
+/// the transition for the input `unit`.
+///
+/// `empty_builder` corresponds to the builder allocation to use to produce a
+/// complete `StateBuilderNFA` state. If the state is not needed (or is already
+/// cached), then it can be cleared and reused without needing to create a new
+/// `State`. The `StateBuilderNFA` state returned is final and ready to be
+/// turned into a `State` if necessary.
+pub(crate) fn next(
+ nfa: &thompson::NFA,
+ match_kind: MatchKind,
+ sparses: &mut SparseSets,
+ stack: &mut Vec<StateID>,
+ state: &State,
+ unit: alphabet::Unit,
+ empty_builder: StateBuilderEmpty,
+) -> StateBuilderNFA {
+ sparses.clear();
+
+ // Put the NFA state IDs into a sparse set in case we need to
+ // re-compute their epsilon closure.
+ //
+ // Doing this state shuffling is technically not necessary unless some
+ // kind of look-around is used in the DFA. Some ad hoc experiments
+ // suggested that avoiding this didn't lead to much of an improvement,
+ // but perhaps more rigorous experimentation should be done. And in
+ // particular, avoiding this check requires some light refactoring of
+ // the code below.
+ state.iter_nfa_state_ids(|nfa_id| {
+ sparses.set1.insert(nfa_id);
+ });
+
+ // Compute look-ahead assertions originating from the current state.
+ // Based on the input unit we're transitioning over, some additional
+ // set of assertions may be true. Thus, we re-compute this state's
+ // epsilon closure (but only if necessary).
+ if !state.look_need().is_empty() {
+ // Add look-ahead assertions that are now true based on the current
+ // input unit.
+ let mut look_have = state.look_have().clone();
+ match unit.as_u8() {
+ Some(b'\n') => {
+ look_have.insert(Look::EndLine);
+ }
+ Some(_) => {}
+ None => {
+ look_have.insert(Look::EndText);
+ look_have.insert(Look::EndLine);
+ }
+ }
+ if state.is_from_word() == unit.is_word_byte() {
+ look_have.insert(Look::WordBoundaryUnicodeNegate);
+ look_have.insert(Look::WordBoundaryAsciiNegate);
+ } else {
+ look_have.insert(Look::WordBoundaryUnicode);
+ look_have.insert(Look::WordBoundaryAscii);
+ }
+ // If we have new assertions satisfied that are among the set of
+ // assertions that exist in this state (that is, just because
+ // we added an EndLine assertion above doesn't mean there is an
+ // EndLine conditional epsilon transition in this state), then we
+ // re-compute this state's epsilon closure using the updated set of
+ // assertions.
+ if !look_have
+ .subtract(state.look_have())
+ .intersect(state.look_need())
+ .is_empty()
+ {
+ for nfa_id in &sparses.set1 {
+ epsilon_closure(
+ nfa,
+ nfa_id,
+ look_have,
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ sparses.swap();
+ sparses.set2.clear();
+ }
+ }
+
+ // Convert our empty builder into one that can record assertions and match
+ // pattern IDs.
+ let mut builder = empty_builder.into_matches();
+ // Set whether the StartLine look-behind assertion is true for this
+ // transition or not. The look-behind assertion for ASCII word boundaries
+ // is handled below.
+ if nfa.has_any_anchor() {
+ if unit.as_u8().map_or(false, |b| b == b'\n') {
+ // Why only handle StartLine here and not StartText? That's
+ // because StartText can only impact the starting state, which
+ // is special cased in start state handling.
+ builder.look_have().insert(Look::StartLine);
+ }
+ }
+ for nfa_id in &sparses.set1 {
+ match *nfa.state(nfa_id) {
+ thompson::State::Union { .. }
+ | thompson::State::Fail
+ | thompson::State::Look { .. }
+ | thompson::State::Capture { .. } => {}
+ thompson::State::Match { id } => {
+ // Notice here that we are calling the NEW state a match
+ // state if the OLD state we are transitioning from
+ // contains an NFA match state. This is precisely how we
+ // delay all matches by one byte and also what therefore
+ // guarantees that starting states cannot be match states.
+ //
+ // If we didn't delay matches by one byte, then whether
+ // a DFA state is a matching state or not would be determined
+ // by whether one of its own constituent NFA states
+ // was a match state. (And that would be done in
+ // 'add_nfa_states'.)
+ //
+ // Also, 'add_match_pattern_id' requires that callers never
+ // pass duplicative pattern IDs. We do in fact uphold that
+ // guarantee here, but it's subtle. In particular, a Thompson
+ // NFA guarantees that each pattern has exactly one match
+ // state. Moreover, since we're iterating over the NFA state
+ // IDs in a set, we are guaranteed not to have any duplicative
+ // match states. Thus, it is impossible to add the same pattern
+ // ID more than once.
+ builder.add_match_pattern_id(id);
+ if !match_kind.continue_past_first_match() {
+ break;
+ }
+ }
+ thompson::State::Range { range: ref r } => {
+ if r.matches_unit(unit) {
+ epsilon_closure(
+ nfa,
+ r.next,
+ *builder.look_have(),
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ }
+ thompson::State::Sparse(ref sparse) => {
+ if let Some(next) = sparse.matches_unit(unit) {
+ epsilon_closure(
+ nfa,
+ next,
+ *builder.look_have(),
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ }
+ }
+ }
+ // We only set the word byte if there's a word boundary look-around
+ // anywhere in this regex. Otherwise, there's no point in bloating the
+ // number of states if we don't have one.
+ //
+ // We also only set it when the state has a non-zero number of NFA states.
+ // Otherwise, we could wind up with states that *should* be DEAD states
+ // but are otherwise distinct from DEAD states because of this look-behind
+ // assertion being set. While this can't technically impact correctness *in
+ // theory*, it can create pathological DFAs that consume input until EOI or
+ // a quit byte is seen. Consuming until EOI isn't a correctness problem,
+ // but a (serious) perf problem. Hitting a quit byte, however, could be a
+ // correctness problem since it could cause search routines to report an
+ // error instead of a detected match once the quit state is entered. (The
+ // search routine could be made to be a bit smarter by reporting a match
+ // if one was detected once it enters a quit state (and indeed, the search
+ // routines in this crate do just that), but it seems better to prevent
+ // these things by construction if possible.)
+ if nfa.has_word_boundary()
+ && unit.is_word_byte()
+ && !sparses.set2.is_empty()
+ {
+ builder.set_is_from_word();
+ }
+ let mut builder_nfa = builder.into_nfa();
+ add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
+ builder_nfa
+}
+
+/// Compute the epsilon closure for the given NFA state. The epsilon closure
+/// consists of all NFA state IDs, including `start_nfa_id`, that can be
+/// reached from `start_nfa_id` without consuming any input. These state IDs
+/// are written to `set` in the order they are visited, but only if they are
+/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
+/// given.
+///
+/// `look_have` consists of the satisfied assertions at the current
+/// position. For conditional look-around epsilon transitions, these are
+/// only followed if they are satisfied by `look_have`.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+pub(crate) fn epsilon_closure(
+ nfa: &thompson::NFA,
+ start_nfa_id: StateID,
+ look_have: LookSet,
+ stack: &mut Vec<StateID>,
+ set: &mut SparseSet,
+) {
+ assert!(stack.is_empty());
+ // If this isn't an epsilon state, then the epsilon closure is always just
+ // itself, so there's no need to spin up the machinery below to handle it.
+ if !nfa.state(start_nfa_id).is_epsilon() {
+ set.insert(start_nfa_id);
+ return;
+ }
+
+ stack.push(start_nfa_id);
+ while let Some(mut id) = stack.pop() {
+ // In many cases, we can avoid stack operations when an NFA state only
+ // adds one new state to visit. In that case, we just set our ID to
+ // that state and mush on. We only use the stack when an NFA state
+ // introduces multiple new states to visit.
+ loop {
+ // Insert this NFA state, and if it's already in the set and thus
+ // already visited, then we can move on to the next one.
+ if !set.insert(id) {
+ break;
+ }
+ match *nfa.state(id) {
+ thompson::State::Range { .. }
+ | thompson::State::Sparse { .. }
+ | thompson::State::Fail
+ | thompson::State::Match { .. } => break,
+ thompson::State::Look { look, next } => {
+ if !look_have.contains(look) {
+ break;
+ }
+ id = next;
+ }
+ thompson::State::Union { ref alternates } => {
+ id = match alternates.get(0) {
+ None => break,
+ Some(&id) => id,
+ };
+ // We need to process our alternates in order to preserve
+ // match preferences, so put the earliest alternates closer
+ // to the top of the stack.
+ stack.extend(alternates[1..].iter().rev());
+ }
+ thompson::State::Capture { next, .. } => {
+ id = next;
+ }
+ }
+ }
+ }
+}
+
+/// Add the NFA state IDs in the given `set` to the given DFA builder state.
+/// The order in which states are added corresponds to the order in which they
+/// were added to `set`.
+///
+/// The DFA builder state given should already have its complete set of match
+/// pattern IDs added (if any) and any look-behind assertions (StartLine,
+/// StartText and whether this state is being generated for a transition over a
+/// word byte when applicable) that are true immediately prior to transitioning
+/// into this state (via `builder.look_have()`). The match pattern IDs should
+/// correspond to matches that occurred on the previous transition, since all
+/// matches are delayed by one byte. The things that should _not_ be set are
+/// look-ahead assertions (EndLine, EndText and whether the next byte is a
+/// word byte or not). The builder state should also not have anything in
+/// `look_need` set, as this routine will compute that for you.
+///
+/// The given NFA should be able to resolve all identifiers in `set` to a
+/// particular NFA state. Additionally, `set` must have capacity equivalent
+/// to `nfa.len()`.
+pub(crate) fn add_nfa_states(
+ nfa: &thompson::NFA,
+ set: &SparseSet,
+ builder: &mut StateBuilderNFA,
+) {
+ for nfa_id in set {
+ match *nfa.state(nfa_id) {
+ thompson::State::Range { .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ }
+ thompson::State::Sparse { .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ }
+ thompson::State::Look { look, .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ builder.look_need().insert(look);
+ }
+ thompson::State::Union { .. }
+ | thompson::State::Capture { .. } => {
+ // Pure epsilon transitions don't need to be tracked
+ // as part of the DFA state. Tracking them is actually
+ // superfluous; they won't cause any harm other than making
+ // determinization slower.
+ //
+ // Why aren't these needed? Well, in an NFA, epsilon
+ // transitions are really just jumping points to other
+ // states. So once you hit an epsilon transition, the same
+ // set of resulting states always appears. Therefore,
+ // putting them in a DFA's set of ordered NFA states is
+ // strictly redundant.
+ //
+ // Look-around states are also epsilon transitions, but
+ // they are *conditional*. So their presence could be
+ // discriminatory, and thus, they are tracked above.
+ //
+ // But wait... why are epsilon states in our `set` in the
+ // first place? Why not just leave them out? They're in
+ // our `set` because it was generated by computing an
+ // epsilon closure, and we want to keep track of all states
+ // we visited to avoid re-visiting them. In exchange, we
+ // have to do this second iteration over our collected
+ // states to finalize our DFA state.
+ //
+ // Note that this optimization requires that we re-compute
+ // the epsilon closure to account for look-ahead in 'next'
+ // *only when necessary*. Namely, only when the set of
+ // look-around assertions changes and only when those
+ // changes are within the set of assertions that are
+ // needed in order to step through the closure correctly.
+ // Otherwise, if we re-do the epsilon closure needlessly,
+ // it could change based on the fact that we are omitting
+ // epsilon states here.
+ }
+ thompson::State::Fail => {
+ break;
+ }
+ thompson::State::Match { .. } => {
+ // Normally, the NFA match state doesn't actually need to
+ // be inside the DFA state. But since we delay matches by
+ // one byte, the matching DFA state corresponds to states
+ // that transition from the one we're building here. And
+ // the way we detect those cases is by looking for an NFA
+ // match state. See 'next' for how this is handled.
+ builder.add_nfa_state_id(nfa_id);
+ }
+ }
+ }
+ // If we know this state contains no look-around assertions, then
+ // there's no reason to track which look-around assertions were
+ // satisfied when this state was created.
+ if builder.look_need().is_empty() {
+ builder.look_have().clear();
+ }
+}
+
+/// Sets the appropriate look-behind assertions on the given state based on
+/// this starting configuration.
+pub(crate) fn set_lookbehind_from_start(
+ start: &Start,
+ builder: &mut StateBuilderMatches,
+) {
+ match *start {
+ Start::NonWordByte => {}
+ Start::WordByte => {
+ builder.set_is_from_word();
+ }
+ Start::Text => {
+ builder.look_have().insert(Look::StartText);
+ builder.look_have().insert(Look::StartLine);
+ }
+ Start::Line => {
+ builder.look_have().insert(Look::StartLine);
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Start;
+
+ #[test]
+ #[should_panic]
+ fn start_fwd_bad_range() {
+ Start::from_position_fwd(&[], 0, 1);
+ }
+
+ #[test]
+ #[should_panic]
+ fn start_rev_bad_range() {
+ Start::from_position_rev(&[], 0, 1);
+ }
+
+ #[test]
+ fn start_fwd() {
+ let f = Start::from_position_fwd;
+
+ assert_eq!(Start::Text, f(&[], 0, 0));
+ assert_eq!(Start::Text, f(b"abc", 0, 3));
+ assert_eq!(Start::Text, f(b"\nabc", 0, 3));
+
+ assert_eq!(Start::Line, f(b"\nabc", 1, 3));
+
+ assert_eq!(Start::WordByte, f(b"abc", 1, 3));
+
+ assert_eq!(Start::NonWordByte, f(b" abc", 1, 3));
+ }
+
+ #[test]
+ fn start_rev() {
+ let f = Start::from_position_rev;
+
+ assert_eq!(Start::Text, f(&[], 0, 0));
+ assert_eq!(Start::Text, f(b"abc", 0, 3));
+ assert_eq!(Start::Text, f(b"abc\n", 0, 4));
+
+ assert_eq!(Start::Line, f(b"abc\nz", 0, 3));
+
+ assert_eq!(Start::WordByte, f(b"abc", 0, 2));
+
+ assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3));
+ }
+}
diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs
new file mode 100644
index 000000000..567e600d6
--- /dev/null
+++ b/vendor/regex-automata/src/util/determinize/state.rs
@@ -0,0 +1,873 @@
+/*!
+This module defines a DFA state representation and builders for constructing
+DFA states.
+
+This representation is specifically for use in implementations of NFA-to-DFA
+conversion via powerset construction. (Also called "determinization" in this
+crate.)
+
+The term "DFA state" is somewhat overloaded in this crate. In some cases, it
+refers to the set of transitions over an alphabet for a particular state. In
+other cases, it refers to a set of NFA states. The former is really about the
+final representation of a state in a DFA's transition table, whereas the
+latter---what this module is focused on---is closer to an intermediate form that
+is used to help eventually build the transition table.
+
+This module exports four types. All four types represent the same idea: an
+ordered set of NFA states. This ordered set represents the epsilon closure of a
+particular NFA state, where the "epsilon closure" is the set of NFA states that
+can be transitioned to without consuming any input, i.e., by following all of the NFA
+state's epsilon transitions. In addition, this implementation of DFA states
+cares about two other things: the ordered set of pattern IDs corresponding
+to the patterns that match if the state is a match state, and the set of
+look-behind assertions that were true when the state was created.
+
+The first, `State`, is a frozen representation of a state that cannot be
+modified. It may be cheaply cloned without copying the state itself and can be
+accessed safely from multiple threads simultaneously. This type is useful for
+when one knows that the DFA state being constructed is distinct from any other
+previously constructed states. Namely, powerset construction, in practice,
+requires one to keep a cache of previously created DFA states. Otherwise,
+the number of DFA states created in memory balloons to an impractically
+large number. For this reason, equivalent states should endeavor to have an
+equivalent byte-level representation. (In general, "equivalency" here means,
+"equivalent assertions, pattern IDs and NFA state IDs." We do not require that
+full DFA minimization be implemented here. This form of equivalency is only
+surface deep and is more-or-less a practical necessity.)
+
+The other three types represent different phases in the construction of a
+DFA state. Internally, these three types (and `State`) all use the same
+byte-oriented representation. That means one can use any of the builder types
+to check whether the state it represents already exists or not. If it does,
+then there is no need to freeze it into a `State` (which requires an alloc and
+a copy). Here are the three types described succinctly:
+
+* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions
+and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
+`StateBuilderEmpty` can only be used to query its underlying memory capacity,
+or to convert into a builder for recording pattern IDs and/or assertions.
+* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
+or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
+can only be used for adding pattern IDs and recording assertions.
+* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
+more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
+can only be used for adding NFA state IDs and recording some assertions.
+
+The expected flow here is to use the above builders to construct a candidate
+DFA state to check if it already exists. If it does, then there's no need to
+freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state`
+can be called to freeze the builder into an immutable `State`. In either
+case, `clear` should be called on the builder to turn it back into a
+`StateBuilderEmpty` that reuses the underlying memory.
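+
+As an illustrative sketch of that flow (the `cache` map and `value` below are
+hypothetical, standing in for whatever state cache an implementation keeps):
+
+```text
+let mut b = StateBuilderEmpty::new().into_matches();
+b.add_match_pattern_id(PatternID::ZERO);
+let mut b = b.into_nfa();
+// ... add NFA state IDs via b.add_nfa_state_id(..) ...
+if !cache.contains_key(b.as_bytes()) {
+    cache.insert(b.to_state(), value);
+}
+let reusable = b.clear(); // back to a StateBuilderEmpty, allocation kept
+```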
+
+The main purpose for splitting the builder into these distinct types is to
+make it impossible to do things like adding a pattern ID after adding an NFA
+state ID. Namely, this makes it simpler to use a space-and-time efficient
+binary representation for the state. (The format is documented on the `Repr`
+type below.) If we just used one type for everything, it would be possible for
+callers to use an incorrect interleaving of calls and thus result in a corrupt
+representation. I chose to use more type machinery to make this impossible to
+do because 1) determinization is itself pretty complex and it wouldn't be too
+hard to foul this up and 2) there isn't too much machinery involved and it's
+well contained.
+
+As an optimization, sometimes states won't have certain things set. For
+example, if the underlying NFA has no word boundary assertions, then there is
+no reason to set a state's look-behind assertion as to whether it was generated
+from a word byte or not. Similarly, if a state has no NFA states corresponding
+to look-around assertions, then there is no reason to set `look_have` to a
+non-empty set. Finally, callers usually omit unconditional epsilon transitions
+when adding NFA state IDs since they aren't discriminatory.
+
+Finally, the binary representation used by these states is, thankfully, not
+serialized anywhere. So any kind of change can be made with reckless abandon,
+as long as everything in this module agrees.
+*/
+
+use core::{convert::TryFrom, mem};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::{
+ nfa::thompson::LookSet,
+ util::{
+ bytes::{self, Endian},
+ id::{PatternID, StateID},
+ },
+};
+
+/// A DFA state that, at its core, is represented by an ordered set of NFA
+/// states.
+///
+/// This type is intended to be used only in NFA-to-DFA conversion via powerset
+/// construction.
+///
+/// It may be cheaply cloned and accessed safely from multiple threads
+/// simultaneously.
+#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub(crate) struct State(Arc<[u8]>);
+
+/// This Borrow impl permits us to lookup any state in a map by its byte
+/// representation. This is particularly convenient when one has a StateBuilder
+/// and we want to see if a correspondingly equivalent state already exists. If
+/// one does exist, then we can reuse the allocation required by StateBuilder
+/// without having to convert it into a State first.
+impl core::borrow::Borrow<[u8]> for State {
+ fn borrow(&self) -> &[u8] {
+ &*self.0
+ }
+}
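+
+// For example (illustrative only), given a hypothetical cache of type
+// HashMap<State, StateID>, this impl permits probing by a builder's raw
+// bytes without first allocating a State:
+//
+//   if let Some(&sid) = cache.get(builder.as_bytes()) {
+//       return sid;
+//   }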
+
+impl core::fmt::Debug for State {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("State").field(&self.repr()).finish()
+ }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl State {
+ pub(crate) fn dead() -> State {
+ StateBuilderEmpty::new().into_matches().into_nfa().to_state()
+ }
+
+ pub(crate) fn is_match(&self) -> bool {
+ self.repr().is_match()
+ }
+
+ pub(crate) fn is_from_word(&self) -> bool {
+ self.repr().is_from_word()
+ }
+
+ pub(crate) fn look_have(&self) -> LookSet {
+ self.repr().look_have()
+ }
+
+ pub(crate) fn look_need(&self) -> LookSet {
+ self.repr().look_need()
+ }
+
+ pub(crate) fn match_count(&self) -> usize {
+ self.repr().match_count()
+ }
+
+ pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
+ self.repr().match_pattern(index)
+ }
+
+ pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
+ self.repr().match_pattern_ids()
+ }
+
+ pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
+ self.repr().iter_match_pattern_ids(f)
+ }
+
+ pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) {
+ self.repr().iter_nfa_state_ids(f)
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.0.len()
+ }
+
+ fn repr(&self) -> Repr<'_> {
+ Repr(&*self.0)
+ }
+}
+
+/// A state builder that represents an empty state.
+///
+/// This is a useful "initial condition" for state construction. It has no
+/// NFA state IDs, no assertions set and no pattern IDs. No allocations are
+/// made when new() is called. Its main use is for being converted into a
+/// builder that can capture assertions and pattern IDs.
+#[derive(Clone, Debug)]
+pub(crate) struct StateBuilderEmpty(Vec<u8>);
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderEmpty {
+ pub(crate) fn new() -> StateBuilderEmpty {
+ StateBuilderEmpty(alloc::vec![])
+ }
+
+ pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
+ self.0.extend_from_slice(&[0, 0, 0]);
+ StateBuilderMatches(self.0)
+ }
+
+ fn clear(&mut self) {
+ self.0.clear();
+ }
+
+ pub(crate) fn capacity(&self) -> usize {
+ self.0.capacity()
+ }
+}
+
+/// A state builder that collects assertions and pattern IDs.
+///
+/// When collecting pattern IDs is finished, this can be converted into a
+/// builder that collects NFA state IDs.
+#[derive(Clone)]
+pub(crate) struct StateBuilderMatches(Vec<u8>);
+
+impl core::fmt::Debug for StateBuilderMatches {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish()
+ }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderMatches {
+ pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
+ self.repr_vec().close_match_pattern_ids();
+ StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
+ }
+
+ pub(crate) fn clear(self) -> StateBuilderEmpty {
+ let mut builder = StateBuilderEmpty(self.0);
+ builder.clear();
+ builder
+ }
+
+ pub(crate) fn is_match(&self) -> bool {
+ self.repr().is_match()
+ }
+
+ pub(crate) fn is_from_word(&self) -> bool {
+ self.repr().is_from_word()
+ }
+
+ pub(crate) fn set_is_from_word(&mut self) {
+ self.repr_vec().set_is_from_word()
+ }
+
+ pub(crate) fn look_have(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.0[1])
+ }
+
+ pub(crate) fn look_need(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.0[2])
+ }
+
+ pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
+ self.repr_vec().add_match_pattern_id(pid)
+ }
+
+ fn repr(&self) -> Repr<'_> {
+ Repr(&self.0)
+ }
+
+ fn repr_vec(&mut self) -> ReprVec<'_> {
+ ReprVec(&mut self.0)
+ }
+}
+
+/// A state builder that collects some assertions and NFA state IDs.
+///
+/// When collecting NFA state IDs is finished, this can be used to build a
+/// `State` if necessary.
+///
+/// When done with building a state (regardless of whether it got kept or not),
+/// it's usually a good idea to call `clear` to get an empty builder back so
+/// that it can be reused to build the next state.
+#[derive(Clone)]
+pub(crate) struct StateBuilderNFA {
+ repr: Vec<u8>,
+ prev_nfa_state_id: StateID,
+}
+
+impl core::fmt::Debug for StateBuilderNFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
+ }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderNFA {
+ pub(crate) fn to_state(&self) -> State {
+ State(Arc::from(&*self.repr))
+ }
+
+ pub(crate) fn clear(self) -> StateBuilderEmpty {
+ let mut builder = StateBuilderEmpty(self.repr);
+ builder.clear();
+ builder
+ }
+
+ pub(crate) fn is_match(&self) -> bool {
+ self.repr().is_match()
+ }
+
+ pub(crate) fn is_from_word(&self) -> bool {
+ self.repr().is_from_word()
+ }
+
+ pub(crate) fn look_have(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.repr[1])
+ }
+
+ pub(crate) fn look_need(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.repr[2])
+ }
+
+ pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
+ ReprVec(&mut self.repr)
+ .add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
+ }
+
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.repr.len()
+ }
+
+ pub(crate) fn as_bytes(&self) -> &[u8] {
+ &self.repr
+ }
+
+ fn repr(&self) -> Repr<'_> {
+ Repr(&self.repr)
+ }
+
+ fn repr_vec(&mut self) -> ReprVec<'_> {
+ ReprVec(&mut self.repr)
+ }
+}
+
+/// Repr is a read-only view into the representation of a DFA state.
+///
+/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
+/// in one place, and then use a Repr to implement the various methods on the
+/// public state types.
+///
+/// The format is as follows:
+///
+/// The first three bytes correspond to bitsets.
+///
+/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
+/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
+/// if the state has pattern IDs explicitly written to it. (This is a flag that
+/// is not meant to be set by determinization, but rather, is used as part of
+/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
+/// generated by a transition over a "word" byte. (Callers may not always set
+/// this. For example, if the NFA has no word boundary assertion, then needing
+/// to track whether a state came from a word byte or not is superfluous and
+/// wasteful.)
+///
+/// Byte 1 corresponds to the look-behind assertions that were satisfied by
+/// the transition that created this state. This generally only includes the
+/// StartLine and StartText assertions. (Look-ahead assertions are not tracked
+/// as part of states. Instead, these are applied by re-computing the epsilon
+/// closure of a state when computing the transition function. See `next` in
+/// the parent module.)
+///
+/// Byte 2 corresponds to the set of look-around assertions (including both
+/// look-behind and look-ahead) that appear somewhere in this state's set of
+/// NFA state IDs. This is used to determine whether this state's epsilon
+/// closure should be re-computed when computing the transition function.
+/// Namely, look-around assertions are "just" conditional epsilon transitions,
+/// so if there are new assertions available when computing the transition
+/// function, we should only re-compute the epsilon closure if those new
+/// assertions are relevant to this particular state.
+///
+/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer
+/// corresponding to the number of patterns encoded in this state. If the state
+/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is
+/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
+/// offset 3 is the position at which the first NFA state ID is encoded.
+///
+/// For a match state with at least one non-ZERO pattern ID, the next bytes
+/// correspond to a sequence of 32-bit native endian encoded integers that
+/// represent each pattern ID, in order, that this match state represents.
+///
+/// After the pattern IDs (if any), NFA state IDs are delta encoded as
+/// varints.[1] The first NFA state ID is encoded as itself, and each
+/// subsequent NFA state ID is encoded as the difference between itself and the
+/// previous NFA state ID.
+///
+/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
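+///
+/// As a concrete (illustrative) example: a match state whose only pattern
+/// ID is PatternID::ZERO, with empty look sets and NFA state IDs {2, 5},
+/// is encoded as the five bytes
+///
+///   [0b0000_0001, 0x00, 0x00, 0x04, 0x06]
+///
+/// Byte 0 has only the 'is match' bit set, bytes 1 and 2 are the empty
+/// look sets, no pattern count or pattern IDs are written (the ZERO-only
+/// optimization described above) and the NFA state IDs are the zig-zag
+/// varint encodings of the deltas 2-0=2 (0x04) and 5-2=3 (0x06).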
+struct Repr<'a>(&'a [u8]);
+
+impl<'a> Repr<'a> {
+ /// Returns true if and only if this is a match state.
+ ///
+ /// If callers have added pattern IDs to this state, then callers MUST set
+ /// this state as a match state explicitly. However, as a special case,
+ /// a state that is marked as a match state but has no pattern IDs is
+ /// treated as if it had a single pattern ID equivalent to
+ /// PatternID::ZERO.
+ fn is_match(&self) -> bool {
+ self.0[0] & (1 << 0) > 0
+ }
+
+ /// Returns true if and only if this state has had at least one pattern
+ /// ID added to it.
+ ///
+ /// This is an internal-only flag that permits the representation to save
+ /// space in the common case of an NFA with one pattern in it. In that
+ /// case, a match state can only ever have exactly one pattern ID:
+ /// PatternID::ZERO. So there's no need to represent it.
+ fn has_pattern_ids(&self) -> bool {
+ self.0[0] & (1 << 1) > 0
+ }
+
+ /// Returns true if and only if this state is marked as having been created
+ /// from a transition over a word byte. This is useful for checking whether
+ /// a word boundary assertion is true or not, which requires look-behind
+ /// (whether the current state came from a word byte or not) and look-ahead
+ /// (whether the transition byte is a word byte or not).
+ ///
+ /// Since states with this set are distinct from states that don't have
+ /// this set (even if they are otherwise equivalent), callers should not
+ /// set this assertion unless the underlying NFA has at least one word
+ /// boundary assertion somewhere. Otherwise, a superfluous number of states
+ /// may be created.
+ fn is_from_word(&self) -> bool {
+ self.0[0] & (1 << 2) > 0
+ }
+
+ /// The set of look-behind assertions that were true in the transition that
+ /// created this state.
+ ///
+ /// Generally, this should be empty if 'look_need' is empty, since there is
+ /// no reason to track which look-behind assertions are true if the state
+ /// has no conditional epsilon transitions.
+ ///
+ /// Satisfied look-ahead assertions are not tracked in states. Instead,
+ /// these are re-computed on demand via epsilon closure when computing the
+ /// transition function.
+ fn look_have(&self) -> LookSet {
+ LookSet::from_repr(self.0[1])
+ }
+
+ /// The set of look-around (both behind and ahead) assertions that appear
+ /// at least once in this state's set of NFA states.
+ ///
+ /// This is used to determine whether the epsilon closure needs to be
+ /// re-computed when computing the transition function. Namely, if the
+ /// state has no conditional epsilon transitions, then there is no need
+ /// to re-compute the epsilon closure.
+ fn look_need(&self) -> LookSet {
+ LookSet::from_repr(self.0[2])
+ }
+
+ /// Returns the total number of match pattern IDs in this state.
+ ///
+ /// If this state is not a match state, then this always returns 0.
+ fn match_count(&self) -> usize {
+ if !self.is_match() {
+ 0
+ } else if !self.has_pattern_ids() {
+ 1
+ } else {
+ self.encoded_pattern_count()
+ }
+ }
+
+ /// Returns the pattern ID for this match state at the given index.
+ ///
+ /// If the given index is greater than or equal to `match_count()` for this
+ /// state, then this could panic or return incorrect results.
+ fn match_pattern(&self, index: usize) -> PatternID {
+ if !self.has_pattern_ids() {
+ PatternID::ZERO
+ } else {
+ let offset = 7 + index * PatternID::SIZE;
+ // This is OK since we only ever serialize valid PatternIDs to
+ // states.
+ bytes::read_pattern_id_unchecked(&self.0[offset..]).0
+ }
+ }
+
+ /// Returns a copy of all match pattern IDs in this state. If this state
+ /// is not a match state, then this returns None.
+ fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
+ if !self.is_match() {
+ return None;
+ }
+ let mut pids = alloc::vec![];
+ self.iter_match_pattern_ids(|pid| pids.push(pid));
+ Some(pids)
+ }
+
+ /// Calls the given function on every pattern ID in this state.
+ fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) {
+ if !self.is_match() {
+ return;
+ }
+ // As an optimization for a very common case, when this is a match
+ // state for an NFA with only one pattern, we don't actually write the
+ // pattern ID to the state representation. Instead, we know it must
+ // be there since it is the only possible choice.
+ if !self.has_pattern_ids() {
+ f(PatternID::ZERO);
+ return;
+ }
+ let mut pids = &self.0[7..self.pattern_offset_end()];
+ while !pids.is_empty() {
+ let pid = bytes::read_u32(pids);
+ pids = &pids[PatternID::SIZE..];
+ // This is OK since we only ever serialize valid PatternIDs to
+ // states. And since pattern IDs can never exceed a usize, the
+ // unwrap is OK.
+ f(PatternID::new_unchecked(usize::try_from(pid).unwrap()));
+ }
+ }
+
+ /// Calls the given function on every NFA state ID in this state.
+ fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) {
+ let mut sids = &self.0[self.pattern_offset_end()..];
+ let mut prev = 0i32;
+ while !sids.is_empty() {
+ let (delta, nr) = read_vari32(sids);
+ sids = &sids[nr..];
+ let sid = prev + delta;
+ prev = sid;
+ // This is OK since we only ever serialize valid StateIDs to
+ // states. And since state IDs can never exceed an isize, they must
+ // always be able to fit into a usize, and thus cast is OK.
+ f(StateID::new_unchecked(sid as usize))
+ }
+ }
+
+ /// Returns the offset into this state's representation where the pattern
+ /// IDs end and the NFA state IDs begin.
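+///
+/// For example, with two encoded pattern IDs the offset is 2 * 4 + 7 = 15:
+/// three leading flag/look-set bytes, a 4-byte pattern count and two
+/// 4-byte pattern IDs.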
+ fn pattern_offset_end(&self) -> usize {
+ let encoded = self.encoded_pattern_count();
+ if encoded == 0 {
+ return 3;
+ }
+ // This arithmetic is OK since we were able to address this many bytes
+ // when writing to the state, thus, it must fit into a usize.
+ encoded.checked_mul(4).unwrap().checked_add(7).unwrap()
+ }
+
+ /// Returns the total number of *encoded* pattern IDs in this state.
+ ///
+ /// This may return 0 even when this is a match state, since the pattern
+ /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
+ /// the match state (the overwhelmingly common case).
+ fn encoded_pattern_count(&self) -> usize {
+ if !self.has_pattern_ids() {
+ return 0;
+ }
+ // This unwrap is OK since the total number of patterns is always
+ // guaranteed to fit into a usize.
+ usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap()
+ }
+}
+
+impl<'a> core::fmt::Debug for Repr<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut nfa_ids = alloc::vec![];
+ self.iter_nfa_state_ids(|sid| nfa_ids.push(sid));
+ f.debug_struct("Repr")
+ .field("is_match", &self.is_match())
+ .field("is_from_word", &self.is_from_word())
+ .field("look_have", &self.look_have())
+ .field("look_need", &self.look_need())
+ .field("match_pattern_ids", &self.match_pattern_ids())
+ .field("nfa_state_ids", &nfa_ids)
+ .finish()
+ }
+}
+
+/// ReprVec is a write-only view into the representation of a DFA state.
+///
+/// See Repr for more details on the purpose of this type and also the format.
+///
+/// Note that not all possible combinations of methods may be called. This is
+/// precisely what the various StateBuilder types encapsulate: they only
+/// permit valid combinations via Rust's linear typing.
+struct ReprVec<'a>(&'a mut Vec<u8>);
+
+impl<'a> ReprVec<'a> {
+ /// Set this state as a match state.
+ ///
+ /// This should not be exposed explicitly outside of this module. It is
+ /// set automatically when a pattern ID is added.
+ fn set_is_match(&mut self) {
+ self.0[0] |= 1 << 0;
+ }
+
+ /// Set that this state has pattern IDs explicitly written to it.
+ ///
+ /// This should not be exposed explicitly outside of this module. This is
+ /// used internally as a space saving optimization. Namely, if the state
+ /// is a match state but does not have any pattern IDs written to it,
+ /// then it is automatically inferred to have a pattern ID of ZERO.
+ fn set_has_pattern_ids(&mut self) {
+ self.0[0] |= 1 << 1;
+ }
+
+ /// Set this state as being built from a transition over a word byte.
+ ///
+ /// Setting this is only necessary when one needs to deal with word
+ /// boundary assertions. Therefore, if the underlying NFA has no word
+ /// boundary assertions, callers should not set this.
+ fn set_is_from_word(&mut self) {
+ self.0[0] |= 1 << 2;
+ }
+
+ /// Return a mutable reference to the 'look_have' assertion set.
+ fn look_have_mut(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.0[1])
+ }
+
+ /// Return a mutable reference to the 'look_need' assertion set.
+ fn look_need_mut(&mut self) -> &mut LookSet {
+ LookSet::from_repr_mut(&mut self.0[2])
+ }
+
+ /// Add a pattern ID to this state. All match states must have at least
+ /// one pattern ID associated with them.
+ ///
+ /// Callers must never add duplicative pattern IDs.
+ ///
+ /// The order in which patterns are added must correspond to the order
+ /// in which patterns are reported as matches.
+ fn add_match_pattern_id(&mut self, pid: PatternID) {
+ // As a (somewhat small) space saving optimization, in the case where
+ // a matching state has exactly one pattern ID, PatternID::ZERO, we do
+ // not write either the pattern ID or the number of patterns encoded.
+ // Instead, all we do is set the 'is_match' bit on this state. Overall,
+ // this saves 8 bytes per match state for the overwhelming majority of
+ // match states.
+ //
+ // In order to know whether pattern IDs need to be explicitly read or
+ // not, we use another internal-only bit, 'has_pattern_ids', to
+ // indicate whether they have been explicitly written or not.
+ if !self.repr().has_pattern_ids() {
+ if pid == PatternID::ZERO {
+ self.set_is_match();
+ return;
+ }
+ // Make room for 'close_match_pattern_ids' to write the total
+ // number of pattern IDs written.
+ self.0.extend(core::iter::repeat(0).take(PatternID::SIZE));
+ self.set_has_pattern_ids();
+ // If this was already a match state, then the only way that's
+ // possible when the state doesn't have pattern IDs is if
+ // PatternID::ZERO was added by the caller previously. In this
+ // case, we are now adding a non-ZERO pattern ID after it, in
+ // which case, we want to make sure to represent ZERO explicitly
+ // now.
+ if self.repr().is_match() {
+ write_u32(self.0, 0)
+ } else {
+ // Otherwise, just make sure the 'is_match' bit is set.
+ self.set_is_match();
+ }
+ }
+ write_u32(self.0, pid.as_u32());
+ }
+
+ /// Indicate that no more pattern IDs will be added to this state.
+ ///
+ /// Once this is called, callers must not call it or 'add_match_pattern_id'
+ /// again.
+ ///
+ /// This should not be exposed explicitly outside of this module. It
+ /// should be called only when converting a StateBuilderMatches into a
+ /// StateBuilderNFA.
+ fn close_match_pattern_ids(&mut self) {
+ // If we never wrote any pattern IDs, then there's nothing to do here.
+ if !self.repr().has_pattern_ids() {
+ return;
+ }
+ let patsize = PatternID::SIZE;
+ let pattern_bytes = self.0.len() - 7;
+ // Every pattern ID uses 4 bytes, so the number of bytes should be
+ // divisible by 4.
+ assert_eq!(pattern_bytes % patsize, 0);
+ // This unwrap is OK since we are guaranteed that the maximum number
+ // of possible patterns fits into a u32.
+ let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
+ bytes::NE::write_u32(count32, &mut self.0[3..7]);
+ }
+
+ /// Add an NFA state ID to this state. The order in which NFA states are
+ /// added matters. It is the caller's responsibility to ensure that
+ /// duplicate NFA state IDs are not added.
+ fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
+ let delta = sid.as_i32() - prev.as_i32();
+ write_vari32(self.0, delta);
+ *prev = sid;
+ }
+
+ /// Return a read-only view of this state's representation.
+ fn repr(&self) -> Repr<'_> {
+ Repr(self.0.as_slice())
+ }
+}
+
+/// Write a signed 32-bit integer using zig-zag encoding.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
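+///
+/// Zig-zag encoding maps integers of small magnitude (of either sign) to
+/// small unsigned values: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, and
+/// so on. This keeps the varints for small NFA state ID deltas short.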
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+ let mut un = (n as u32) << 1;
+ if n < 0 {
+ un = !un;
+ }
+ write_varu32(data, un)
+}
+
+/// Read a signed 32-bit integer using zig-zag encoding. Also, return the
+/// number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+ let (un, i) = read_varu32(data);
+ let mut n = (un >> 1) as i32;
+ if un & 1 != 0 {
+ n = !n;
+ }
+ (n, i)
+}
+
+/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written
+/// as a sequence of bytes where all bytes except for the last one have the
+/// most significant bit set. The least significant 7 bits correspond to the
+/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in
+/// very common cases, it uses fewer than 4.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+ while n >= 0b1000_0000 {
+ data.push((n as u8) | 0b1000_0000);
+ n >>= 7;
+ }
+ data.push(n as u8);
+}
+
+/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+ // N.B. We can assume correctness here since we know that all varuints are
+ // written with write_varu32. Hence, the 'as' casts and the unchecked
+ // arithmetic are all okay.
+ let mut n: u32 = 0;
+ let mut shift: u32 = 0;
+ for (i, &b) in data.iter().enumerate() {
+ if b < 0b1000_0000 {
+ return (n | ((b as u32) << shift), i + 1);
+ }
+ n |= ((b as u32) & 0b0111_1111) << shift;
+ shift += 7;
+ }
+ (0, 0)
+}
+
+/// Push a native-endian encoded `n` on to `dst`.
+fn write_u32(dst: &mut Vec<u8>, n: u32) {
+ use crate::util::bytes::{Endian, NE};
+
+ let start = dst.len();
+ dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>()));
+ NE::write_u32(n, &mut dst[start..]);
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::vec;
+
+ use quickcheck::quickcheck;
+
+ use super::*;
+
+ quickcheck! {
+ fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
+ // Builder states do not permit duplicate IDs.
+ let sids = dedup_state_ids(sids);
+
+ let mut b = StateBuilderEmpty::new().into_matches().into_nfa();
+ for &sid in &sids {
+ b.add_nfa_state_id(sid);
+ }
+ let s = b.to_state();
+ let mut got = vec![];
+ s.iter_nfa_state_ids(|sid| got.push(sid));
+ got == sids
+ }
+
+ fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
+ // Builder states do not permit duplicate IDs.
+ let pids = dedup_pattern_ids(pids);
+
+ let mut b = StateBuilderEmpty::new().into_matches();
+ for &pid in &pids {
+ b.add_match_pattern_id(pid);
+ }
+ let s = b.into_nfa().to_state();
+ let mut got = vec![];
+ s.iter_match_pattern_ids(|pid| got.push(pid));
+ got == pids
+ }
+
+ fn prop_state_read_write_nfa_state_and_pattern_ids(
+ sids: Vec<StateID>,
+ pids: Vec<PatternID>
+ ) -> bool {
+ // Builder states do not permit duplicate IDs.
+ let sids = dedup_state_ids(sids);
+ let pids = dedup_pattern_ids(pids);
+
+ let mut b = StateBuilderEmpty::new().into_matches();
+ for &pid in &pids {
+ b.add_match_pattern_id(pid);
+ }
+
+ let mut b = b.into_nfa();
+ for &sid in &sids {
+ b.add_nfa_state_id(sid);
+ }
+
+ let s = b.to_state();
+ let mut got_pids = vec![];
+ s.iter_match_pattern_ids(|pid| got_pids.push(pid));
+ let mut got_sids = vec![];
+ s.iter_nfa_state_ids(|sid| got_sids.push(sid));
+ got_pids == pids && got_sids == sids
+ }
+
+ fn prop_read_write_varu32(n: u32) -> bool {
+ let mut buf = vec![];
+ write_varu32(&mut buf, n);
+ let (got, nread) = read_varu32(&buf);
+ nread == buf.len() && got == n
+ }
+
+ fn prop_read_write_vari32(n: i32) -> bool {
+ let mut buf = vec![];
+ write_vari32(&mut buf, n);
+ let (got, nread) = read_vari32(&buf);
+ nread == buf.len() && got == n
+ }
+ }
+
+ fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
+ let mut set = alloc::collections::BTreeSet::new();
+ let mut deduped = vec![];
+ for sid in sids {
+ if set.contains(&sid) {
+ continue;
+ }
+ set.insert(sid);
+ deduped.push(sid);
+ }
+ deduped
+ }
+
+ fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
+ let mut set = alloc::collections::BTreeSet::new();
+ let mut deduped = vec![];
+ for pid in pids {
+ if set.contains(&pid) {
+ continue;
+ }
+ set.insert(pid);
+ deduped.push(pid);
+ }
+ deduped
+ }
+}
diff --git a/vendor/regex-automata/src/util/id.rs b/vendor/regex-automata/src/util/id.rs
new file mode 100644
index 000000000..70bf0a93b
--- /dev/null
+++ b/vendor/regex-automata/src/util/id.rs
@@ -0,0 +1,608 @@
+/*!
+Type definitions for identifier types.
+
+A [`StateID`] represents the possible set of identifiers used in regex engine
+implementations in this crate. For example, they are used to identify both NFA
+and DFA states.
+
+A [`PatternID`] represents the possible set of identifiers for patterns. All
+regex engine implementations in this crate support searching for multiple
+patterns simultaneously. A `PatternID` is how each pattern is uniquely
+identified for a particular instance of a regex engine. Namely, a pattern is
+assigned an auto-incrementing integer, starting at `0`, based on the order of
+patterns supplied during the construction of the regex engine.
+
+These identifier types represent a way for this crate to make correctness
+guarantees around the possible set of values that a `StateID` or a `PatternID`
+might represent. Similarly, they also provide a way of constraining the size of
+these identifiers to reduce space usage while still guaranteeing that all such
+identifiers are representable by a `usize` for the current target.
+
+Moreover, the identifier types clamp the range of permissible values to a
+range that is typically smaller than their internal representation. (With the
+maximum value being, e.g., `StateID::MAX`.) Users of these types may not rely
+on this
+clamping for the purpose of memory safety. Users may, however, rely on these
+invariants to avoid panics or other types of logic bugs.
+*/
+
+// Continuing from the above comment about correctness guarantees, an example
+// of a way in which we use the guarantees on these types is delta encoding.
+// Namely, we require that IDs can be at most 2^31 - 2, which means the
+// difference between any two IDs is always representable as an i32: the
+// extreme deltas are +/-(2^31 - 2), both of which fit comfortably within
+// i32's range of [-2^31, 2^31 - 1].
+
+use core::{
+ convert::{Infallible, TryFrom},
+ mem, ops,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+/// An identifier for a regex pattern.
+///
+/// The identifier for a pattern corresponds to its relative position among
+/// other patterns in a single finite state machine. Namely, when building
+/// a multi-pattern regex engine, one must supply a sequence of patterns to
+/// match. The position (starting at 0) of each pattern in that sequence
+/// represents its identifier. This identifier is in turn used to identify and
+/// report matches of that pattern in various APIs.
+///
+/// A pattern ID is guaranteed to be representable by a `usize`. Similarly,
+/// the number of patterns in any regex engine in this crate is guaranteed to
+/// be representable by a `usize`. This applies to regex engines that have
+/// been deserialized; a deserialization error will be returned if it contains
+/// pattern IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `PatternID` to index slices.
+///
+/// # Safety
+///
+/// While a `PatternID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
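+///
+/// # Example
+///
+/// A short usage sketch (the import path assumes the crate-root re-export of
+/// this type):
+///
+/// ```
+/// use regex_automata::PatternID;
+///
+/// let pid = PatternID::new(5).unwrap();
+/// assert_eq!(pid.as_usize(), 5);
+/// assert_eq!(pid.one_more(), 6);
+///
+/// // Pattern IDs may be used to index slices directly.
+/// let counts = [10, 20, 30, 40, 50, 60];
+/// assert_eq!(counts[pid], 60);
+/// ```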
+#[repr(transparent)]
+#[derive(
+ Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct PatternID(u32);
+
+impl PatternID {
+ /// The maximum pattern ID value.
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ pub const MAX: PatternID =
+ PatternID::new_unchecked(core::i32::MAX as usize - 1);
+
+ /// The maximum pattern ID value.
+ #[cfg(target_pointer_width = "16")]
+ pub const MAX: PatternID =
+ PatternID::new_unchecked(core::isize::MAX as usize - 1);
+
+ /// The total number of patterns that are allowed in any single regex
+ /// engine, represented as a `usize`.
+ pub const LIMIT: usize = PatternID::MAX.as_usize() + 1;
+
+ /// The zero pattern ID value.
+ pub const ZERO: PatternID = PatternID::new_unchecked(0);
+
+ /// The number of bytes that a single `PatternID` uses in memory.
+ pub const SIZE: usize = core::mem::size_of::<PatternID>();
+
+ /// Create a new pattern ID.
+ ///
+ /// If the given identifier exceeds [`PatternID::MAX`], then this returns
+ /// an error.
+ #[inline]
+ pub fn new(id: usize) -> Result<PatternID, PatternIDError> {
+ PatternID::try_from(id)
+ }
+
+ /// Create a new pattern ID without checking whether the given value
+ /// exceeds [`PatternID::MAX`].
+ ///
+ /// While this is unchecked, providing an incorrect value must never
+ /// sacrifice memory safety, as documented above.
+ #[inline]
+ pub const fn new_unchecked(id: usize) -> PatternID {
+ PatternID(id as u32)
+ }
+
+ /// Like [`PatternID::new`], but panics if the given ID is not valid.
+ #[inline]
+ pub fn must(id: usize) -> PatternID {
+ PatternID::new(id).unwrap()
+ }
+
+ /// Return this pattern ID as a `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ self.0 as usize
+ }
+
+ /// Return the internal u32 of this pattern ID.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0
+ }
+
+ /// Return the internal u32 of this pattern ID represented as an i32.
+ ///
+ /// This is guaranteed to never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ self.0 as i32
+ }
+
+ /// Returns one more than this pattern ID as a usize.
+ ///
+ /// Since a pattern ID has constraints on its maximum value, adding `1` to
+ /// it will always fit in a `usize` (and a `u32`).
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.as_usize().checked_add(1).unwrap()
+ }
+
+ /// Decode this pattern ID from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// If the decoded integer is not representable as a pattern ID for the
+ /// current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> {
+ let id = u32::from_ne_bytes(bytes);
+ if id > PatternID::MAX.as_u32() {
+ return Err(PatternIDError { attempted: id as u64 });
+ }
+ Ok(PatternID::new_unchecked(id as usize))
+ }
+
+ /// Decode this pattern ID from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// This is analogous to [`PatternID::new_unchecked`] in that it does not
+ /// check whether the decoded integer is representable as a pattern ID.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID {
+ PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+ }
+
+ /// Return the underlying pattern ID integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+
+ /// Returns an iterator over all pattern IDs from 0 up to but not including
+ /// the given length.
+ ///
+ /// If the given length exceeds [`PatternID::LIMIT`], then this panics.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn iter(len: usize) -> PatternIDIter {
+ PatternIDIter::new(len)
+ }
+}
+
+/// This error occurs when a pattern ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum pattern ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PatternIDError {
+ attempted: u64,
+}
+
+impl PatternIDError {
+ /// Returns the value that could not be converted into a pattern ID.
+ pub fn attempted(&self) -> u64 {
+ self.attempted
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for PatternIDError {}
+
+impl core::fmt::Display for PatternIDError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create PatternID from {:?}, which exceeds {:?}",
+ self.attempted(),
+ PatternID::MAX,
+ )
+ }
+}
+
+/// An identifier for a state in a regex engine.
+///
+/// A state ID is guaranteed to be representable by a `usize`. Similarly, the
+/// number of states in any regex engine in this crate is guaranteed to be
+/// representable by a `usize`. This applies to regex engines that have been
+/// deserialized; a deserialization error will be returned if it contains state
+/// IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `StateID` to index slices.
+///
+/// # Safety
+///
+/// While a `StateID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
+#[repr(transparent)]
+#[derive(
+ Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct StateID(u32);
+
+impl StateID {
+ /// The maximum state ID value.
+ #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+ pub const MAX: StateID =
+ StateID::new_unchecked(core::i32::MAX as usize - 1);
+
+ /// The maximum state ID value.
+ #[cfg(target_pointer_width = "16")]
+ pub const MAX: StateID =
+ StateID::new_unchecked(core::isize::MAX as usize - 1);
+
+ /// The total number of states that are allowed in any single regex
+ /// engine, represented as a `usize`.
+ pub const LIMIT: usize = StateID::MAX.as_usize() + 1;
+
+ /// The zero state ID value.
+ pub const ZERO: StateID = StateID::new_unchecked(0);
+
+ /// The number of bytes that a single `StateID` uses in memory.
+ pub const SIZE: usize = core::mem::size_of::<StateID>();
+
+ /// Create a new state ID.
+ ///
+ /// If the given identifier exceeds [`StateID::MAX`], then this returns
+ /// an error.
+ #[inline]
+ pub fn new(id: usize) -> Result<StateID, StateIDError> {
+ StateID::try_from(id)
+ }
+
+ /// Create a new state ID without checking whether the given value
+ /// exceeds [`StateID::MAX`].
+ ///
+ /// While this is unchecked, providing an incorrect value must never
+ /// sacrifice memory safety, as documented above.
+ #[inline]
+ pub const fn new_unchecked(id: usize) -> StateID {
+ StateID(id as u32)
+ }
+
+ /// Like [`StateID::new`], but panics if the given ID is not valid.
+ #[inline]
+ pub fn must(id: usize) -> StateID {
+ StateID::new(id).unwrap()
+ }
+
+ /// Return this state ID as a `usize`.
+ #[inline]
+ pub const fn as_usize(&self) -> usize {
+ self.0 as usize
+ }
+
+ /// Return the internal u32 of this state ID.
+ #[inline]
+ pub const fn as_u32(&self) -> u32 {
+ self.0
+ }
+
+ /// Return the internal u32 of this state ID represented as an i32.
+ ///
+ /// This is guaranteed to never overflow an `i32`.
+ #[inline]
+ pub const fn as_i32(&self) -> i32 {
+ self.0 as i32
+ }
+
+ /// Returns one more than this state ID as a usize.
+ ///
+ /// Since a state ID has constraints on its maximum value, adding `1` to
+ /// it will always fit in a `usize` (and a `u32`).
+ #[inline]
+ pub fn one_more(&self) -> usize {
+ self.as_usize().checked_add(1).unwrap()
+ }
+
+ /// Decode this state ID from the bytes given using the native endian byte
+ /// order for the current target.
+ ///
+ /// If the decoded integer is not representable as a state ID for the
+ /// current target, then this returns an error.
+ #[inline]
+ pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> {
+ let id = u32::from_ne_bytes(bytes);
+ if id > StateID::MAX.as_u32() {
+ return Err(StateIDError { attempted: id as u64 });
+ }
+ Ok(StateID::new_unchecked(id as usize))
+ }
+
+ /// Decode this state ID from the bytes given using the native endian
+ /// byte order for the current target.
+ ///
+ /// This is analogous to [`StateID::new_unchecked`] in that it does not
+ /// check whether the decoded integer is representable as a state ID.
+ #[inline]
+ pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID {
+ StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+ }
+
+ /// Return the underlying state ID integer as raw bytes in native endian
+ /// format.
+ #[inline]
+ pub fn to_ne_bytes(&self) -> [u8; 4] {
+ self.0.to_ne_bytes()
+ }
+
+ /// Returns an iterator over all state IDs from 0 up to but not including
+ /// the given length.
+ ///
+ /// If the given length exceeds [`StateID::LIMIT`], then this panics.
+ #[cfg(feature = "alloc")]
+ pub(crate) fn iter(len: usize) -> StateIDIter {
+ StateIDIter::new(len)
+ }
+}
+
+/// This error occurs when a state ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum state ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct StateIDError {
+ attempted: u64,
+}
+
+impl StateIDError {
+ /// Returns the value that could not be converted into a state ID.
+ pub fn attempted(&self) -> u64 {
+ self.attempted
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StateIDError {}
+
+impl core::fmt::Display for StateIDError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "failed to create StateID from {:?}, which exceeds {:?}",
+ self.attempted(),
+ StateID::MAX,
+ )
+ }
+}
+
+/// A macro for defining exactly identical (modulo names) impls for ID types.
+macro_rules! impls {
+ ($ty:ident, $tyerr:ident, $tyiter:ident) => {
+ #[derive(Clone, Debug)]
+ pub(crate) struct $tyiter {
+ rng: ops::Range<usize>,
+ }
+
+ impl $tyiter {
+ #[cfg(feature = "alloc")]
+ fn new(len: usize) -> $tyiter {
+ assert!(
+ len <= $ty::LIMIT,
+ "cannot create iterator with IDs when number of \
+ elements exceed {:?}",
+ $ty::LIMIT,
+ );
+ $tyiter { rng: 0..len }
+ }
+ }
+
+ impl Iterator for $tyiter {
+ type Item = $ty;
+
+ fn next(&mut self) -> Option<$ty> {
+ if self.rng.start >= self.rng.end {
+ return None;
+ }
+ let next_id = self.rng.start + 1;
+ let id = mem::replace(&mut self.rng.start, next_id);
+ // new_unchecked is OK since we asserted that the number of
+ // elements in this iterator will fit in an ID at construction.
+ Some($ty::new_unchecked(id))
+ }
+ }
+
+ impl<T> core::ops::Index<$ty> for [T] {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $ty) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ impl<T> core::ops::IndexMut<$ty> for [T] {
+ #[inline]
+ fn index_mut(&mut self, index: $ty) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<T> core::ops::Index<$ty> for Vec<T> {
+ type Output = T;
+
+ #[inline]
+ fn index(&self, index: $ty) -> &T {
+ &self[index.as_usize()]
+ }
+ }
+
+ #[cfg(feature = "alloc")]
+ impl<T> core::ops::IndexMut<$ty> for Vec<T> {
+ #[inline]
+ fn index_mut(&mut self, index: $ty) -> &mut T {
+ &mut self[index.as_usize()]
+ }
+ }
+
+ impl TryFrom<usize> for $ty {
+ type Error = $tyerr;
+
+ fn try_from(id: usize) -> Result<$ty, $tyerr> {
+ if id > $ty::MAX.as_usize() {
+ return Err($tyerr { attempted: id as u64 });
+ }
+ Ok($ty::new_unchecked(id))
+ }
+ }
+
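+ // Note: a u8 can never exceed the ID maximum of 2^31 - 2, so this
+ // conversion is infallible by construction.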
+ impl TryFrom<u8> for $ty {
+ type Error = Infallible;
+
+ fn try_from(id: u8) -> Result<$ty, Infallible> {
+ Ok($ty::new_unchecked(id as usize))
+ }
+ }
+
+ impl TryFrom<u16> for $ty {
+ type Error = $tyerr;
+
+ fn try_from(id: u16) -> Result<$ty, $tyerr> {
+ if id as u32 > $ty::MAX.as_u32() {
+ return Err($tyerr { attempted: id as u64 });
+ }
+ Ok($ty::new_unchecked(id as usize))
+ }
+ }
+
+ impl TryFrom<u32> for $ty {
+ type Error = $tyerr;
+
+ fn try_from(id: u32) -> Result<$ty, $tyerr> {
+ if id > $ty::MAX.as_u32() {
+ return Err($tyerr { attempted: id as u64 });
+ }
+ Ok($ty::new_unchecked(id as usize))
+ }
+ }
+
+ impl TryFrom<u64> for $ty {
+ type Error = $tyerr;
+
+ fn try_from(id: u64) -> Result<$ty, $tyerr> {
+ if id > $ty::MAX.as_u32() as u64 {
+ return Err($tyerr { attempted: id });
+ }
+ Ok($ty::new_unchecked(id as usize))
+ }
+ }
+
+ #[cfg(test)]
+ impl quickcheck::Arbitrary for $ty {
+ fn arbitrary(gen: &mut quickcheck::Gen) -> $ty {
+ use core::cmp::max;
+
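+ // Clamp away i32::MIN before taking the absolute value, since
+ // (i32::MIN).abs() would overflow.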
+ let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs();
+ if id > $ty::MAX.as_i32() {
+ $ty::MAX
+ } else {
+ $ty::new(usize::try_from(id).unwrap()).unwrap()
+ }
+ }
+ }
+ };
+}
+
+impls!(PatternID, PatternIDError, PatternIDIter);
+impls!(StateID, StateIDError, StateIDIter);
+
+/// A utility trait that defines a couple of adapters for making it convenient
+/// to access indices as ID types. We require ExactSizeIterator so that
+/// iterator construction can do a single check to make sure the index of each
+/// element is representable by its ID type.
+#[cfg(feature = "alloc")]
+pub(crate) trait IteratorIDExt: Iterator {
+ fn with_pattern_ids(self) -> WithPatternIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithPatternIDIter::new(self)
+ }
+
+ fn with_state_ids(self) -> WithStateIDIter<Self>
+ where
+ Self: Sized + ExactSizeIterator,
+ {
+ WithStateIDIter::new(self)
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<I: Iterator> IteratorIDExt for I {}
+
+#[cfg(feature = "alloc")]
+macro_rules! iditer {
+ ($ty:ident, $iterty:ident, $withiterty:ident) => {
+ /// An iterator adapter that is like std::iter::Enumerate, but attaches
+ /// IDs. It requires ExactSizeIterator. At construction, it ensures
+ /// that the index of each element in the iterator is representable in
+ /// the corresponding ID type.
+ #[derive(Clone, Debug)]
+ pub(crate) struct $withiterty<I> {
+ it: I,
+ ids: $iterty,
+ }
+
+ impl<I: Iterator + ExactSizeIterator> $withiterty<I> {
+ fn new(it: I) -> $withiterty<I> {
+ let ids = $ty::iter(it.len());
+ $withiterty { it, ids }
+ }
+ }
+
+ impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> {
+ type Item = ($ty, I::Item);
+
+ fn next(&mut self) -> Option<($ty, I::Item)> {
+ let item = self.it.next()?;
+ // Number of elements in this iterator must match, according
+ // to contract of ExactSizeIterator.
+ let id = self.ids.next().unwrap();
+ Some((id, item))
+ }
+ }
+ };
+}
+
+#[cfg(feature = "alloc")]
+iditer!(PatternID, PatternIDIter, WithPatternIDIter);
+#[cfg(feature = "alloc")]
+iditer!(StateID, StateIDIter, WithStateIDIter);
diff --git a/vendor/regex-automata/src/util/lazy.rs b/vendor/regex-automata/src/util/lazy.rs
new file mode 100644
index 000000000..d8cac6ef4
--- /dev/null
+++ b/vendor/regex-automata/src/util/lazy.rs
@@ -0,0 +1,31 @@
+use core::{
+ ptr,
+ sync::atomic::{AtomicPtr, Ordering},
+};
+
+use alloc::boxed::Box;
+
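+/// Lazily initialize the value behind the given `AtomicPtr`, returning a
+/// `&'static` reference to it.
+///
+/// If multiple threads race, each may run `init`, but exactly one result
+/// wins the compare-exchange; the losers drop their freshly built values and
+/// use the winner's. The winning allocation is intentionally leaked, which
+/// is what makes the `'static` lifetime of the return value sound.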
+#[inline(always)]
+pub(crate) fn get_or_init<T: Send + Sync + 'static>(
+ location: &'static AtomicPtr<T>,
+ init: impl FnOnce() -> T,
+) -> &'static T {
+ let mut ptr = location.load(Ordering::Acquire);
+ if ptr.is_null() {
+ let new_value = Box::new(init());
+ ptr = Box::into_raw(new_value);
+ let result = location.compare_exchange(
+ ptr::null_mut(),
+ ptr,
+ Ordering::AcqRel,
+ Ordering::Acquire,
+ );
+ if let Err(old) = result {
+ let redundant = unsafe { Box::from_raw(ptr) };
+ drop(redundant);
+ ptr = old;
+ }
+ }
+ unsafe { &*ptr }
+}
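+
+// A minimal usage sketch (the `u64` payload is hypothetical; any
+// `Send + Sync + 'static` type works):
+//
+//     static CELL: AtomicPtr<u64> = AtomicPtr::new(ptr::null_mut());
+//     let v: &'static u64 = get_or_init(&CELL, || 42);
+//     assert_eq!(*v, 42);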
diff --git a/vendor/regex-automata/src/util/matchtypes.rs b/vendor/regex-automata/src/util/matchtypes.rs
new file mode 100644
index 000000000..de0fa65bf
--- /dev/null
+++ b/vendor/regex-automata/src/util/matchtypes.rs
@@ -0,0 +1,356 @@
+use crate::util::id::PatternID;
+
+/// The kind of match semantics to use for a DFA.
+///
+/// The default match kind is `LeftmostFirst`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+ /// Report all possible matches.
+ All,
+ /// Report only the leftmost matches. When multiple leftmost matches exist,
+ /// report the match corresponding to the part of the regex that appears
+ /// first in the syntax.
+ LeftmostFirst,
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+ // There is prior art in RE2 that shows that we should be able to add
+ // LeftmostLongest too. The tricky part of it is supporting ungreedy
+ // repetitions. Instead of treating all NFA states as having equivalent
+ // priority (as in 'All') or treating all NFA states as having distinct
+ // priority based on order (as in 'LeftmostFirst'), we instead group NFA
+ // states into sets, and treat members of each set as having equivalent
+ // priority, but having greater priority than all following members
+ // of different sets.
+ //
+ // However, it's not clear whether it's really worth adding this. After
+ // all, leftmost-longest can be emulated when using literals by using
+ // leftmost-first and sorting the literals by length in descending order.
+ // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
+ // always match `a` in `ab` when using leftmost-first, but leftmost-longest
+ // would match `ab`.
+}
+
+impl MatchKind {
+ #[cfg(feature = "alloc")]
+ pub(crate) fn continue_past_first_match(&self) -> bool {
+ *self == MatchKind::All
+ }
+}
+
+impl Default for MatchKind {
+ fn default() -> MatchKind {
+ MatchKind::LeftmostFirst
+ }
+}
+
+/// A representation of a match reported by a regex engine.
+///
+/// A match records the start and end offsets of the match in the haystack.
+///
+/// Every match guarantees that `start <= end`.
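+///
+/// # Example
+///
+/// A short sketch (the import path assumes the crate-root re-export of this
+/// type, which is how the rest of this crate refers to it):
+///
+/// ```
+/// use regex_automata::Match;
+///
+/// let m = Match::new(3, 6);
+/// assert_eq!(m.start(), 3);
+/// assert_eq!(m.range(), 3..6);
+/// assert!(!m.is_empty());
+/// ```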
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ /// The start offset of the match, inclusive.
+ start: usize,
+ /// The end offset of the match, exclusive.
+ end: usize,
+}
+
+impl Match {
+ /// Create a new match from a byte offset span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start`.
+ #[inline]
+ pub fn new(start: usize, end: usize) -> Match {
+ assert!(start <= end);
+ Match { start, end }
+ }
+
+ /// The starting position of the match.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// The ending position of the match.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns the match location as a range.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.start..self.end
+ }
+
+ /// Returns true if and only if this match is empty. That is, when
+ /// `start() == end()`.
+ ///
+ /// An empty match can only be returned when the regex itself is able to
+ /// match the empty string.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+}
+
+/// A representation of a match reported by a DFA.
+///
+/// This is called a "half" match because it only includes the end location
+/// (or start location for a reverse match) of a match. This corresponds to the
+/// information that a single DFA scan can report. Getting the other half of
+/// the match requires a second scan with a reversed DFA.
+///
+/// A half match also includes the pattern that matched. The pattern is
+/// identified by an ID, which corresponds to its position (starting from `0`)
+/// relative to other patterns used to construct the corresponding DFA. If only
+/// a single pattern is provided to the DFA, then all matches are guaranteed to
+/// have a pattern ID of `0`.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct HalfMatch {
+ /// The pattern ID.
+ pub(crate) pattern: PatternID,
+ /// The offset of the match.
+ ///
+ /// For forward searches, the offset is exclusive. For reverse searches,
+ /// the offset is inclusive.
+ pub(crate) offset: usize,
+}
+
+impl HalfMatch {
+ /// Create a new half match from a pattern ID and a byte offset.
+ #[inline]
+ pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
+ HalfMatch { pattern, offset }
+ }
+
+ /// Create a new half match from a pattern ID and a byte offset.
+ ///
+ /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ #[inline]
+ pub fn must(pattern: usize, offset: usize) -> HalfMatch {
+ HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding DFA. The first pattern has
+ /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The position of the match.
+ ///
+ /// If this match was produced by a forward search, then the offset is
+ /// exclusive. If this match was produced by a reverse search, then the
+ /// offset is inclusive.
+ #[inline]
+ pub fn offset(&self) -> usize {
+ self.offset
+ }
+}
+
+/// A representation of a multi match reported by a regex engine.
+///
+/// A multi match has two essential pieces of information: the identifier of
+/// the pattern that matched, along with the start and end offsets of the match
+/// in the haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding regex engine. If only a single pattern is provided, then all
+/// multi matches are guaranteed to have a pattern ID of `0`.
+///
+/// Every multi match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct MultiMatch {
+ /// The pattern ID.
+ pattern: PatternID,
+ /// The start offset of the match, inclusive.
+ start: usize,
+ /// The end offset of the match, exclusive.
+ end: usize,
+}
+
+impl MultiMatch {
+ /// Create a new match from a pattern ID and a byte offset span.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start`.
+ #[inline]
+ pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
+ assert!(start <= end);
+ MultiMatch { pattern, start, end }
+ }
+
+ /// Create a new match from a pattern ID and a byte offset span.
+ ///
+ /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
+ /// [`PatternID`]. This panics if the given `usize` is not representable
+ /// as a `PatternID`.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+ #[inline]
+ pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
+ MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
+ }
+
+ /// Returns the ID of the pattern that matched.
+ ///
+ /// The ID of a pattern is derived from the position in which it was
+ /// originally inserted into the corresponding regex engine. The first
+ /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
+ /// so on.
+ #[inline]
+ pub fn pattern(&self) -> PatternID {
+ self.pattern
+ }
+
+ /// The starting position of the match.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// The ending position of the match.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Returns the match location as a range.
+ #[inline]
+ pub fn range(&self) -> core::ops::Range<usize> {
+ self.start..self.end
+ }
+
+ /// Returns true if and only if this match is empty. That is, when
+ /// `start() == end()`.
+ ///
+ /// An empty match can only be returned when at least one of the patterns
+ /// is able to match the empty string.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+}
+
+/// An error type indicating that a search stopped prematurely without finding
+/// a match.
+///
+/// This error type implies that one cannot assume that no matches occur, since
+/// the search stopped before completing.
+///
+/// Normally, when one searches for something, the response is either an
+/// affirmative "it was found at this location" or a negative "not found at
+/// all." However, in some cases, a regex engine can be configured to stop its
+/// search before concluding whether a match exists or not. When this happens,
+/// it may be important for the caller to know why the regex engine gave up
+/// and where in the input it did so. This error type exposes both the 'why'
+/// and the 'where.'
+///
+/// For example, the DFAs provided by this library generally cannot correctly
+/// implement Unicode word boundaries. Instead, they provide an option to
+/// eagerly support them on ASCII text (since Unicode word boundaries are
+/// equivalent to ASCII word boundaries when searching ASCII text), but will
+/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
+/// required to either report the failure to the caller (unergonomic) or
+/// otherwise fall back to some other regex engine (ergonomic, but potentially
+/// costly).
+///
+/// More generally, some regex engines offer the ability for callers to specify
+/// certain bytes that will trigger the regex engine to automatically quit if
+/// they are seen.
+///
+/// Still other reasons for a failed match exist. For example, the hybrid
+/// DFA provided by this crate can be configured to give up if it determines
+/// that it is not being efficient. This in turn permits callers to choose a
+/// different regex engine.
+///
+/// # Advice
+///
+/// While this form of error reporting adds complexity, it is generally
+/// possible for callers to configure regex engines to never give up a search,
+/// and thus never return an error. Indeed, the default configuration for every
+/// regex engine in this crate is such that they will never stop searching
+/// early. Therefore, the only way to get a match error is if the regex engine
+/// is explicitly configured to do so. Options that enable this behavior
+/// document the new error conditions they imply.
+///
+/// Regex engines for which no errors are possible for any configuration will
+/// return the normal `Option<Match>` and not use this error type at all.
+///
+/// For example, regex engines in the `dfa` sub-module will only report
+/// `MatchError::Quit` if instructed by either
+/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
+/// or by
+/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum MatchError {
+ // Note that the first version of this type was called `SearchError` and it
+ // included a third `None` variant to indicate that the search completed
+ // and no match was found. However, this was problematic for iterator
+ // APIs where the `None` sentinel for stopping iteration corresponds
+ // precisely to the "match not found" case. The fact that the `None`
+ // variant was buried inside this type was in turn quite awkward. So
+ // instead, I removed the `None` variant, renamed the type and used
+ // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
+ // conceptually simpler `Result<Match, MatchError>`. However, we "regain"
+ // ergonomics by only putting the more complex API in the `try_` variants
+ // ("fallible") of search methods. The infallible APIs will instead just
+ // return `Option<Match>` and panic on error.
+ /// The search saw a "quit" byte at which it was instructed to stop
+ /// searching.
+ Quit {
+ /// The "quit" byte that was observed that caused the search to stop.
+ byte: u8,
+ /// The offset at which the quit byte was observed.
+ offset: usize,
+ },
+ /// The search, based on heuristics, determined that it would be better
+ /// to stop, typically to provide the caller an opportunity to use an
+ /// alternative regex engine.
+ ///
+ /// Currently, the only way for this to occur is via the lazy DFA and
+ /// only when it is configured to do so (it will not return this error by
+ /// default).
+ GaveUp {
+ /// The offset at which the search stopped. This corresponds to the
+ /// position immediately following the last byte scanned.
+ offset: usize,
+ },
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for MatchError {}
+
+impl core::fmt::Display for MatchError {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ match *self {
+ MatchError::Quit { byte, offset } => write!(
+ f,
+ "quit search after observing byte \\x{:02X} at offset {}",
+ byte, offset,
+ ),
+ MatchError::GaveUp { offset } => {
+ write!(f, "gave up searching at offset {}", offset)
+ }
+ }
+ }
+}
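+
+// A sketch of the calling convention this error type implies. The method
+// name below is illustrative; the point is the `Result<Option<..>,
+// MatchError>` shape described above:
+//
+//     match re.try_find_leftmost(haystack) {
+//         Ok(Some(m)) => { /* use the match */ }
+//         Ok(None) => { /* no match */ }
+//         Err(MatchError::Quit { byte, offset }) => { /* fall back */ }
+//         Err(MatchError::GaveUp { offset }) => { /* try another engine */ }
+//     }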
diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs
new file mode 100644
index 000000000..798507da2
--- /dev/null
+++ b/vendor/regex-automata/src/util/mod.rs
@@ -0,0 +1,275 @@
+/*!
+A collection of utilities shared by the regex engines in this crate, such as
+identifier types, match types, prefilters and low-level byte/UTF-8 helpers.
+*/
+
+use core::{ascii, fmt, str};
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+pub mod alphabet;
+pub(crate) mod bytes;
+#[cfg(feature = "alloc")]
+pub(crate) mod determinize;
+pub mod id;
+#[cfg(feature = "alloc")]
+pub(crate) mod lazy;
+pub(crate) mod matchtypes;
+pub mod prefilter;
+#[cfg(feature = "alloc")]
+pub(crate) mod sparse_set;
+pub(crate) mod start;
+#[cfg(feature = "alloc")]
+pub(crate) mod syntax;
+
+/// The offset, in bytes, that a match is delayed by in the DFAs generated by
+/// this crate. (This includes lazy DFAs.)
+///
+/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
+/// and $. In particular, both of these operators may require the
+/// identification of the end of input in order to confirm a match. Not only
+/// does this mean that all matches must therefore be delayed by a single byte,
+/// but that a special EOI value is added to the alphabet of all DFAs. (Which
+/// means that even though the alphabet of a DFA is typically all byte values,
+/// the actual maximum alphabet size is 257 due to the extra EOI value.)
+///
+/// Since we delay matches by only 1 byte, this can't fully support a
+/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
+/// DFAs in this crate do not support it. (It's not as simple as just
+/// increasing the match offset to do it---otherwise we would---but building
+/// the full Unicode-aware word boundary detection into an automaton is quite
+/// tricky.)
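+///
+/// For example, a DFA for `a$` searching the haystack `a` can only confirm
+/// the match once it sees the special EOI symbol one position past the final
+/// byte; delaying every match by one byte is what lets it still report the
+/// correct match end of `1`.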
+pub(crate) const MATCH_OFFSET: usize = 1;
+
+/// A type that wraps a single byte with a convenient fmt::Debug impl that
+/// escapes the byte.
+pub(crate) struct DebugByte(pub u8);
+
+impl fmt::Debug for DebugByte {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // 10 bytes is enough to cover any output from ascii::escape_default.
+ let mut bytes = [0u8; 10];
+ let mut len = 0;
+ for (i, mut b) in ascii::escape_default(self.0).enumerate() {
+ // capitalize \xab to \xAB
+ if i >= 2 && b'a' <= b && b <= b'f' {
+ b -= 32;
+ }
+ bytes[len] = b;
+ len += 1;
+ }
+ write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
+ }
+}
+
+/// Returns the smallest possible index of the next valid UTF-8 sequence
+/// starting after `i`.
+///
+/// For all inputs, including invalid UTF-8 and any value of `i`, the return
+/// value is guaranteed to be greater than `i`.
+///
+/// Generally speaking, this should only be called when `text` may be assumed
+/// to be valid UTF-8 and when either `i >= text.len()` holds or `text[i]` is
+/// the leading byte of a UTF-8 sequence.
+#[inline(always)]
+pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
+ let b = match text.get(i) {
+ None => return i.checked_add(1).unwrap(),
+ Some(&b) => b,
+ };
+ // For cases where we see an invalid UTF-8 byte, there isn't much we can do
+ // other than just start at the next byte.
+ let inc = utf8_len(b).unwrap_or(1);
+ i.checked_add(inc).unwrap()
+}
+
+/// Returns true if and only if the given byte is considered a word character.
+/// This only applies to ASCII.
+///
+/// This was copied from regex-syntax so that we can use it to determine the
+/// starting DFA state while searching without depending on regex-syntax. The
+/// definition is never going to change, so there's no maintenance/bit-rot
+/// hazard here.
+#[inline(always)]
+pub(crate) fn is_word_byte(b: u8) -> bool {
+ match b {
+ b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
+ _ => false,
+ }
+}
+
+/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the beginning of the given
+/// byte slice, then the first byte is returned instead.
+///
+/// This returns `None` if and only if `bytes` is empty.
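+///
+/// For example, decoding `b"\xE2\x98\x83"` yields `Some(Ok('☃'))` (U+2603 is
+/// a three byte sequence), while decoding `b"\xFF"` yields `Some(Err(0xFF))`,
+/// since `0xFF` can never appear in valid UTF-8.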
+#[inline(always)]
+pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
+ if bytes.is_empty() {
+ return None;
+ }
+ let len = match utf8_len(bytes[0]) {
+ None => return Some(Err(bytes[0])),
+ Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
+ Some(1) => return Some(Ok(bytes[0] as char)),
+ Some(len) => len,
+ };
+ match str::from_utf8(&bytes[..len]) {
+ Ok(s) => Some(Ok(s.chars().next().unwrap())),
+ Err(_) => Some(Err(bytes[0])),
+ }
+}
+
+/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the end of the given byte
+/// slice, then the last byte is returned instead.
+///
+/// This returns `None` if and only if `bytes` is empty.
+#[inline(always)]
+pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
+ if bytes.is_empty() {
+ return None;
+ }
+ let mut start = bytes.len() - 1;
+ let limit = bytes.len().saturating_sub(4);
+ while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
+ start -= 1;
+ }
+ match decode_utf8(&bytes[start..]) {
+ None => None,
+ Some(Ok(ch)) => Some(Ok(ch)),
+ Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
+ }
+}
+
+/// Given a UTF-8 leading byte, this returns the total number of code units
+/// in the following encoded codepoint.
+///
+/// If the given byte is not a valid UTF-8 leading byte, then this returns
+/// `None`.
+#[inline(always)]
+fn utf8_len(byte: u8) -> Option<usize> {
+ if byte <= 0x7F {
+ Some(1)
+ } else if byte & 0b1100_0000 == 0b1000_0000 {
+ None
+ } else if byte <= 0b1101_1111 {
+ Some(2)
+ } else if byte <= 0b1110_1111 {
+ Some(3)
+ } else if byte <= 0b1111_0111 {
+ Some(4)
+ } else {
+ None
+ }
+}
+
+/// Returns true if and only if the given byte is either a valid leading UTF-8
+/// byte, or is otherwise an invalid byte that can never appear anywhere in a
+/// valid UTF-8 sequence.
+#[inline(always)]
+fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
+ // In the ASCII case, the most significant bit is never set. The leading
+ // byte of a 2/3/4-byte sequence always has the top two most significant
+ // bits set. For bytes that can never appear anywhere in valid UTF-8, this
+ // also returns true, since every such byte has its two most significant
+ // bits set:
+ //
+ // \xC0 :: 11000000
+ // \xC1 :: 11000001
+ // \xF5 :: 11110101
+ // \xF6 :: 11110110
+ // \xF7 :: 11110111
+ // \xF8 :: 11111000
+ // \xF9 :: 11111001
+ // \xFA :: 11111010
+ // \xFB :: 11111011
+ // \xFC :: 11111100
+ // \xFD :: 11111101
+ // \xFE :: 11111110
+ // \xFF :: 11111111
+ (b & 0b1100_0000) != 0b1000_0000
+}
+
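+/// Returns true if and only if a word character (`\w`, Unicode-aware by
+/// default) begins at position `at` in `bytes`.
+///
+/// This runs a small anchored DFA for `\w`, built on first use and cached in
+/// a static, forward from `at`.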
+#[cfg(feature = "alloc")]
+#[inline(always)]
+pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
+ use core::{ptr, sync::atomic::AtomicPtr};
+
+ use crate::{
+ dfa::{
+ dense::{self, DFA},
+ Automaton,
+ },
+ util::lazy,
+ };
+
+ static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
+
+ let dfa = lazy::get_or_init(&WORD, || {
+ // TODO: Should we use a lazy DFA here instead? It does complicate
+ // things somewhat, since we then need a mutable cache, which probably
+ // means a thread local.
+ dense::Builder::new()
+ .configure(dense::Config::new().anchored(true))
+ .build(r"\w")
+ .unwrap()
+ });
+ // This is OK since '\w' contains no look-around.
+ let mut sid = dfa.universal_start_state();
+ while at < bytes.len() {
+ let byte = bytes[at];
+ sid = dfa.next_state(sid, byte);
+ at += 1;
+ if dfa.is_special_state(sid) {
+ if dfa.is_match_state(sid) {
+ return true;
+ } else if dfa.is_dead_state(sid) {
+ return false;
+ }
+ }
+ }
+ dfa.is_match_state(dfa.next_eoi_state(sid))
+}
+
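+/// Returns true if and only if a word character ends at position `at` in
+/// `bytes`.
+///
+/// This is the reverse analogue of `is_word_char_fwd`: it runs a cached
+/// anchored DFA, compiled from a reversed NFA for `\w`, backwards from `at`.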
+#[cfg(feature = "alloc")]
+#[inline(always)]
+pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
+ use core::{ptr, sync::atomic::AtomicPtr};
+
+ use crate::{
+ dfa::{
+ dense::{self, DFA},
+ Automaton,
+ },
+ nfa::thompson::NFA,
+ };
+
+ static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
+
+ let dfa = lazy::get_or_init(&WORD, || {
+ dense::Builder::new()
+ .configure(dense::Config::new().anchored(true))
+ .thompson(NFA::config().reverse(true).shrink(true))
+ .build(r"\w")
+ .unwrap()
+ });
+
+ // This is OK since '\w' contains no look-around.
+ let mut sid = dfa.universal_start_state();
+ while at > 0 {
+ at -= 1;
+ let byte = bytes[at];
+ sid = dfa.next_state(sid, byte);
+ if dfa.is_special_state(sid) {
+ if dfa.is_match_state(sid) {
+ return true;
+ } else if dfa.is_dead_state(sid) {
+ return false;
+ }
+ }
+ }
+ dfa.is_match_state(dfa.next_eoi_state(sid))
+}
diff --git a/vendor/regex-automata/src/util/prefilter.rs b/vendor/regex-automata/src/util/prefilter.rs
new file mode 100644
index 000000000..5fe151524
--- /dev/null
+++ b/vendor/regex-automata/src/util/prefilter.rs
@@ -0,0 +1,281 @@
+use crate::Match;
+
+/// A candidate is the result of running a prefilter on a haystack at a
+/// particular position. The result is one of no match, a confirmed match or
+/// a possible match.
+///
+/// When no match is returned, the prefilter is guaranteeing that no possible
+/// match can be found in the haystack, and the caller may trust this. That
+/// is, a correct prefilter must never report false negatives.
+///
+/// In some cases, a prefilter can confirm a match very quickly, in which case,
+/// the caller may use this to stop what it's doing and report the match. In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, in which
+/// case the callers must attempt to confirm the match. In this case, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+ /// The prefilter reports that no match is possible. Prefilter
+ /// implementations will never report false negatives.
+ None,
+ /// The prefilter reports that a match has been confirmed at the provided
+ /// byte offsets. When this variant is reported, the prefilter is
+ /// guaranteeing a match. No false positives are permitted.
+ Match(Match),
+ /// The prefilter reports that a match *may* start at the given position.
+ /// When this variant is reported, it may correspond to a false positive.
+ PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+ /// Convert this candidate into an option. This is useful when callers do
+ /// not distinguish between true positives and false positives (i.e., the
+ /// caller must always confirm the match in order to update some other
+ /// state).
+ ///
+ /// The byte offset in the option returned corresponds to the starting
+ /// position of the possible match.
+ pub fn into_option(self) -> Option<usize> {
+ match self {
+ Candidate::None => None,
+ Candidate::Match(ref m) => Some(m.start()),
+ Candidate::PossibleStartOfMatch(start) => Some(start),
+ }
+ }
+}
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+pub trait Prefilter: core::fmt::Debug {
+ /// Returns the next possible match candidate. This may yield false
+ /// positives, so callers must confirm a match starting at the position
+ /// returned. This, however, must never produce false negatives. That is,
+ /// this must, at minimum, return the starting position of the next match
+ /// in the given haystack after or at the given position.
+ fn next_candidate(
+ &self,
+ state: &mut State,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate;
+
+ /// Returns the approximate total amount of heap used by this prefilter, in
+ /// units of bytes.
+ fn heap_bytes(&self) -> usize;
+
+ /// Returns true if and only if this prefilter may return false positives
+ /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
+ /// when false positives are not possible (in which case, implementations
+ /// should return false), which may allow completely avoiding heavier regex
+ /// machinery when the prefilter can quickly confirm its own matches.
+ ///
+ /// By default, this returns true, which is conservative; it is always
+ /// correct to return `true`. Returning `false` here and reporting a false
+ /// positive will result in incorrect searches.
+ fn reports_false_positives(&self) -> bool {
+ true
+ }
+}
+
+impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
+ #[inline]
+ fn next_candidate(
+ &self,
+ state: &mut State,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ (**self).next_candidate(state, haystack, at)
+ }
+
+ fn heap_bytes(&self) -> usize {
+ (**self).heap_bytes()
+ }
+
+ fn reports_false_positives(&self) -> bool {
+ (**self).reports_false_positives()
+ }
+}
+
+#[derive(Clone)]
+pub struct Scanner<'p> {
+ prefilter: &'p dyn Prefilter,
+ state: State,
+}
+
+impl<'p> Scanner<'p> {
+ pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> {
+ Scanner { prefilter, state: State::new() }
+ }
+
+ pub(crate) fn is_effective(&mut self, at: usize) -> bool {
+ self.state.is_effective(at)
+ }
+
+ pub(crate) fn reports_false_positives(&self) -> bool {
+ self.prefilter.reports_false_positives()
+ }
+
+ pub(crate) fn next_candidate(
+ &mut self,
+ bytes: &[u8],
+ at: usize,
+ ) -> Candidate {
+ let cand = self.prefilter.next_candidate(&mut self.state, bytes, at);
+ match cand {
+ Candidate::None => {
+ self.state.update_skipped_bytes(bytes.len() - at);
+ }
+ Candidate::Match(ref m) => {
+ self.state.update_skipped_bytes(m.start() - at);
+ }
+ Candidate::PossibleStartOfMatch(i) => {
+ self.state.update_skipped_bytes(i - at);
+ }
+ }
+ cand
+ }
+}
+
+impl<'p> core::fmt::Debug for Scanner<'p> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ f.debug_struct("Scanner").field("state", &self.state).finish()
+ }
+}
+
+/// State tracks the effectiveness of a prefilter over the course of a
+/// search. In particular, it records how many bytes, on average, are skipped
+/// by the prefilter. If this average dips below a certain threshold over
+/// time, then the state renders the prefilter inert and stops using it.
+///
+/// A prefilter state should be created for each search. (Where creating an
+/// iterator via, e.g., `find_iter`, is treated as a single search.)
+#[derive(Clone, Debug)]
+pub struct State {
+ /// The number of skips that has been executed.
+ skips: usize,
+ /// The total number of bytes that have been skipped.
+ skipped: usize,
+ /// Once this heuristic has been deemed permanently ineffective, it will be
+ /// inert throughout the rest of its lifetime. This serves as a cheap way
+ /// to check inertness.
+ inert: bool,
+ /// The last (absolute) position at which a prefilter scanned to.
+ /// Prefilters can use this position to determine whether to re-scan or
+ /// not.
+ ///
+ /// Unlike other things that impact effectiveness, this is a fleeting
+ /// condition. That is, a prefilter can be considered ineffective if it is
+ /// at a position before `last_scan_at`, but can become effective again
+ /// once the search moves past `last_scan_at`.
+ ///
+ /// The utility of this is to both avoid additional overhead from calling
+ /// the prefilter and to avoid quadratic behavior. This ensures that a
+ /// prefilter will scan any particular byte at most once. (Note that some
+ /// prefilters, like the start-byte prefilter, do not need to use this
+ /// field at all, since it only looks for starting bytes.)
+ last_scan_at: usize,
+}
+
+impl State {
+ /// The minimum number of skip attempts to try before considering whether
+ /// a prefilter is effective or not.
+ const MIN_SKIPS: usize = 40;
+
+ /// The minimum amount of bytes that skipping must average.
+ ///
+ /// That is, after MIN_SKIPS have occurred, if the average number of bytes
+ /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
+ /// rendered inert.
+ const MIN_AVG_SKIP: usize = 16;
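+ // Concretely: once MIN_SKIPS skips have occurred, the prefilter remains
+ // active only while `skipped >= MIN_AVG_SKIP * skips`; over the first 40
+ // skips, that means at least 640 bytes skipped in total.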
+
+ /// Create a fresh prefilter state.
+ pub fn new() -> State {
+ State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 }
+ }
+
+ /// Updates the position at which the last scan stopped. This may be
+ /// greater than the position of the last candidate reported. For example,
+ /// searching for the byte `z` in `abczdef` for the pattern `abcz` will
+ /// report a candidate at position `0`, but the end of its last scan will
+ /// be at position `3`.
+ ///
+ /// This position factors into the effectiveness of this prefilter. If the
+ /// current position is less than the last position at which a scan ended,
+ /// then the prefilter should not be re-run until the search moves past
+ /// that position.
+ ///
+ /// It is always correct to never update the last scan position. In fact,
+ /// it is also always correct to set the last scan position to an arbitrary
+ /// value. The key is setting it to a position in the future at which it
+ /// makes sense to restart the prefilter.
+ pub fn update_last_scan(&mut self, at: usize) {
+ if at > self.last_scan_at {
+ self.last_scan_at = at;
+ }
+ }
+
+ /// Return true if and only if this state indicates that a prefilter is
+ /// still effective. If the prefilter is not effective, then this state
+ /// is rendered "inert." At which point, all subsequent calls to
+ /// `is_effective` on this state will return `false`.
+ ///
+ /// `at` should correspond to the current starting position of the search.
+ ///
+ /// Callers typically do not need to call this directly; it is used
+ /// internally by `Scanner::is_effective` to decide whether running the
+ /// prefilter is still worthwhile.
+ fn is_effective(&mut self, at: usize) -> bool {
+ if self.inert {
+ return false;
+ }
+ if at < self.last_scan_at {
+ return false;
+ }
+ if self.skips < State::MIN_SKIPS {
+ return true;
+ }
+
+ if self.skipped >= State::MIN_AVG_SKIP * self.skips {
+ return true;
+ }
+
+ // We're inert.
+ self.inert = true;
+ false
+ }
+
+ /// Update this state with the number of bytes skipped on the last
+ /// invocation of the prefilter.
+ fn update_skipped_bytes(&mut self, skipped: usize) {
+ self.skips += 1;
+ self.skipped += skipped;
+ }
+}
+
+/// A `Prefilter` implementation that reports a possible match at every
+/// position.
+///
+/// This should generally not be used as an actual prefilter. It is only
+/// useful when one needs to represent the absence of a prefilter in a generic
+/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
+/// uses this prefilter by default to indicate that no prefilter should be
+/// used.
+///
+/// A `None` prefilter value cannot be constructed.
+#[derive(Clone, Debug)]
+pub struct None {
+ _priv: (),
+}
+
+impl Prefilter for None {
+ fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
+ Candidate::PossibleStartOfMatch(at)
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
diff --git a/vendor/regex-automata/src/util/sparse_set.rs b/vendor/regex-automata/src/util/sparse_set.rs
new file mode 100644
index 000000000..bf59e4469
--- /dev/null
+++ b/vendor/regex-automata/src/util/sparse_set.rs
@@ -0,0 +1,229 @@
+use alloc::{boxed::Box, vec, vec::Vec};
+
+use crate::util::id::StateID;
+
+/// A pair of sparse sets.
+///
+/// This is useful when one needs to compute NFA epsilon closures from a
+/// previous set of states derived from an epsilon closure. One set can hold
+/// the starting states, while the other holds the destination states after
+/// following the transitions for a particular byte of input.
+///
+/// There is no significance to 'set1' or 'set2'. They are both sparse sets of
+/// the same size.
+///
+/// The members of this struct are exposed so that callers may borrow 'set1'
+/// and 'set2' individually without being forced to borrow both at the same
+/// time.
+#[derive(Clone, Debug)]
+pub(crate) struct SparseSets {
+ pub(crate) set1: SparseSet,
+ pub(crate) set2: SparseSet,
+}
+
+impl SparseSets {
+ /// Create a new pair of sparse sets where each set has the given capacity.
+ ///
+ /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ pub(crate) fn new(capacity: usize) -> SparseSets {
+ SparseSets {
+ set1: SparseSet::new(capacity),
+ set2: SparseSet::new(capacity),
+ }
+ }
+
+ /// Resizes these sparse sets to have the new capacity given.
+ ///
+ /// The sets are automatically cleared.
+ ///
+ /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ #[inline]
+ pub(crate) fn resize(&mut self, new_capacity: usize) {
+ self.set1.resize(new_capacity);
+ self.set2.resize(new_capacity);
+ }
+
+ /// Clear both sparse sets.
+ pub(crate) fn clear(&mut self) {
+ self.set1.clear();
+ self.set2.clear();
+ }
+
+ /// Swap set1 with set2.
+ pub(crate) fn swap(&mut self) {
+ core::mem::swap(&mut self.set1, &mut self.set2);
+ }
+
+ /// Returns the memory usage, in bytes, used by this pair of sparse sets.
+ pub(crate) fn memory_usage(&self) -> usize {
+ self.set1.memory_usage() + self.set2.memory_usage()
+ }
+}
+
+/// A sparse set used for representing ordered NFA states.
+///
+/// This supports constant time addition and membership testing. Clearing an
+/// entire set can also be done in constant time. Iteration yields elements
+/// in the order in which they were inserted.
+///
+/// The data structure is based on: https://research.swtch.com/sparse
+/// Note though that we don't actually use uninitialized memory. We generally
+/// reuse sparse sets, so the initial allocation cost is bearable. However, its
+/// other properties listed above are extremely useful.
+#[derive(Clone)]
+pub(crate) struct SparseSet {
+ /// The number of elements currently in this set.
+ len: usize,
+ /// Dense contains the ids in the order in which they were inserted.
+ dense: Vec<StateID>,
+ /// Sparse maps ids to their location in dense.
+ ///
+ /// A state ID is in the set if and only if
+ /// sparse[id] < dense.len() && id == dense[sparse[id]].
+ sparse: Vec<StateID>,
+}
+
+impl SparseSet {
+ /// Create a new sparse set with the given capacity.
+ ///
+ /// Sparse sets have a fixed size and they cannot grow. Attempting to
+ /// insert more distinct elements than the total capacity of the set will
+ /// result in a panic.
+ ///
+ /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ #[inline]
+ pub(crate) fn new(capacity: usize) -> SparseSet {
+ let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
+ set.resize(capacity);
+ set
+ }
+
+ /// Resizes this sparse set to have the new capacity given.
+ ///
+ /// This set is automatically cleared.
+ ///
+ /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ #[inline]
+ pub(crate) fn resize(&mut self, new_capacity: usize) {
+ assert!(
+ new_capacity <= StateID::LIMIT,
+ "sparse set capacity cannot excced {:?}",
+ StateID::LIMIT
+ );
+ self.clear();
+ self.dense.resize(new_capacity, StateID::ZERO);
+ self.sparse.resize(new_capacity, StateID::ZERO);
+ }
+
+ /// Returns the capacity of this set.
+ ///
+ /// The capacity represents a fixed limit on the number of distinct
+ /// elements that are allowed in this set. The capacity cannot be changed.
+ #[inline]
+ pub(crate) fn capacity(&self) -> usize {
+ self.dense.len()
+ }
+
+ /// Returns the number of elements in this set.
+ #[inline]
+ pub(crate) fn len(&self) -> usize {
+ self.len
+ }
+
+ /// Returns true if and only if this set is empty.
+ #[inline]
+ pub(crate) fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Insert the state ID value into this set and return true if the given
+ /// state ID was not previously in this set.
+ ///
+ /// This operation is idempotent. If the given value is already in this
+ /// set, then this is a no-op.
+ ///
+ /// If more than `capacity` ids are inserted, then this panics.
+ ///
+ /// This is marked as inline(always) since the compiler won't inline it
+ /// otherwise, and it's a fairly hot piece of code in DFA determinization.
+ #[inline(always)]
+ pub(crate) fn insert(&mut self, value: StateID) -> bool {
+ if self.contains(value) {
+ return false;
+ }
+
+ let i = self.len();
+ assert!(
+ i < self.capacity(),
+ "{:?} exceeds capacity of {:?} when inserting {:?}",
+ i,
+ self.capacity(),
+ value,
+ );
+ // OK since i < self.capacity() and self.capacity() is guaranteed to
+ // be <= StateID::LIMIT.
+ let id = StateID::new_unchecked(i);
+ self.dense[id] = value;
+ self.sparse[value] = id;
+ self.len += 1;
+ true
+ }
+
+ /// Returns true if and only if this set contains the given value.
+ #[inline]
+ pub(crate) fn contains(&self, value: StateID) -> bool {
+ let i = self.sparse[value];
+ i.as_usize() < self.len() && self.dense[i] == value
+ }
+
+ /// Returns the ith inserted element from this set.
+ ///
+ /// Panics when i >= self.len().
+ #[inline]
+ pub(crate) fn get(&self, i: usize) -> StateID {
+ self.dense[i]
+ }
+
+ /// Clear this set such that it has no members.
+ #[inline]
+ pub(crate) fn clear(&mut self) {
+ self.len = 0;
+ }
+
+ /// Returns the heap memory usage, in bytes, used by this sparse set.
+ #[inline]
+ pub(crate) fn memory_usage(&self) -> usize {
+ 2 * self.dense.len() * StateID::SIZE
+ }
+}
+
+impl core::fmt::Debug for SparseSet {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let elements: Vec<StateID> = self.into_iter().collect();
+ f.debug_tuple("SparseSet").field(&elements).finish()
+ }
+}
+
+/// An iterator over all elements in a sparse set.
+///
+/// The lifetime `'a` refers to the lifetime of the set being iterated over.
+#[derive(Debug)]
+pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
+
+impl<'a> IntoIterator for &'a SparseSet {
+ type Item = StateID;
+ type IntoIter = SparseSetIter<'a>;
+
+ fn into_iter(self) -> Self::IntoIter {
+ SparseSetIter(self.dense[..self.len()].iter())
+ }
+}
+
+impl<'a> Iterator for SparseSetIter<'a> {
+ type Item = StateID;
+
+ #[inline(always)]
+ fn next(&mut self) -> Option<StateID> {
+ self.0.next().map(|value| *value)
+ }
+}
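
For reference, here is the sparse-set trick from the file above boiled down to a free-standing sketch, with plain `usize` standing in for `StateID`. The invariant is the one documented on the `sparse` field: `v` is a member exactly when `sparse[v] < len && dense[sparse[v]] == v`, which is what lets `clear` be a single assignment.

    // Standalone sketch of the sparse-set technique. Insert, membership and
    // clear are all O(1), and dense[..len] holds elements in insertion order.
    struct SparseSet {
        len: usize,
        dense: Vec<usize>,  // elements, in insertion order
        sparse: Vec<usize>, // sparse[v] = position of v in dense, if present
    }

    impl SparseSet {
        fn new(capacity: usize) -> SparseSet {
            SparseSet { len: 0, dense: vec![0; capacity], sparse: vec![0; capacity] }
        }

        fn insert(&mut self, v: usize) -> bool {
            if self.contains(v) {
                return false; // idempotent, just like the real implementation
            }
            self.dense[self.len] = v;
            self.sparse[v] = self.len;
            self.len += 1;
            true
        }

        fn contains(&self, v: usize) -> bool {
            let i = self.sparse[v];
            i < self.len && self.dense[i] == v
        }

        fn clear(&mut self) {
            // O(1): stale entries in dense/sparse are ignored by `contains`.
            self.len = 0;
        }
    }

    fn main() {
        let mut set = SparseSet::new(16);
        assert!(set.insert(7));
        assert!(set.insert(3));
        assert!(!set.insert(7));
        assert_eq!(&set.dense[..set.len], &[7, 3]); // insertion order
        set.clear();
        assert!(!set.contains(7));
    }
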
diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs
new file mode 100644
index 000000000..3c756fc26
--- /dev/null
+++ b/vendor/regex-automata/src/util/start.rs
@@ -0,0 +1,109 @@
+/// Represents the four possible starting configurations of a DFA search.
+///
+/// The starting configuration is determined by inspecting the beginning of
+/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if
+/// specified) is what selects the start state to use in a DFA.
+///
+/// A DFA that doesn't have starting states for each pattern will have a
+/// maximum of four DFA start states. If the DFA was compiled with start
+/// states for each pattern, then it will have a maximum of four DFA start
+/// states for searching for any pattern, and then another maximum of four DFA
+/// start states for executing an anchored search for each pattern.
+///
+/// This ends up being represented as a table in the DFA (whether lazy or fully
+/// built) where the stride of that table is 4, and each entry is an index into
+/// the state transition table. Note though that multiple entries in the table
+/// might point to the same state if the states would otherwise be equivalent.
+/// (This is guaranteed by DFA minimization and may even be accomplished by
+/// normal determinization, since it attempts to reuse equivalent states too.)
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) enum Start {
+ /// This occurs when the starting position is not any of the ones below.
+ NonWordByte = 0,
+ /// This occurs when the byte immediately preceding the start of the search
+ /// is an ASCII word byte.
+ WordByte = 1,
+ /// This occurs when the starting position of the search corresponds to the
+ /// beginning of the haystack.
+ Text = 2,
+ /// This occurs when the byte immediately preceding the start of the search
+ /// is a line terminator. Specifically, `\n`.
+ Line = 3,
+}
+
+impl Start {
+ /// Return the starting state corresponding to the given integer. If no
+ /// starting state exists for the given integer, then None is returned.
+ pub(crate) fn from_usize(n: usize) -> Option<Start> {
+ match n {
+ 0 => Some(Start::NonWordByte),
+ 1 => Some(Start::WordByte),
+ 2 => Some(Start::Text),
+ 3 => Some(Start::Line),
+ _ => None,
+ }
+ }
+
+ /// Returns the total number of starting state configurations.
+ pub(crate) fn count() -> usize {
+ 4
+ }
+
+ /// Returns the starting state configuration for the given search
+ /// parameters. If the given offset range is not valid, then this panics.
+ #[inline(always)]
+ pub(crate) fn from_position_fwd(
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Start {
+ assert!(
+ bytes.get(start..end).is_some(),
+ "{}..{} is invalid",
+ start,
+ end
+ );
+ if start == 0 {
+ Start::Text
+ } else if bytes[start - 1] == b'\n' {
+ Start::Line
+ } else if crate::util::is_word_byte(bytes[start - 1]) {
+ Start::WordByte
+ } else {
+ Start::NonWordByte
+ }
+ }
+
+ /// Returns the starting state configuration for a reverse search with the
+ /// given search parameters. If the given offset range is not valid, then
+ /// this panics.
+ #[inline(always)]
+ pub(crate) fn from_position_rev(
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Start {
+ assert!(
+ bytes.get(start..end).is_some(),
+ "{}..{} is invalid",
+ start,
+ end
+ );
+ if end == bytes.len() {
+ Start::Text
+ } else if bytes[end] == b'\n' {
+ Start::Line
+ } else if crate::util::is_word_byte(bytes[end]) {
+ Start::WordByte
+ } else {
+ Start::NonWordByte
+ }
+ }
+
+ /// Return this starting configuration as an integer. It is guaranteed to
+ /// be less than `Start::count()`.
+ #[inline(always)]
+ pub(crate) fn as_usize(&self) -> usize {
+ *self as usize
+ }
+}
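
As a quick illustration, the following sketch replicates the forward classification above on a concrete haystack, minus the bounds assertion. It assumes, as the crate's `is_word_byte` does, that word bytes are ASCII `[0-9A-Za-z_]`.

    // Sketch of Start::from_position_fwd's four-way classification.
    #[derive(Debug, PartialEq)]
    enum Start { NonWordByte, WordByte, Text, Line }

    fn is_word_byte(b: u8) -> bool {
        matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_')
    }

    fn from_position_fwd(haystack: &[u8], start: usize) -> Start {
        if start == 0 {
            Start::Text
        } else if haystack[start - 1] == b'\n' {
            Start::Line
        } else if is_word_byte(haystack[start - 1]) {
            Start::WordByte
        } else {
            Start::NonWordByte
        }
    }

    fn main() {
        let hay = b"foo\nbar baz";
        assert_eq!(from_position_fwd(hay, 0), Start::Text); // start of haystack
        assert_eq!(from_position_fwd(hay, 4), Start::Line); // preceded by '\n'
        assert_eq!(from_position_fwd(hay, 2), Start::WordByte); // preceded by 'o'
        assert_eq!(from_position_fwd(hay, 8), Start::NonWordByte); // preceded by ' '
    }
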
diff --git a/vendor/regex-automata/src/util/syntax.rs b/vendor/regex-automata/src/util/syntax.rs
new file mode 100644
index 000000000..88beeee75
--- /dev/null
+++ b/vendor/regex-automata/src/util/syntax.rs
@@ -0,0 +1,272 @@
+use regex_syntax::ParserBuilder;
+
+/// A common set of configuration options that apply to the syntax of a regex.
+///
+/// This represents a group of configuration options that specifically apply
+/// to how the concrete syntax of a regular expression is interpreted. In
+/// particular, they are generally forwarded to the
+/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
+/// in the
+/// [`regex-syntax`](https://docs.rs/regex-syntax)
+/// crate when building a regex from its concrete syntax directly.
+///
+/// These options are defined as a group since they apply to every regex engine
+/// in this crate. Instead of re-defining them on every engine's builder, they
+/// are instead provided here as one cohesive unit.
+#[derive(Clone, Copy, Debug)]
+pub struct SyntaxConfig {
+ case_insensitive: bool,
+ multi_line: bool,
+ dot_matches_new_line: bool,
+ swap_greed: bool,
+ ignore_whitespace: bool,
+ unicode: bool,
+ utf8: bool,
+ nest_limit: u32,
+ octal: bool,
+}
+
+impl SyntaxConfig {
+ /// Return a new default syntax configuration.
+ pub fn new() -> SyntaxConfig {
+ // These defaults match the ones used in regex-syntax.
+ SyntaxConfig {
+ case_insensitive: false,
+ multi_line: false,
+ dot_matches_new_line: false,
+ swap_greed: false,
+ ignore_whitespace: false,
+ unicode: true,
+ utf8: true,
+ nest_limit: 250,
+ octal: false,
+ }
+ }
+
+ /// Enable or disable the case insensitive flag by default.
+ ///
+ /// When Unicode mode is enabled, case insensitivity is Unicode-aware.
+ /// Specifically, it will apply the "simple" case folding rules as
+ /// specified by Unicode.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `i` flag.
+ pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig {
+ self.case_insensitive = yes;
+ self
+ }
+
+ /// Enable or disable the multi-line matching flag by default.
+ ///
+ /// When this is enabled, the `^` and `$` look-around assertions will
+ /// match immediately after and immediately before a new line character,
+ /// respectively. Note that the `\A` and `\z` look-around assertions are
+ /// unaffected by this setting and always correspond to matching at the
+ /// beginning and end of the input.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `m` flag.
+ pub fn multi_line(mut self, yes: bool) -> SyntaxConfig {
+ self.multi_line = yes;
+ self
+ }
+
+ /// Enable or disable the "dot matches any character" flag by default.
+ ///
+ /// When this is enabled, `.` will match any character. When it's disabled,
+ /// then `.` will match any character except for a new line character.
+ ///
+ /// Note that `.` is impacted by whether the "unicode" setting is enabled
+/// or not. When Unicode is enabled (the default), `.` will match any UTF-8
+ /// encoding of any Unicode scalar value (sans a new line, depending on
+ /// whether this "dot matches new line" option is enabled). When Unicode
+ /// mode is disabled, `.` will match any byte instead. Because of this,
+ /// when Unicode mode is disabled, `.` can only be used when the "allow
+ /// invalid UTF-8" option is enabled, since `.` could otherwise match
+ /// invalid UTF-8.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `s` flag.
+ pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig {
+ self.dot_matches_new_line = yes;
+ self
+ }
+
+ /// Enable or disable the "swap greed" flag by default.
+ ///
+ /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
+ /// will become greedy.
+ ///
+ /// By default this is disabled. It may alternatively be selectively
+ /// enabled in the regular expression itself via the `U` flag.
+ pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig {
+ self.swap_greed = yes;
+ self
+ }
+
+ /// Enable verbose mode in the regular expression.
+ ///
+/// When enabled, verbose mode permits insignificant whitespace in many
+ /// places in the regular expression, as well as comments. Comments are
+ /// started using `#` and continue until the end of the line.
+ ///
+ /// By default, this is disabled. It may be selectively enabled in the
+ /// regular expression by using the `x` flag regardless of this setting.
+ pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig {
+ self.ignore_whitespace = yes;
+ self
+ }
+
+ /// Enable or disable the Unicode flag (`u`) by default.
+ ///
+ /// By default this is **enabled**. It may alternatively be selectively
+ /// disabled in the regular expression itself via the `u` flag.
+ ///
+ /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
+ /// default), a regular expression will fail to parse if Unicode mode is
+ /// disabled and a sub-expression could possibly match invalid UTF-8.
+ ///
+ /// **WARNING**: Unicode mode can greatly increase the size of the compiled
+ /// DFA, which can noticeably impact both memory usage and compilation
+ /// time. This is especially noticeable if your regex contains character
+ /// classes like `\w` that are impacted by whether Unicode is enabled or
+ /// not. If Unicode is not necessary, you are encouraged to disable it.
+ pub fn unicode(mut self, yes: bool) -> SyntaxConfig {
+ self.unicode = yes;
+ self
+ }
+
+ /// When disabled, the builder will permit the construction of a regular
+ /// expression that may match invalid UTF-8.
+ ///
+ /// For example, when [`SyntaxConfig::unicode`] is disabled, then
+ /// expressions like `[^a]` may match invalid UTF-8 since they can match
+ /// any single byte that is not `a`. By default, these sub-expressions
+ /// are disallowed to avoid returning offsets that split a UTF-8
+ /// encoded codepoint. However, in cases where matching at arbitrary
+ /// locations is desired, this option can be disabled to permit all such
+ /// sub-expressions.
+ ///
+ /// When enabled (the default), the builder is guaranteed to produce a
+ /// regex that will only ever match valid UTF-8 (otherwise, the builder
+ /// will return an error).
+ pub fn utf8(mut self, yes: bool) -> SyntaxConfig {
+ self.utf8 = yes;
+ self
+ }
+
+ /// Set the nesting limit used for the regular expression parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is allowed
+ /// to be. If the AST exceeds the given limit (e.g., with too many nested
+ /// groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow when building a finite automaton from a regular expression's
+ /// abstract syntax tree. In particular, construction currently uses
+ /// recursion. In the future, the implementation may stop using recursion
+ /// and this option will no longer be necessary.
+ ///
+ /// This limit is not checked until the entire AST is parsed. Therefore,
+ /// if callers want to put a limit on the amount of heap space used, then
+ /// they should impose a limit on the length, in bytes, of the concrete
+ /// pattern string. In particular, this is viable since the parser will
+/// limit itself to heap space proportional to the length of the pattern
+ /// string.
+ ///
+ /// Note that a nest limit of `0` will return a nest limit error for most
+ /// patterns but not all. For example, a nest limit of `0` permits `a` but
+ /// not `ab`, since `ab` requires a concatenation AST item, which results
+ /// in a nest depth of `1`. In general, a nest limit is not something that
+/// manifests in an obvious way in the concrete syntax; therefore, it
+ /// should not be used in a granular way.
+ pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig {
+ self.nest_limit = limit;
+ self
+ }
+
+ /// Whether to support octal syntax or not.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints in
+ /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+ /// `\141` are all equivalent regular expressions, where the last example
+ /// shows octal syntax.
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem, it does
+/// make good error messages harder to produce. That is, in PCRE-based regex
+/// engines, syntax like `\1` invokes a backreference, which is explicitly
+ /// unsupported in Rust's regex engine. However, many users expect it to
+ /// be supported. Therefore, when octal support is disabled, the error
+ /// message will explicitly mention that backreferences aren't supported.
+ ///
+ /// Octal syntax is disabled by default.
+ pub fn octal(mut self, yes: bool) -> SyntaxConfig {
+ self.octal = yes;
+ self
+ }
+
+ /// Returns whether "unicode" mode is enabled.
+ pub fn get_unicode(&self) -> bool {
+ self.unicode
+ }
+
+ /// Returns whether "case insensitive" mode is enabled.
+ pub fn get_case_insensitive(&self) -> bool {
+ self.case_insensitive
+ }
+
+ /// Returns whether "multi line" mode is enabled.
+ pub fn get_multi_line(&self) -> bool {
+ self.multi_line
+ }
+
+ /// Returns whether "dot matches new line" mode is enabled.
+ pub fn get_dot_matches_new_line(&self) -> bool {
+ self.dot_matches_new_line
+ }
+
+ /// Returns whether "swap greed" mode is enabled.
+ pub fn get_swap_greed(&self) -> bool {
+ self.swap_greed
+ }
+
+ /// Returns whether "ignore whitespace" mode is enabled.
+ pub fn get_ignore_whitespace(&self) -> bool {
+ self.ignore_whitespace
+ }
+
+ /// Returns whether UTF-8 mode is enabled.
+ pub fn get_utf8(&self) -> bool {
+ self.utf8
+ }
+
+ /// Returns the "nest limit" setting.
+ pub fn get_nest_limit(&self) -> u32 {
+ self.nest_limit
+ }
+
+ /// Returns whether "octal" mode is enabled.
+ pub fn get_octal(&self) -> bool {
+ self.octal
+ }
+
+ /// Applies this configuration to the given parser.
+ pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
+ builder
+ .unicode(self.unicode)
+ .case_insensitive(self.case_insensitive)
+ .multi_line(self.multi_line)
+ .dot_matches_new_line(self.dot_matches_new_line)
+ .swap_greed(self.swap_greed)
+ .ignore_whitespace(self.ignore_whitespace)
+ .allow_invalid_utf8(!self.utf8)
+ .nest_limit(self.nest_limit)
+ .octal(self.octal);
+ }
+}
+
+impl Default for SyntaxConfig {
+ fn default() -> SyntaxConfig {
+ SyntaxConfig::new()
+ }
+}
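
Since `SyntaxConfig::apply` is `pub(crate)`, code outside the crate forwards these options to regex-syntax by hand. A sketch of the equivalent `ParserBuilder` calls for `unicode(false)` plus `utf8(false)`, against the regex-syntax version this vendored code builds with; note how `utf8 = false` becomes `allow_invalid_utf8(true)`:

    // Sketch: the ParserBuilder calls that SyntaxConfig::apply performs for
    // a config of SyntaxConfig::new().unicode(false).utf8(false).
    use regex_syntax::ParserBuilder;

    fn main() {
        let mut builder = ParserBuilder::new();
        builder
            .unicode(false)
            .allow_invalid_utf8(true); // utf8 = false
        // In non-Unicode mode, `[^a]` can match arbitrary non-`a` bytes, so
        // this parse only succeeds because invalid UTF-8 is allowed.
        let mut parser = builder.build();
        let hir = parser.parse(r"[^a]").unwrap();
        println!("{:?}", hir);
    }
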
diff --git a/vendor/regex-automata/tests/collection.rs b/vendor/regex-automata/tests/collection.rs
deleted file mode 100644
index 68b03229e..000000000
--- a/vendor/regex-automata/tests/collection.rs
+++ /dev/null
@@ -1,461 +0,0 @@
-use std::collections::BTreeMap;
-use std::env;
-use std::fmt::{self, Write};
-use std::thread;
-
-use regex;
-use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA};
-use serde_bytes;
-use toml;
-
-macro_rules! load {
- ($col:ident, $path:expr) => {
- $col.extend(RegexTests::load(
- concat!("../data/tests/", $path),
- include_bytes!(concat!("../data/tests/", $path)),
- ));
- };
-}
-
-lazy_static! {
- pub static ref SUITE: RegexTestCollection = {
- let mut col = RegexTestCollection::new();
- load!(col, "fowler/basic.toml");
- load!(col, "fowler/nullsubexpr.toml");
- load!(col, "fowler/repetition.toml");
- load!(col, "fowler/repetition-long.toml");
- load!(col, "crazy.toml");
- load!(col, "flags.toml");
- load!(col, "iter.toml");
- load!(col, "no-unicode.toml");
- load!(col, "unicode.toml");
- col
- };
-}
-
-#[derive(Clone, Debug)]
-pub struct RegexTestCollection {
- pub by_name: BTreeMap<String, RegexTest>,
-}
-
-#[derive(Clone, Debug, Deserialize)]
-pub struct RegexTests {
- pub tests: Vec<RegexTest>,
-}
-
-#[derive(Clone, Debug, Deserialize)]
-pub struct RegexTest {
- pub name: String,
- #[serde(default)]
- pub options: Vec<RegexTestOption>,
- pub pattern: String,
- #[serde(with = "serde_bytes")]
- pub input: Vec<u8>,
- #[serde(rename = "matches")]
- pub matches: Vec<Match>,
- #[serde(default)]
- pub captures: Vec<Option<Match>>,
- #[serde(default)]
- pub fowler_line_number: Option<u64>,
-}
-
-#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
-#[serde(rename_all = "kebab-case")]
-pub enum RegexTestOption {
- Anchored,
- CaseInsensitive,
- NoUnicode,
- Escaped,
- #[serde(rename = "invalid-utf8")]
- InvalidUTF8,
-}
-
-#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
-pub struct Match {
- pub start: usize,
- pub end: usize,
-}
-
-impl RegexTestCollection {
- fn new() -> RegexTestCollection {
- RegexTestCollection { by_name: BTreeMap::new() }
- }
-
- fn extend(&mut self, tests: RegexTests) {
- for test in tests.tests {
- let name = test.name.clone();
- if self.by_name.contains_key(&name) {
- panic!("found duplicate test {}", name);
- }
- self.by_name.insert(name, test);
- }
- }
-
- pub fn tests(&self) -> Vec<&RegexTest> {
- self.by_name.values().collect()
- }
-}
-
-impl RegexTests {
- fn load(path: &str, slice: &[u8]) -> RegexTests {
- let mut data: RegexTests = toml::from_slice(slice)
- .expect(&format!("failed to load {}", path));
- for test in &mut data.tests {
- if test.options.contains(&RegexTestOption::Escaped) {
- test.input = unescape_bytes(&test.input);
- }
- }
- data
- }
-}
-
-#[derive(Debug)]
-pub struct RegexTester {
- asserted: bool,
- results: RegexTestResults,
- skip_expensive: bool,
- whitelist: Vec<regex::Regex>,
- blacklist: Vec<regex::Regex>,
-}
-
-impl Drop for RegexTester {
- fn drop(&mut self) {
- // If we haven't asserted yet, then the test is probably buggy, so
- // fail it. But if we're already panicking (e.g., a bug in the regex
- // engine), then don't double-panic, which causes an immediate abort.
- if !thread::panicking() && !self.asserted {
- panic!("must call RegexTester::assert at end of test");
- }
- }
-}
-
-impl RegexTester {
- pub fn new() -> RegexTester {
- let mut tester = RegexTester {
- asserted: false,
- results: RegexTestResults::default(),
- skip_expensive: false,
- whitelist: vec![],
- blacklist: vec![],
- };
- for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") {
- let x = x.trim();
- if x.is_empty() {
- continue;
- }
- if x.starts_with("-") {
- tester = tester.blacklist(&x[1..]);
- } else {
- tester = tester.whitelist(x);
- }
- }
- tester
- }
-
- pub fn skip_expensive(mut self) -> RegexTester {
- self.skip_expensive = true;
- self
- }
-
- pub fn whitelist(mut self, name: &str) -> RegexTester {
- self.whitelist.push(regex::Regex::new(name).unwrap());
- self
- }
-
- pub fn blacklist(mut self, name: &str) -> RegexTester {
- self.blacklist.push(regex::Regex::new(name).unwrap());
- self
- }
-
- pub fn assert(&mut self) {
- self.asserted = true;
- self.results.assert();
- }
-
- pub fn build_regex<S: StateID>(
- &self,
- mut builder: RegexBuilder,
- test: &RegexTest,
- ) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
- if self.skip(test) {
- return None;
- }
- self.apply_options(test, &mut builder);
-
- match builder.build_with_size::<S>(&test.pattern) {
- Ok(re) => Some(re),
- Err(err) => {
- if let ErrorKind::Unsupported(_) = *err.kind() {
- None
- } else {
- panic!(
- "failed to build {:?} with pattern '{:?}': {}",
- test.name, test.pattern, err
- );
- }
- }
- }
- }
-
- pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I)
- where
- I: IntoIterator<IntoIter = T, Item = &'a RegexTest>,
- T: Iterator<Item = &'a RegexTest>,
- {
- for test in tests {
- let builder = builder.clone();
- let re: Regex = match self.build_regex(builder, test) {
- None => continue,
- Some(re) => re,
- };
- self.test(test, &re);
- }
- }
-
- pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
- self.test_is_match(test, re);
- self.test_find(test, re);
- // Some tests (namely, fowler) are designed only to detect the
- // first match even if there are more subsequent matches. To that
- // end, we only test match iteration when the number of matches
- // expected is not 1, or if the test name has 'iter' in it.
- if test.name.contains("iter") || test.matches.len() != 1 {
- self.test_find_iter(test, re);
- }
- }
-
- pub fn test_is_match<'a, D: DFA>(
- &mut self,
- test: &RegexTest,
- re: &Regex<D>,
- ) {
- self.asserted = false;
-
- let got = re.is_match(&test.input);
- let expected = test.matches.len() >= 1;
- if got == expected {
- self.results.succeeded.push(test.clone());
- return;
- }
- self.results.failed.push(RegexTestFailure {
- test: test.clone(),
- kind: RegexTestFailureKind::IsMatch,
- });
- }
-
- pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
- self.asserted = false;
-
- let got =
- re.find(&test.input).map(|(start, end)| Match { start, end });
- if got == test.matches.get(0).map(|&m| m) {
- self.results.succeeded.push(test.clone());
- return;
- }
- self.results.failed.push(RegexTestFailure {
- test: test.clone(),
- kind: RegexTestFailureKind::Find { got },
- });
- }
-
- pub fn test_find_iter<'a, D: DFA>(
- &mut self,
- test: &RegexTest,
- re: &Regex<D>,
- ) {
- self.asserted = false;
-
- let got: Vec<Match> = re
- .find_iter(&test.input)
- .map(|(start, end)| Match { start, end })
- .collect();
- if got == test.matches {
- self.results.succeeded.push(test.clone());
- return;
- }
- self.results.failed.push(RegexTestFailure {
- test: test.clone(),
- kind: RegexTestFailureKind::FindIter { got },
- });
- }
-
- fn skip(&self, test: &RegexTest) -> bool {
- if self.skip_expensive {
- if test.name.starts_with("repetition-long") {
- return true;
- }
- }
- if !self.blacklist.is_empty() {
- if self.blacklist.iter().any(|re| re.is_match(&test.name)) {
- return true;
- }
- }
- if !self.whitelist.is_empty() {
- if !self.whitelist.iter().any(|re| re.is_match(&test.name)) {
- return true;
- }
- }
- false
- }
-
- fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
- for opt in &test.options {
- match *opt {
- RegexTestOption::Anchored => {
- builder.anchored(true);
- }
- RegexTestOption::CaseInsensitive => {
- builder.case_insensitive(true);
- }
- RegexTestOption::NoUnicode => {
- builder.unicode(false);
- }
- RegexTestOption::Escaped => {}
- RegexTestOption::InvalidUTF8 => {
- builder.allow_invalid_utf8(true);
- }
- }
- }
- }
-}
-
-#[derive(Clone, Debug, Default)]
-pub struct RegexTestResults {
- /// Tests that succeeded.
- pub succeeded: Vec<RegexTest>,
- /// Failed tests, indexed by group name.
- pub failed: Vec<RegexTestFailure>,
-}
-
-#[derive(Clone, Debug)]
-pub struct RegexTestFailure {
- test: RegexTest,
- kind: RegexTestFailureKind,
-}
-
-#[derive(Clone, Debug)]
-pub enum RegexTestFailureKind {
- IsMatch,
- Find { got: Option<Match> },
- FindIter { got: Vec<Match> },
-}
-
-impl RegexTestResults {
- pub fn assert(&self) {
- if self.failed.is_empty() {
- return;
- }
- let failures = self
- .failed
- .iter()
- .map(|f| f.to_string())
- .collect::<Vec<String>>()
- .join("\n\n");
- panic!(
- "found {} failures:\n{}\n{}\n{}\n\n\
- Set the REGEX_TEST environment variable to filter tests, \n\
- e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\
- whose name contains crazy-misc but not crazy-misc2\n\n",
- self.failed.len(),
- "~".repeat(79),
- failures.trim(),
- "~".repeat(79)
- )
- }
-}
-
-impl fmt::Display for RegexTestFailure {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(
- f,
- "{}: {}\n \
- options: {:?}\n \
- pattern: {}\n \
- pattern (escape): {}\n \
- input: {}\n \
- input (escape): {}\n \
- input (hex): {}",
- self.test.name,
- self.kind.fmt(&self.test)?,
- self.test.options,
- self.test.pattern,
- escape_default(&self.test.pattern),
- nice_raw_bytes(&self.test.input),
- escape_bytes(&self.test.input),
- hex_bytes(&self.test.input)
- )
- }
-}
-
-impl RegexTestFailureKind {
- fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
- let mut buf = String::new();
- match *self {
- RegexTestFailureKind::IsMatch => {
- if let Some(&m) = test.matches.get(0) {
- write!(buf, "expected match (at {}), but none found", m)?
- } else {
- write!(buf, "expected no match, but found a match")?
- }
- }
- RegexTestFailureKind::Find { got } => write!(
- buf,
- "expected {:?}, but found {:?}",
- test.matches.get(0),
- got
- )?,
- RegexTestFailureKind::FindIter { ref got } => write!(
- buf,
- "expected {:?}, but found {:?}",
- test.matches, got
- )?,
- }
- Ok(buf)
- }
-}
-
-impl fmt::Display for Match {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "({}, {})", self.start, self.end)
- }
-}
-
-impl fmt::Debug for Match {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- write!(f, "({}, {})", self.start, self.end)
- }
-}
-
-fn nice_raw_bytes(bytes: &[u8]) -> String {
- use std::str;
-
- match str::from_utf8(bytes) {
- Ok(s) => s.to_string(),
- Err(_) => escape_bytes(bytes),
- }
-}
-
-fn escape_bytes(bytes: &[u8]) -> String {
- use std::ascii;
-
- let escaped = bytes
- .iter()
- .flat_map(|&b| ascii::escape_default(b))
- .collect::<Vec<u8>>();
- String::from_utf8(escaped).unwrap()
-}
-
-fn hex_bytes(bytes: &[u8]) -> String {
- bytes.iter().map(|&b| format!(r"\x{:02X}", b)).collect()
-}
-
-fn escape_default(s: &str) -> String {
- s.chars().flat_map(|c| c.escape_default()).collect()
-}
-
-fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
- use std::str;
- use unescape::unescape;
-
- unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8"))
-}
diff --git a/vendor/regex-automata/tests/data/bytes.toml b/vendor/regex-automata/tests/data/bytes.toml
new file mode 100644
index 000000000..eb3a0942e
--- /dev/null
+++ b/vendor/regex-automata/tests/data/bytes.toml
@@ -0,0 +1,235 @@
+# These are tests specifically crafted for regexes that can match arbitrary
+# bytes. In some cases, we also test the Unicode variant, just because it
+# makes good sense to do so. Ultimately, though, these tests aren't really
+# about Unicode, but about whether matches are only reported at valid UTF-8
+# boundaries. Most tests in this entire collection use utf8 = true; these
+# tests use utf8 = false.
+
+[[tests]]
+name = "word-boundary-ascii"
+regex = ' \b'
+input = " δ"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "word-boundary-unicode"
+regex = ' \b'
+input = " δ"
+matches = [[0, 1]]
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "word-boundary-ascii-not"
+regex = ' \B'
+input = " δ"
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "word-boundary-unicode-not"
+regex = ' \B'
+input = " δ"
+matches = []
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "perl-word-ascii"
+regex = '\w+'
+input = "aδ"
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "perl-word-unicode"
+regex = '\w+'
+input = "aδ"
+matches = [[0, 3]]
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "perl-decimal-ascii"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 1], [7, 8]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "perl-decimal-unicode"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 8]]
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "perl-whitespace-ascii"
+regex = '\s+'
+input = " \u1680"
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "perl-whitespace-unicode"
+regex = '\s+'
+input = " \u1680"
+matches = [[0, 4]]
+unicode = true
+utf8 = false
+
+# The first `(.+)` matches two Unicode codepoints, but can't match the 5th
+# byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
+# matches.
+[[tests]]
+name = "mixed-dot"
+regex = '(.+)(?-u)(.+)'
+input = '\xCE\x93\xCE\x94\xFF'
+captures = [
+ [[0, 5], [0, 4], [4, 5]],
+]
+unescape = true
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "case-one-ascii"
+regex = 'a'
+input = "A"
+matches = [[0, 1]]
+case_insensitive = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "case-one-unicode"
+regex = 'a'
+input = "A"
+matches = [[0, 1]]
+case_insensitive = true
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "case-class-simple-ascii"
+regex = '[a-z]+'
+input = "AaAaA"
+matches = [[0, 5]]
+case_insensitive = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "case-class-ascii"
+regex = '[a-z]+'
+input = "aA\u212AaA"
+matches = [[0, 2], [5, 7]]
+case_insensitive = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "case-class-unicode"
+regex = '[a-z]+'
+input = "aA\u212AaA"
+matches = [[0, 7]]
+case_insensitive = true
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "negate-ascii"
+regex = '[^a]'
+input = "δ"
+matches = [[0, 1], [1, 2]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "negate-unicode"
+regex = '[^a]'
+input = "δ"
+matches = [[0, 2]]
+unicode = true
+utf8 = false
+
+# When utf8=true, this won't match, because the implicit '.*?' prefix is
+# Unicode aware and will refuse to match through invalid UTF-8 bytes.
+[[tests]]
+name = "dotstar-prefix-ascii"
+regex = 'a'
+input = '\xFFa'
+matches = [[1, 2]]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "dotstar-prefix-unicode"
+regex = 'a'
+input = '\xFFa'
+matches = [[1, 2]]
+unescape = true
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "null-bytes"
+regex = '(?P<cstr>[^\x00]+)\x00'
+input = 'foo\x00'
+captures = [
+ [[0, 4], [0, 3]],
+]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "invalid-utf8-anchor-100"
+regex = '\xCC?^'
+input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
+matches = [[0, 0]]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "invalid-utf8-anchor-200"
+regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$'
+input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
+matches = [[22, 22]]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "invalid-utf8-anchor-300"
+regex = '^|ddp\xff\xffdddddlQd@\x80'
+input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
+matches = [[0, 0]]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "word-boundary-ascii-100"
+regex = '\Bx\B'
+input = "áxβ"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "word-boundary-ascii-200"
+regex = '\B'
+input = "0\U0007EF5E"
+matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
+unicode = false
+utf8 = false
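
The unicode/utf8 split exercised above can be reproduced outside this test harness. The following sketch is only an illustration, using the `regex` crate's bytes API (which is not part of this diff but exposes the same distinction) to mirror the `perl-word-ascii` and `perl-word-unicode` cases:

    // Mirrors the perl-word-ascii and perl-word-unicode tests above using
    // regex::bytes::Regex, which also matches on arbitrary bytes.
    use regex::bytes::Regex;

    fn main() {
        let hay = "aδ".as_bytes(); // b"a\xCE\xB4"

        // unicode = false: \w is ASCII-only, so only "a" matches.
        let ascii = Regex::new(r"(?-u)\w+").unwrap();
        let m = ascii.find(hay).unwrap();
        assert_eq!((m.start(), m.end()), (0, 1));

        // unicode = true: \w is Unicode-aware, so "aδ" matches as a whole.
        let unicode = Regex::new(r"\w+").unwrap();
        let m = unicode.find(hay).unwrap();
        assert_eq!((m.start(), m.end()), (0, 3));
    }
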
diff --git a/vendor/regex-automata/tests/data/crazy.toml b/vendor/regex-automata/tests/data/crazy.toml
new file mode 100644
index 000000000..549b86cca
--- /dev/null
+++ b/vendor/regex-automata/tests/data/crazy.toml
@@ -0,0 +1,302 @@
+# TODO: There are still a couple of manually written tests in crazy.rs.
+
+[[tests]]
+name = "ranges"
+regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b'
+input = "num: 255"
+matches = [[5, 8]]
+
+[[tests]]
+name = "ranges-not"
+regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b'
+input = "num: 256"
+matches = []
+
+[[tests]]
+name = "float1"
+regex = '[-+]?[0-9]*\.?[0-9]+'
+input = "0.1"
+matches = [[0, 3]]
+
+[[tests]]
+name = "float2"
+regex = '[-+]?[0-9]*\.?[0-9]+'
+input = "0.1.2"
+matches = [[0, 3]]
+match_limit = 1
+
+[[tests]]
+name = "float3"
+regex = '[-+]?[0-9]*\.?[0-9]+'
+input = "a1.2"
+matches = [[1, 4]]
+
+[[tests]]
+name = "float4"
+regex = '[-+]?[0-9]*\.?[0-9]+'
+input = "1.a"
+matches = [[0, 1]]
+
+[[tests]]
+name = "float5"
+regex = '^[-+]?[0-9]*\.?[0-9]+$'
+input = "1.a"
+matches = []
+
+[[tests]]
+name = "email"
+regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
+input = "mine is jam.slam@gmail.com "
+matches = [[8, 26]]
+
+[[tests]]
+name = "email-not"
+regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
+input = "mine is jam.slam@gmail "
+matches = []
+
+[[tests]]
+name = "email-big"
+regex = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'''
+input = "mine is jam.slam@gmail.com "
+matches = [[8, 26]]
+
+[[tests]]
+name = "date1"
+regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
+input = "1900-01-01"
+matches = [[0, 10]]
+
+[[tests]]
+name = "date2"
+regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
+input = "1900-00-01"
+matches = []
+
+[[tests]]
+name = "date3"
+regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
+input = "1900-13-01"
+matches = []
+
+[[tests]]
+name = "start-end-empty"
+regex = '^$'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "start-end-empty-rev"
+regex = '$^'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "start-end-empty-many-1"
+regex = '^$^$^$'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "start-end-empty-many-2"
+regex = '^^^$$$'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "start-end-empty-rep"
+regex = '(?:^$)*'
+input = "a\nb\nc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+[[tests]]
+name = "start-end-empty-rep-rev"
+regex = '(?:$^)*'
+input = "a\nb\nc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+[[tests]]
+name = "neg-class-letter"
+regex = '[^ac]'
+input = "acx"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-letter-comma"
+regex = '[^a,]'
+input = "a,x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-letter-space"
+regex = '[^a[:space:]]'
+input = "a x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-comma"
+regex = '[^,]'
+input = ",,x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-space"
+regex = '[^[:space:]]'
+input = " a"
+matches = [[1, 2]]
+
+[[tests]]
+name = "neg-class-space-comma"
+regex = '[^,[:space:]]'
+input = ", a"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-comma-space"
+regex = '[^[:space:],]'
+input = " ,a"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-ascii"
+regex = '[^[:alpha:]Z]'
+input = "A1"
+matches = [[1, 2]]
+
+[[tests]]
+name = "lazy-many-many"
+regex = '((?:.*)*?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-many-optional"
+regex = '((?:.?)*?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-one-many-many"
+regex = '((?:.*)+?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-one-many-optional"
+regex = '((?:.?)+?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-range-min-many"
+regex = '((?:.*){1,}?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-range-many"
+regex = '((?:.*){1,2}?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-many-many"
+regex = '((?:.*)*)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-many-optional"
+regex = '((?:.?)*)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-one-many-many"
+regex = '((?:.*)+)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-one-many-optional"
+regex = '((?:.?)+)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-range-min-many"
+regex = '((?:.*){1,})='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-range-many"
+regex = '((?:.*){1,2})='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "empty1"
+regex = ''
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "empty2"
+regex = ''
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty3"
+regex = '()'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty4"
+regex = '()*'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty5"
+regex = '()+'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty6"
+regex = '()?'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty7"
+regex = '()()'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty8"
+regex = '()+|z'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty9"
+regex = 'z|()+'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty10"
+regex = '()+|b'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty11"
+regex = 'b|()+'
+input = "abc"
+matches = [[0, 0], [1, 2], [3, 3]]
diff --git a/vendor/regex-automata/tests/data/earliest.toml b/vendor/regex-automata/tests/data/earliest.toml
new file mode 100644
index 000000000..6714a850b
--- /dev/null
+++ b/vendor/regex-automata/tests/data/earliest.toml
@@ -0,0 +1,48 @@
+[[tests]]
+name = "no-greedy-100"
+regex = 'a+'
+input = "aaa"
+matches = [[0, 1], [1, 2], [2, 3]]
+search_kind = "earliest"
+
+[[tests]]
+name = "no-greedy-200"
+regex = 'abc+'
+input = "zzzabccc"
+matches = [[3, 6]]
+search_kind = "earliest"
+
+[[tests]]
+name = "is-ungreedy"
+regex = 'a+?'
+input = "aaa"
+matches = [[0, 1], [1, 2], [2, 3]]
+search_kind = "earliest"
+
+[[tests]]
+name = "look-start-test"
+regex = '^(abc|a)'
+input = "abc"
+matches = [[0, 1]]
+search_kind = "earliest"
+
+[[tests]]
+name = "look-end-test"
+regex = '(abc|a)$'
+input = "abc"
+matches = [[0, 3]]
+search_kind = "earliest"
+
+[[tests]]
+name = "no-leftmost-first-100"
+regex = 'abc|a'
+input = "abc"
+matches = [[0, 1]]
+search_kind = "earliest"
+
+[[tests]]
+name = "no-leftmost-first-200"
+regex = 'aba|a'
+input = "aba"
+matches = [[0, 1], [2, 3]]
+search_kind = "earliest"
diff --git a/vendor/regex-automata/tests/data/empty.toml b/vendor/regex-automata/tests/data/empty.toml
new file mode 100644
index 000000000..ad703e601
--- /dev/null
+++ b/vendor/regex-automata/tests/data/empty.toml
@@ -0,0 +1,113 @@
+[[tests]]
+name = "100"
+regex = "|b"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "110"
+regex = "b|"
+input = "abc"
+matches = [[0, 0], [1, 2], [3, 3]]
+
+[[tests]]
+name = "120"
+regex = "|z"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "130"
+regex = "z|"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "200"
+regex = "|"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "210"
+regex = "||"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "220"
+regex = "||b"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "230"
+regex = "b||"
+input = "abc"
+matches = [[0, 0], [1, 2], [3, 3]]
+
+[[tests]]
+name = "240"
+regex = "||z"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "300"
+regex = "(?:)|b"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "310"
+regex = "b|(?:)"
+input = "abc"
+matches = [[0, 0], [1, 2], [3, 3]]
+
+[[tests]]
+name = "320"
+regex = "(?:|)"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "330"
+regex = "(?:|)|z"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "400"
+regex = "a(?:)|b"
+input = "abc"
+matches = [[0, 1], [1, 2]]
+
+[[tests]]
+name = "500"
+regex = ""
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "510"
+regex = ""
+input = "a"
+matches = [[0, 0], [1, 1]]
+
+[[tests]]
+name = "520"
+regex = ""
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "600"
+regex = '(|a)*'
+input = "aaa"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "610"
+regex = '(|a)+'
+input = "aaa"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
diff --git a/vendor/regex-automata/tests/data/expensive.toml b/vendor/regex-automata/tests/data/expensive.toml
new file mode 100644
index 000000000..e062e3902
--- /dev/null
+++ b/vendor/regex-automata/tests/data/expensive.toml
@@ -0,0 +1,12 @@
+# These represent tests that may be expensive to run on some regex engines. For
+# example, tests that build a full DFA ahead of time and minimize it can take a
+# horrendously long time on regexes that are large (or result in an explosion
+# in the number of states). We group these tests together so that such engines
+# can simply skip these tests.
+
+# See: https://github.com/rust-lang/regex/issues/98
+[[tests]]
+name = "regression-many-repeat-no-stack-overflow"
+regex = '^.{1,2500}'
+input = "a"
+matches = [[0, 1]]
diff --git a/vendor/regex-automata/tests/data/flags.toml b/vendor/regex-automata/tests/data/flags.toml
new file mode 100644
index 000000000..2b631ef23
--- /dev/null
+++ b/vendor/regex-automata/tests/data/flags.toml
@@ -0,0 +1,67 @@
+[[tests]]
+name = "1"
+regex = "(?i)abc"
+input = "ABC"
+matches = [[0, 3]]
+
+[[tests]]
+name = "2"
+regex = "(?i)a(?-i)bc"
+input = "Abc"
+matches = [[0, 3]]
+
+[[tests]]
+name = "3"
+regex = "(?i)a(?-i)bc"
+input = "ABC"
+matches = []
+
+[[tests]]
+name = "4"
+regex = "(?is)a."
+input = "A\n"
+matches = [[0, 2]]
+
+[[tests]]
+name = "5"
+regex = "(?is)a.(?-is)a."
+input = "A\nab"
+matches = [[0, 4]]
+
+[[tests]]
+name = "6"
+regex = "(?is)a.(?-is)a."
+input = "A\na\n"
+matches = []
+
+[[tests]]
+name = "7"
+regex = "(?is)a.(?-is:a.)?"
+input = "A\na\n"
+matches = [[0, 2]]
+match_limit = 1
+
+[[tests]]
+name = "8"
+regex = "(?U)a+"
+input = "aa"
+matches = [[0, 1]]
+match_limit = 1
+
+[[tests]]
+name = "9"
+regex = "(?U)a+?"
+input = "aa"
+matches = [[0, 2]]
+
+[[tests]]
+name = "10"
+regex = "(?U)(?-U)a+"
+input = "aa"
+matches = [[0, 2]]
+
+[[tests]]
+name = "11"
+regex = '(?m)(?:^\d+$\n?)+'
+input = "123\n456\n789"
+matches = [[0, 11]]
diff --git a/vendor/regex-automata/tests/data/fowler/basic.toml b/vendor/regex-automata/tests/data/fowler/basic.toml
new file mode 100644
index 000000000..c965f26ff
--- /dev/null
+++ b/vendor/regex-automata/tests/data/fowler/basic.toml
@@ -0,0 +1,1638 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "basic3"
+regex = '''abracadabra$'''
+input = '''abracadabracadabra'''
+captures = [[[7, 18]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic4"
+regex = '''a...b'''
+input = '''abababbb'''
+captures = [[[2, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic5"
+regex = '''XXXXXX'''
+input = '''..XXXXXX'''
+captures = [[[2, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic6"
+regex = '''\)'''
+input = '''()'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic7"
+regex = '''a]'''
+input = '''a]a'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic9"
+regex = '''\}'''
+input = '''}'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic10"
+regex = '''\]'''
+input = ''']'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic12"
+regex = ''']'''
+input = ''']'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic15"
+regex = '''^a'''
+input = '''ax'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic16"
+regex = '''\^a'''
+input = '''a^a'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic17"
+regex = '''a\^'''
+input = '''a^'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic18"
+regex = '''a$'''
+input = '''aa'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic19"
+regex = '''a\$'''
+input = '''a$'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic20"
+regex = '''^$'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic21"
+regex = '''$^'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic22"
+regex = '''a($)'''
+input = '''aa'''
+captures = [[[1, 2], [2, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic23"
+regex = '''a*(^a)'''
+input = '''aa'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic24"
+regex = '''(..)*(...)*'''
+input = '''a'''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic25"
+regex = '''(..)*(...)*'''
+input = '''abcd'''
+captures = [[[0, 4], [2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic26"
+regex = '''(ab|a)(bc|c)'''
+input = '''abc'''
+captures = [[[0, 3], [0, 2], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic27"
+regex = '''(ab)c|abc'''
+input = '''abc'''
+captures = [[[0, 3], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic28"
+regex = '''a{0}b'''
+input = '''ab'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic29"
+regex = '''(a*)(b?)(b+)b{3}'''
+input = '''aaabbbbbbb'''
+captures = [[[0, 10], [0, 3], [3, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic30"
+regex = '''(a*)(b{0,1})(b{1,})b{3}'''
+input = '''aaabbbbbbb'''
+captures = [[[0, 10], [0, 3], [3, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic32"
+regex = '''((a|a)|a)'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic33"
+regex = '''(a*)(a|aa)'''
+input = '''aaaa'''
+captures = [[[0, 4], [0, 3], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic34"
+regex = '''a*(a.|aa)'''
+input = '''aaaa'''
+captures = [[[0, 4], [2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic35"
+regex = '''a(b)|c(d)|a(e)f'''
+input = '''aef'''
+captures = [[[0, 3], [], [], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic36"
+regex = '''(a|b)?.*'''
+input = '''b'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic37"
+regex = '''(a|b)c|a(b|c)'''
+input = '''ac'''
+captures = [[[0, 2], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic38"
+regex = '''(a|b)c|a(b|c)'''
+input = '''ab'''
+captures = [[[0, 2], [], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic39"
+regex = '''(a|b)*c|(a|ab)*c'''
+input = '''abc'''
+captures = [[[0, 3], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic40"
+regex = '''(a|b)*c|(a|ab)*c'''
+input = '''xc'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic41"
+regex = '''(.a|.b).*|.*(.a|.b)'''
+input = '''xa'''
+captures = [[[0, 2], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic42"
+regex = '''a?(ab|ba)ab'''
+input = '''abab'''
+captures = [[[0, 4], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic43"
+regex = '''a?(ac{0}b|ba)ab'''
+input = '''abab'''
+captures = [[[0, 4], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic44"
+regex = '''ab|abab'''
+input = '''abbabab'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic45"
+regex = '''aba|bab|bba'''
+input = '''baaabbbaba'''
+captures = [[[5, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic46"
+regex = '''aba|bab'''
+input = '''baaabbbaba'''
+captures = [[[6, 9]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic47"
+regex = '''(aa|aaa)*|(a|aaaaa)'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic48"
+regex = '''(a.|.a.)*|(a|.a...)'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic49"
+regex = '''ab|a'''
+input = '''xabc'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic50"
+regex = '''ab|a'''
+input = '''xxabc'''
+captures = [[[2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic51"
+regex = '''(Ab|cD)*'''
+input = '''aBcD'''
+captures = [[[0, 4], [2, 4]]]
+match_limit = 1
+unescape = true
+case_insensitive = true
+
+[[tests]]
+name = "basic52"
+regex = '''[^-]'''
+input = '''--a'''
+captures = [[[2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic53"
+regex = '''[a-]*'''
+input = '''--a'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic54"
+regex = '''[a-m-]*'''
+input = '''--amoma--'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic55"
+regex = ''':::1:::0:|:::1:1:0:'''
+input = ''':::0:::1:::1:::0:'''
+captures = [[[8, 17]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic56"
+regex = ''':::1:::0:|:::1:1:1:'''
+input = ''':::0:::1:::1:::0:'''
+captures = [[[8, 17]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic57"
+regex = '''[[:upper:]]'''
+input = '''A'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic58"
+regex = '''[[:lower:]]+'''
+input = '''`az{'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic59"
+regex = '''[[:upper:]]+'''
+input = '''@AZ['''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic65"
+regex = '''\n'''
+input = '''\n'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic66"
+regex = '''\n'''
+input = '''\n'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic67"
+regex = '''[^a]'''
+input = '''\n'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic68"
+regex = '''\na'''
+input = '''\na'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic69"
+regex = '''(a)(b)(c)'''
+input = '''abc'''
+captures = [[[0, 3], [0, 1], [1, 2], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic70"
+regex = '''xxx'''
+input = '''xxx'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic71"
+regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''feb 6,'''
+captures = [[[0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic72"
+regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''2/7'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic73"
+regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''feb 1,Feb 6'''
+captures = [[[5, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic74"
+regex = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))'''
+input = '''x'''
+captures = [[[0, 1], [0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic75"
+regex = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*'''
+input = '''xx'''
+captures = [[[0, 2], [1, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic76"
+regex = '''a?(ab|ba)*'''
+input = '''ababababababababababababababababababababababababababababababababababababababababa'''
+captures = [[[0, 81], [79, 81]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic77"
+regex = '''abaa|abbaa|abbbaa|abbbbaa'''
+input = '''ababbabbbabbbabbbbabbbbaa'''
+captures = [[[18, 25]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic78"
+regex = '''abaa|abbaa|abbbaa|abbbbaa'''
+input = '''ababbabbbabbbabbbbabaa'''
+captures = [[[18, 22]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic79"
+regex = '''aaac|aabc|abac|abbc|baac|babc|bbac|bbbc'''
+input = '''baaabbbabac'''
+captures = [[[7, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic80"
+regex = '''.*'''
+input = '''\x01\x7f'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic81"
+regex = '''aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll'''
+input = '''XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa'''
+captures = [[[53, 57]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic83"
+regex = '''a*a*a*a*a*b'''
+input = '''aaaaaaaaab'''
+captures = [[[0, 10]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic84"
+regex = '''^'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic85"
+regex = '''$'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic86"
+regex = '''^$'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic87"
+regex = '''^a$'''
+input = '''a'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic88"
+regex = '''abc'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic89"
+regex = '''abc'''
+input = '''xabcy'''
+captures = [[[1, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic90"
+regex = '''abc'''
+input = '''ababc'''
+captures = [[[2, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic91"
+regex = '''ab*c'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic92"
+regex = '''ab*bc'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic93"
+regex = '''ab*bc'''
+input = '''abbc'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic94"
+regex = '''ab*bc'''
+input = '''abbbbc'''
+captures = [[[0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic95"
+regex = '''ab+bc'''
+input = '''abbc'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic96"
+regex = '''ab+bc'''
+input = '''abbbbc'''
+captures = [[[0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic97"
+regex = '''ab?bc'''
+input = '''abbc'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic98"
+regex = '''ab?bc'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic99"
+regex = '''ab?c'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic100"
+regex = '''^abc$'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic101"
+regex = '''^abc'''
+input = '''abcc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic102"
+regex = '''abc$'''
+input = '''aabc'''
+captures = [[[1, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic103"
+regex = '''^'''
+input = '''abc'''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic104"
+regex = '''$'''
+input = '''abc'''
+captures = [[[3, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic105"
+regex = '''a.c'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic106"
+regex = '''a.c'''
+input = '''axc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic107"
+regex = '''a.*c'''
+input = '''axyzc'''
+captures = [[[0, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic108"
+regex = '''a[bc]d'''
+input = '''abd'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic109"
+regex = '''a[b-d]e'''
+input = '''ace'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic110"
+regex = '''a[b-d]'''
+input = '''aac'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic111"
+regex = '''a[-b]'''
+input = '''a-'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic112"
+regex = '''a[b-]'''
+input = '''a-'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic113"
+regex = '''a]'''
+input = '''a]'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic114"
+regex = '''a[]]b'''
+input = '''a]b'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic115"
+regex = '''a[^bc]d'''
+input = '''aed'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic116"
+regex = '''a[^-b]c'''
+input = '''adc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic117"
+regex = '''a[^]b]c'''
+input = '''adc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic118"
+regex = '''ab|cd'''
+input = '''abc'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic119"
+regex = '''ab|cd'''
+input = '''abcd'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic120"
+regex = '''a\(b'''
+input = '''a(b'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic121"
+regex = '''a\(*b'''
+input = '''ab'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic122"
+regex = '''a\(*b'''
+input = '''a((b'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic123"
+regex = '''((a))'''
+input = '''abc'''
+captures = [[[0, 1], [0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic124"
+regex = '''(a)b(c)'''
+input = '''abc'''
+captures = [[[0, 3], [0, 1], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic125"
+regex = '''a+b+c'''
+input = '''aabbabc'''
+captures = [[[4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic126"
+regex = '''a*'''
+input = '''aaa'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic128"
+regex = '''(a*)*'''
+input = '''-'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic129"
+regex = '''(a*)+'''
+input = '''-'''
+captures = [[[0, 0], [0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic131"
+regex = '''(a*|b)*'''
+input = '''-'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic132"
+regex = '''(a+|b)*'''
+input = '''ab'''
+captures = [[[0, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic133"
+regex = '''(a+|b)+'''
+input = '''ab'''
+captures = [[[0, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic134"
+regex = '''(a+|b)?'''
+input = '''ab'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic135"
+regex = '''[^ab]*'''
+input = '''cde'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic137"
+regex = '''(^)*'''
+input = '''-'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic138"
+regex = '''a*'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic139"
+regex = '''([abc])*d'''
+input = '''abbbcd'''
+captures = [[[0, 6], [4, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic140"
+regex = '''([abc])*bcd'''
+input = '''abcd'''
+captures = [[[0, 4], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic141"
+regex = '''a|b|c|d|e'''
+input = '''e'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic142"
+regex = '''(a|b|c|d|e)f'''
+input = '''ef'''
+captures = [[[0, 2], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic144"
+regex = '''((a*|b))*'''
+input = '''-'''
+captures = [[[0, 0], [], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic145"
+regex = '''abcd*efg'''
+input = '''abcdefg'''
+captures = [[[0, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic146"
+regex = '''ab*'''
+input = '''xabyabbbz'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic147"
+regex = '''ab*'''
+input = '''xayabbbz'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic148"
+regex = '''(ab|cd)e'''
+input = '''abcde'''
+captures = [[[2, 5], [2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic149"
+regex = '''[abhgefdc]ij'''
+input = '''hij'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic150"
+regex = '''(a|b)c*d'''
+input = '''abcd'''
+captures = [[[1, 4], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic151"
+regex = '''(ab|ab*)bc'''
+input = '''abc'''
+captures = [[[0, 3], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic152"
+regex = '''a([bc]*)c*'''
+input = '''abc'''
+captures = [[[0, 3], [1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic153"
+regex = '''a([bc]*)(c*d)'''
+input = '''abcd'''
+captures = [[[0, 4], [1, 3], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic154"
+regex = '''a([bc]+)(c*d)'''
+input = '''abcd'''
+captures = [[[0, 4], [1, 3], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic155"
+regex = '''a([bc]*)(c+d)'''
+input = '''abcd'''
+captures = [[[0, 4], [1, 2], [2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic156"
+regex = '''a[bcd]*dcdcde'''
+input = '''adcdcde'''
+captures = [[[0, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic157"
+regex = '''(ab|a)b*c'''
+input = '''abc'''
+captures = [[[0, 3], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic158"
+regex = '''((a)(b)c)(d)'''
+input = '''abcd'''
+captures = [[[0, 4], [0, 3], [0, 1], [1, 2], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic159"
+regex = '''[A-Za-z_][A-Za-z0-9_]*'''
+input = '''alpha'''
+captures = [[[0, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic160"
+regex = '''^a(bc+|b[eh])g|.h$'''
+input = '''abh'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic161"
+regex = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''effgz'''
+captures = [[[0, 5], [0, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic162"
+regex = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''ij'''
+captures = [[[0, 2], [0, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic163"
+regex = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''reffgz'''
+captures = [[[1, 6], [1, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic164"
+regex = '''(((((((((a)))))))))'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic165"
+regex = '''multiple words'''
+input = '''multiple words yeah'''
+captures = [[[0, 14]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic166"
+regex = '''(.*)c(.*)'''
+input = '''abcde'''
+captures = [[[0, 5], [0, 2], [3, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic167"
+regex = '''abcd'''
+input = '''abcd'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic168"
+regex = '''a(bc)d'''
+input = '''abcd'''
+captures = [[[0, 4], [1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic169"
+regex = '''a[\x01-\x03]?c'''
+input = '''a\x02c'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic170"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Qaddafi'''
+captures = [[[0, 15], [], [10, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic171"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mo'ammar Gadhafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic172"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Kaddafi'''
+captures = [[[0, 15], [], [10, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic173"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Qadhafi'''
+captures = [[[0, 15], [], [10, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic174"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Gadafi'''
+captures = [[[0, 14], [], [10, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic175"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mu'ammar Qadafi'''
+captures = [[[0, 15], [], [11, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic176"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moamar Gaddafi'''
+captures = [[[0, 14], [], [9, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic177"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mu'ammar Qadhdhafi'''
+captures = [[[0, 18], [], [13, 15]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic178"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Khaddafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic179"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghaddafy'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic180"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghadafi'''
+captures = [[[0, 15], [], [11, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic181"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghaddafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic182"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muamar Kaddafi'''
+captures = [[[0, 14], [], [9, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic183"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Quathafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic184"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Gheddafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic185"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moammar Khadafy'''
+captures = [[[0, 15], [], [11, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic186"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moammar Qudhafi'''
+captures = [[[0, 15], [], [10, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic187"
+regex = '''a+(b|c)*d+'''
+input = '''aabcdd'''
+captures = [[[0, 6], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic188"
+regex = '''^.+$'''
+input = '''vivi'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic189"
+regex = '''^(.+)$'''
+input = '''vivi'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic190"
+regex = '''^([^!.]+).att.com!(.+)$'''
+input = '''gryphon.att.com!eby'''
+captures = [[[0, 19], [0, 7], [16, 19]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic191"
+regex = '''^([^!]+!)?([^!]+)$'''
+input = '''bas'''
+captures = [[[0, 3], [], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic192"
+regex = '''^([^!]+!)?([^!]+)$'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic193"
+regex = '''^([^!]+!)?([^!]+)$'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic194"
+regex = '''^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bar!bas'''
+captures = [[[0, 11], [4, 8], [8, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic195"
+regex = '''((foo)|(bar))!bas'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 3], [], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic196"
+regex = '''((foo)|(bar))!bas'''
+input = '''foo!bar!bas'''
+captures = [[[4, 11], [4, 7], [], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic197"
+regex = '''((foo)|(bar))!bas'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 3], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic198"
+regex = '''((foo)|bar)!bas'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic199"
+regex = '''((foo)|bar)!bas'''
+input = '''foo!bar!bas'''
+captures = [[[4, 11], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic200"
+regex = '''((foo)|bar)!bas'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 3], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic201"
+regex = '''(foo|(bar))!bas'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 3], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic202"
+regex = '''(foo|(bar))!bas'''
+input = '''foo!bar!bas'''
+captures = [[[4, 11], [4, 7], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic203"
+regex = '''(foo|(bar))!bas'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic204"
+regex = '''(foo|bar)!bas'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic205"
+regex = '''(foo|bar)!bas'''
+input = '''foo!bar!bas'''
+captures = [[[4, 11], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic206"
+regex = '''(foo|bar)!bas'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic207"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bar!bas'''
+captures = [[[0, 11], [0, 11], [], [], [4, 8], [8, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic208"
+regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''bas'''
+captures = [[[0, 3], [], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic209"
+regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic210"
+regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bar!bas'''
+captures = [[[0, 11], [], [], [4, 8], [8, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic211"
+regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic212"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''bas'''
+captures = [[[0, 3], [0, 3], [], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic213"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic214"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bar!bas'''
+captures = [[[0, 11], [0, 11], [], [], [4, 8], [8, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic215"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic216"
+regex = '''.*(/XXX).*'''
+input = '''/XXX'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic217"
+regex = '''.*(\\XXX).*'''
+input = '''\\XXX'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic218"
+regex = '''\\XXX'''
+input = '''\\XXX'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic219"
+regex = '''.*(/000).*'''
+input = '''/000'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic220"
+regex = '''.*(\\000).*'''
+input = '''\\000'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic221"
+regex = '''\\000'''
+input = '''\\000'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
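Every entry in the generated files above and below follows one small schema: `name`, `regex`, `input`, a `captures` array (one element per expected match, each a list of group spans, where `[]` marks a group that did not participate and an empty top-level array means no match at all), plus `match_limit` and `unescape`. As a reading aid, here is a minimal, hypothetical Rust loader for that schema; it assumes the serde and toml crates as dependencies and is a sketch, not the crate's actual test harness:

    // Hypothetical dependencies (an assumption, not taken from this repo):
    // serde = { version = "1", features = ["derive"] }
    // toml = "0.5"
    use serde::Deserialize;

    /// One generated suite file, e.g. tests/data/fowler/basic.toml.
    #[derive(Debug, Deserialize)]
    struct Suite {
        tests: Vec<Case>,
    }

    /// Field set inferred from the entries in this patch. `captures` holds
    /// one Vec per expected match; inside it, `[]` means "group unset".
    #[derive(Debug, Deserialize)]
    struct Case {
        name: String,
        regex: String,
        input: String,
        #[serde(default)]
        captures: Vec<Vec<Vec<usize>>>,
        #[serde(default)]
        match_limit: Option<u32>,
        #[serde(default)]
        unescape: bool,
    }

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let raw = std::fs::read_to_string("tests/data/fowler/basic.toml")?;
        let suite: Suite = toml::from_str(&raw)?;
        for case in &suite.tests {
            println!("{}: regex={:?} input={:?}", case.name, case.regex, case.input);
        }
        Ok(())
    }

When `unescape = true`, sequences such as `\n` and `\x02` in `regex` and `input` are meant to be decoded into the corresponding bytes before the test runs; a real harness would apply that step before compiling the pattern.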
diff --git a/vendor/regex-automata/data/tests/fowler/README b/vendor/regex-automata/tests/data/fowler/dat/README
index 55507f03f..e70072500 100644
--- a/vendor/regex-automata/data/tests/fowler/README
+++ b/vendor/regex-automata/tests/data/fowler/dat/README
@@ -14,10 +14,11 @@ by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
have been a bad idea, but I think being consistent with an established Regex
library is worth something.
-After some number of years, these tests were transformed into a JSON format
-using the fowler-to-json script in this directory, e.g.,
+After some number of years, these tests were transformed into a TOML format
+using the fowler-to-toml script in the 'scripts' directory. To re-generate the
+TOML files, run the following from the root of this repository:
- ./fowler-to-json basic.dat > basic.json
+ ./scripts/fowler-to-toml tests/data/fowler tests/data/fowler/dat/*.dat
which brings them into a sensible structured format in which other tests can
be written.
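The heart of that conversion is mapping each .dat record's span list onto the TOML `captures` form. As a hedged approximation only (the real logic lives in scripts/fowler-to-toml and is not shown here), a record's final field is either `NOMATCH` or a span list such as `(0,2)(?,?)(0,2)`, where `(?,?)` marks an unset group:

    /// Sketch of the span-list translation only; the actual script also
    /// handles flags, NULL inputs, and escaping, which this ignores.
    fn parse_spans(field: &str) -> Option<Vec<Vec<usize>>> {
        if field == "NOMATCH" {
            return None; // becomes `captures = []` in the TOML
        }
        let mut groups = Vec::new();
        for part in field.trim_matches(|c| c == '(' || c == ')').split(")(") {
            if part == "?,?" {
                groups.push(Vec::new()); // unset group -> `[]`
            } else {
                let (s, e) = part.split_once(',').expect("span must look like 's,e'");
                groups.push(vec![
                    s.trim().parse().expect("start offset"),
                    e.trim().parse().expect("end offset"),
                ]);
            }
        }
        Some(groups)
    }

    fn main() {
        // Corresponds to a TOML entry like: captures = [[[0, 2], [], [0, 2]]]
        assert_eq!(
            parse_spans("(0,2)(?,?)(0,2)"),
            Some(vec![vec![0, 2], vec![], vec![0, 2]])
        );
        assert_eq!(parse_spans("NOMATCH"), None);
        println!("ok");
    }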
diff --git a/vendor/regex-automata/data/fowler-tests/basic.dat b/vendor/regex-automata/tests/data/fowler/dat/basic.dat
index e55efaeec..e55efaeec 100644
--- a/vendor/regex-automata/data/fowler-tests/basic.dat
+++ b/vendor/regex-automata/tests/data/fowler/dat/basic.dat
diff --git a/vendor/regex-automata/data/fowler-tests/nullsubexpr.dat b/vendor/regex-automata/tests/data/fowler/dat/nullsubexpr.dat
index 2e18fbb91..2e18fbb91 100644
--- a/vendor/regex-automata/data/fowler-tests/nullsubexpr.dat
+++ b/vendor/regex-automata/tests/data/fowler/dat/nullsubexpr.dat
diff --git a/vendor/regex-automata/data/tests/fowler/repetition-long.dat b/vendor/regex-automata/tests/data/fowler/dat/repetition-expensive.dat
index c91580236..c91580236 100644
--- a/vendor/regex-automata/data/tests/fowler/repetition-long.dat
+++ b/vendor/regex-automata/tests/data/fowler/dat/repetition-expensive.dat
diff --git a/vendor/regex-automata/data/tests/fowler/repetition.dat b/vendor/regex-automata/tests/data/fowler/dat/repetition.dat
index 2dac0823f..2dac0823f 100644
--- a/vendor/regex-automata/data/tests/fowler/repetition.dat
+++ b/vendor/regex-automata/tests/data/fowler/dat/repetition.dat
diff --git a/vendor/regex-automata/tests/data/fowler/nullsubexpr.toml b/vendor/regex-automata/tests/data/fowler/nullsubexpr.toml
new file mode 100644
index 000000000..55d1d5b43
--- /dev/null
+++ b/vendor/regex-automata/tests/data/fowler/nullsubexpr.toml
@@ -0,0 +1,405 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "nullsubexpr3"
+regex = '''(a*)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr5"
+regex = '''(a*)*'''
+input = '''x'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr6"
+regex = '''(a*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr7"
+regex = '''(a*)*'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr8"
+regex = '''(a*)+'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr9"
+regex = '''(a*)+'''
+input = '''x'''
+captures = [[[0, 0], [0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr10"
+regex = '''(a*)+'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr11"
+regex = '''(a*)+'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr12"
+regex = '''(a+)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr13"
+regex = '''(a+)*'''
+input = '''x'''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr14"
+regex = '''(a+)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr15"
+regex = '''(a+)*'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr16"
+regex = '''(a+)+'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr17"
+regex = '''(a+)+'''
+input = '''x'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr18"
+regex = '''(a+)+'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr19"
+regex = '''(a+)+'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr21"
+regex = '''([a]*)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr23"
+regex = '''([a]*)*'''
+input = '''x'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr24"
+regex = '''([a]*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr25"
+regex = '''([a]*)*'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr26"
+regex = '''([a]*)+'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr27"
+regex = '''([a]*)+'''
+input = '''x'''
+captures = [[[0, 0], [0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr28"
+regex = '''([a]*)+'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr29"
+regex = '''([a]*)+'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr30"
+regex = '''([^b]*)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr32"
+regex = '''([^b]*)*'''
+input = '''b'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr33"
+regex = '''([^b]*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr34"
+regex = '''([^b]*)*'''
+input = '''aaaaaab'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr35"
+regex = '''([ab]*)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr36"
+regex = '''([ab]*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr37"
+regex = '''([ab]*)*'''
+input = '''ababab'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr38"
+regex = '''([ab]*)*'''
+input = '''bababa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr39"
+regex = '''([ab]*)*'''
+input = '''b'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr40"
+regex = '''([ab]*)*'''
+input = '''bbbbbb'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr41"
+regex = '''([ab]*)*'''
+input = '''aaaabcde'''
+captures = [[[0, 5], [0, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr42"
+regex = '''([^a]*)*'''
+input = '''b'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr43"
+regex = '''([^a]*)*'''
+input = '''bbbbbb'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr45"
+regex = '''([^a]*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr46"
+regex = '''([^ab]*)*'''
+input = '''ccccxx'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr48"
+regex = '''([^ab]*)*'''
+input = '''ababab'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr50"
+regex = '''((z)+|a)*'''
+input = '''zabcde'''
+captures = [[[0, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr69"
+regex = '''(a*)*(x)'''
+input = '''x'''
+captures = [[[0, 1], [], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr70"
+regex = '''(a*)*(x)'''
+input = '''ax'''
+captures = [[[0, 2], [0, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr71"
+regex = '''(a*)*(x)'''
+input = '''axa'''
+captures = [[[0, 2], [0, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr73"
+regex = '''(a*)+(x)'''
+input = '''x'''
+captures = [[[0, 1], [0, 0], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr74"
+regex = '''(a*)+(x)'''
+input = '''ax'''
+captures = [[[0, 2], [0, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr75"
+regex = '''(a*)+(x)'''
+input = '''axa'''
+captures = [[[0, 2], [0, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr77"
+regex = '''(a*){2}(x)'''
+input = '''x'''
+captures = [[[0, 1], [0, 0], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr78"
+regex = '''(a*){2}(x)'''
+input = '''ax'''
+captures = [[[0, 2], [1, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr79"
+regex = '''(a*){2}(x)'''
+input = '''axa'''
+captures = [[[0, 2], [1, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
diff --git a/vendor/regex-automata/tests/data/fowler/repetition-expensive.toml b/vendor/regex-automata/tests/data/fowler/repetition-expensive.toml
new file mode 100644
index 000000000..81a896452
--- /dev/null
+++ b/vendor/regex-automata/tests/data/fowler/repetition-expensive.toml
@@ -0,0 +1,341 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "repetition-expensive12"
+regex = '''X(.?){0,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive13"
+regex = '''X(.?){1,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive14"
+regex = '''X(.?){2,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive15"
+regex = '''X(.?){3,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive16"
+regex = '''X(.?){4,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive17"
+regex = '''X(.?){5,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive18"
+regex = '''X(.?){6,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive19"
+regex = '''X(.?){7,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive20"
+regex = '''X(.?){8,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive22"
+regex = '''X(.?){0,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive24"
+regex = '''X(.?){1,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive26"
+regex = '''X(.?){2,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive28"
+regex = '''X(.?){3,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive30"
+regex = '''X(.?){4,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive32"
+regex = '''X(.?){5,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive34"
+regex = '''X(.?){6,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive36"
+regex = '''X(.?){7,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive37"
+regex = '''X(.?){8,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive48"
+regex = '''(a|ab|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive49"
+regex = '''(a|ab|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive50"
+regex = '''(a|ab|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive51"
+regex = '''(a|ab|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive52"
+regex = '''(a|ab|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive53"
+regex = '''(a|ab|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive54"
+regex = '''(a|ab|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive55"
+regex = '''(a|ab|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive56"
+regex = '''(a|ab|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive57"
+regex = '''(a|ab|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive58"
+regex = '''(a|ab|c|bcd)*(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive59"
+regex = '''(a|ab|c|bcd)+(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive65"
+regex = '''(ab|a|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive67"
+regex = '''(ab|a|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive69"
+regex = '''(ab|a|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive71"
+regex = '''(ab|a|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive72"
+regex = '''(ab|a|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive74"
+regex = '''(ab|a|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive76"
+regex = '''(ab|a|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive78"
+regex = '''(ab|a|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive80"
+regex = '''(ab|a|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive81"
+regex = '''(ab|a|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive83"
+regex = '''(ab|a|c|bcd)*(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive85"
+regex = '''(ab|a|c|bcd)+(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
diff --git a/vendor/regex-automata/tests/data/fowler/repetition-long.toml b/vendor/regex-automata/tests/data/fowler/repetition-long.toml
new file mode 100644
index 000000000..fa24c834a
--- /dev/null
+++ b/vendor/regex-automata/tests/data/fowler/repetition-long.toml
@@ -0,0 +1,341 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "repetition-long12"
+regex = '''X(.?){0,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long13"
+regex = '''X(.?){1,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long14"
+regex = '''X(.?){2,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long15"
+regex = '''X(.?){3,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long16"
+regex = '''X(.?){4,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long17"
+regex = '''X(.?){5,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long18"
+regex = '''X(.?){6,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long19"
+regex = '''X(.?){7,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long20"
+regex = '''X(.?){8,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long22"
+regex = '''X(.?){0,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long24"
+regex = '''X(.?){1,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long26"
+regex = '''X(.?){2,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long28"
+regex = '''X(.?){3,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long30"
+regex = '''X(.?){4,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long32"
+regex = '''X(.?){5,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long34"
+regex = '''X(.?){6,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long36"
+regex = '''X(.?){7,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long37"
+regex = '''X(.?){8,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long48"
+regex = '''(a|ab|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long49"
+regex = '''(a|ab|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long50"
+regex = '''(a|ab|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long51"
+regex = '''(a|ab|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long52"
+regex = '''(a|ab|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long53"
+regex = '''(a|ab|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long54"
+regex = '''(a|ab|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long55"
+regex = '''(a|ab|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long56"
+regex = '''(a|ab|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long57"
+regex = '''(a|ab|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long58"
+regex = '''(a|ab|c|bcd)*(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long59"
+regex = '''(a|ab|c|bcd)+(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long65"
+regex = '''(ab|a|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long67"
+regex = '''(ab|a|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long69"
+regex = '''(ab|a|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long71"
+regex = '''(ab|a|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long72"
+regex = '''(ab|a|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long74"
+regex = '''(ab|a|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long76"
+regex = '''(ab|a|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long78"
+regex = '''(ab|a|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long80"
+regex = '''(ab|a|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long81"
+regex = '''(ab|a|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long83"
+regex = '''(ab|a|c|bcd)*(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long85"
+regex = '''(ab|a|c|bcd)+(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
diff --git a/vendor/regex-automata/tests/data/fowler/repetition.toml b/vendor/regex-automata/tests/data/fowler/repetition.toml
new file mode 100644
index 000000000..fc8da8df4
--- /dev/null
+++ b/vendor/regex-automata/tests/data/fowler/repetition.toml
@@ -0,0 +1,397 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "repetition10"
+regex = '''((..)|(.))'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition11"
+regex = '''((..)|(.))((..)|(.))'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition12"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition14"
+regex = '''((..)|(.)){1}'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition15"
+regex = '''((..)|(.)){2}'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition16"
+regex = '''((..)|(.)){3}'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition18"
+regex = '''((..)|(.))*'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition20"
+regex = '''((..)|(.))'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition21"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''a'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition22"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''a'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition24"
+regex = '''((..)|(.)){1}'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition25"
+regex = '''((..)|(.)){2}'''
+input = '''a'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition26"
+regex = '''((..)|(.)){3}'''
+input = '''a'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition28"
+regex = '''((..)|(.))*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition30"
+regex = '''((..)|(.))'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition31"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aa'''
+captures = [[[0, 2], [0, 1], [], [0, 1], [1, 2], [], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition32"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aa'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition34"
+regex = '''((..)|(.)){1}'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition35"
+regex = '''((..)|(.)){2}'''
+input = '''aa'''
+captures = [[[0, 2], [1, 2], [], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition36"
+regex = '''((..)|(.)){3}'''
+input = '''aa'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition38"
+regex = '''((..)|(.))*'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition40"
+regex = '''((..)|(.))'''
+input = '''aaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition41"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aaa'''
+captures = [[[0, 3], [0, 2], [0, 2], [], [2, 3], [], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition42"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaa'''
+captures = [[[0, 3], [0, 1], [], [0, 1], [1, 2], [], [1, 2], [2, 3], [], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition44"
+regex = '''((..)|(.)){1}'''
+input = '''aaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition46"
+regex = '''((..)|(.)){2}'''
+input = '''aaa'''
+captures = [[[0, 3], [2, 3], [0, 2], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition47"
+regex = '''((..)|(.)){3}'''
+input = '''aaa'''
+captures = [[[0, 3], [2, 3], [], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition50"
+regex = '''((..)|(.))*'''
+input = '''aaa'''
+captures = [[[0, 3], [2, 3], [0, 2], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition52"
+regex = '''((..)|(.))'''
+input = '''aaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition53"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aaaa'''
+captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition54"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaa'''
+captures = [[[0, 4], [0, 2], [0, 2], [], [2, 3], [], [2, 3], [3, 4], [], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition56"
+regex = '''((..)|(.)){1}'''
+input = '''aaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition57"
+regex = '''((..)|(.)){2}'''
+input = '''aaaa'''
+captures = [[[0, 4], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition59"
+regex = '''((..)|(.)){3}'''
+input = '''aaaa'''
+captures = [[[0, 4], [3, 4], [0, 2], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition61"
+regex = '''((..)|(.))*'''
+input = '''aaaa'''
+captures = [[[0, 4], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition63"
+regex = '''((..)|(.))'''
+input = '''aaaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition64"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aaaaa'''
+captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition65"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaaa'''
+captures = [[[0, 5], [0, 2], [0, 2], [], [2, 4], [2, 4], [], [4, 5], [], [4, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition67"
+regex = '''((..)|(.)){1}'''
+input = '''aaaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition68"
+regex = '''((..)|(.)){2}'''
+input = '''aaaaa'''
+captures = [[[0, 4], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition70"
+regex = '''((..)|(.)){3}'''
+input = '''aaaaa'''
+captures = [[[0, 5], [4, 5], [2, 4], [4, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition73"
+regex = '''((..)|(.))*'''
+input = '''aaaaa'''
+captures = [[[0, 5], [4, 5], [2, 4], [4, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition75"
+regex = '''((..)|(.))'''
+input = '''aaaaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition76"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aaaaaa'''
+captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition77"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 2], [0, 2], [], [2, 4], [2, 4], [], [4, 6], [4, 6], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition79"
+regex = '''((..)|(.)){1}'''
+input = '''aaaaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition80"
+regex = '''((..)|(.)){2}'''
+input = '''aaaaaa'''
+captures = [[[0, 4], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition81"
+regex = '''((..)|(.)){3}'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [4, 6], [4, 6], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition83"
+regex = '''((..)|(.))*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [4, 6], [4, 6], []]]
+match_limit = 1
+unescape = true
+
diff --git a/vendor/regex-automata/tests/data/iter.toml b/vendor/regex-automata/tests/data/iter.toml
new file mode 100644
index 000000000..6c0539fd4
--- /dev/null
+++ b/vendor/regex-automata/tests/data/iter.toml
@@ -0,0 +1,119 @@
+[[tests]]
+name = "1"
+regex = "a"
+input = "aaa"
+matches = [[0, 1], [1, 2], [2, 3]]
+
+[[tests]]
+name = "2"
+regex = "a"
+input = "aba"
+matches = [[0, 1], [2, 3]]
+
+[[tests]]
+name = "empty1"
+regex = ''
+input = ''
+matches = [[0, 0]]
+
+[[tests]]
+name = "empty2"
+regex = ''
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty3"
+regex = '()'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty4"
+regex = '()*'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty5"
+regex = '()+'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty6"
+regex = '()?'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty7"
+regex = '()()'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty8"
+regex = '()+|z'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty9"
+regex = 'z|()+'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty10"
+regex = '()+|b'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty11"
+regex = 'b|()+'
+input = 'abc'
+matches = [[0, 0], [1, 2], [3, 3]]
+
+[[tests]]
+name = "start1"
+regex = "^a"
+input = "a"
+matches = [[0, 1]]
+
+[[tests]]
+name = "start2"
+regex = "^a"
+input = "aa"
+matches = [[0, 1]]
+
+[[tests]]
+name = "anchored1"
+regex = "a"
+input = "a"
+matches = [[0, 1]]
+anchored = true
+
+# This test is pretty subtle. It demonstrates the crucial difference between
+# '^a' and 'a' compiled in 'anchored' mode. The former regex exclusively
+# matches at the start of a haystack and nowhere else. The latter regex has
+# no such restriction, but its automaton is constructed such that it lacks a
+# `.*?` prefix. So it can actually produce matches at multiple locations.
+# The anchored3 test drives this point home.
+[[tests]]
+name = "anchored2"
+regex = "a"
+input = "aa"
+matches = [[0, 1], [1, 2]]
+anchored = true
+
+# Unlike anchored2, this test stops matching anything after it sees `b`,
+# since its automaton lacks a `.*?` prefix: once it is looking for 'a' but
+# sees 'b', it determines that there are no remaining matches.
+[[tests]]
+name = "anchored3"
+regex = "a"
+input = "aaba"
+matches = [[0, 1], [1, 2]]
+anchored = true
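To make the anchored2/anchored3 comments above concrete: under anchored iteration, each match must begin exactly where the search resumes, namely at the end of the previous match. The following self-contained Rust stand-in (a pedagogical sketch over a literal pattern, not regex-automata's API) reproduces both expectations:

    /// Stand-in for anchored iteration: every match must start exactly
    /// where the previous one ended; the scan stops at the first failure.
    fn anchored_iter(pat: &str, hay: &str) -> Vec<(usize, usize)> {
        assert!(!pat.is_empty(), "an empty pattern would loop forever here");
        let mut out = Vec::new();
        let mut at = 0;
        while hay[at..].starts_with(pat) {
            out.push((at, at + pat.len()));
            at += pat.len();
        }
        out
    }

    fn main() {
        // anchored2: nothing interrupts the run of 'a's.
        assert_eq!(anchored_iter("a", "aa"), vec![(0, 1), (1, 2)]);
        // anchored3: the 'b' at offset 2 ends the scan, so the final 'a'
        // is never reported.
        assert_eq!(anchored_iter("a", "aaba"), vec![(0, 1), (1, 2)]);
        println!("ok");
    }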
diff --git a/vendor/regex-automata/tests/data/misc.toml b/vendor/regex-automata/tests/data/misc.toml
new file mode 100644
index 000000000..c05418dd6
--- /dev/null
+++ b/vendor/regex-automata/tests/data/misc.toml
@@ -0,0 +1,99 @@
+[[tests]]
+name = "ascii-literal"
+regex = "a"
+input = "a"
+matches = [[0, 1]]
+
+[[tests]]
+name = "ascii-literal-not"
+regex = "a"
+input = "z"
+matches = []
+
+[[tests]]
+name = "ascii-literal-anchored"
+regex = "a"
+input = "a"
+matches = [[0, 1]]
+anchored = true
+
+[[tests]]
+name = "ascii-literal-anchored-not"
+regex = "a"
+input = "z"
+matches = []
+anchored = true
+
+[[tests]]
+name = "anchor-start-end-line"
+regex = '(?m)^bar$'
+input = "foo\nbar\nbaz"
+matches = [[4, 7]]
+
+[[tests]]
+name = "prefix-literal-match"
+regex = '^abc'
+input = "abc"
+matches = [[0, 3]]
+
+[[tests]]
+name = "prefix-literal-match-ascii"
+regex = '^abc'
+input = "abc"
+matches = [[0, 3]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "prefix-literal-no-match"
+regex = '^abc'
+input = "zabc"
+matches = []
+
+[[tests]]
+name = "one-literal-edge"
+regex = 'abc'
+input = "xxxxxab"
+matches = []
+
+[[tests]]
+name = "terminates"
+regex = 'a$'
+input = "a"
+matches = [[0, 1]]
+
+[[tests]]
+name = "suffix-100"
+regex = '.*abcd'
+input = "abcd"
+matches = [[0, 4]]
+
+[[tests]]
+name = "suffix-200"
+regex = '.*(?:abcd)+'
+input = "abcd"
+matches = [[0, 4]]
+
+[[tests]]
+name = "suffix-300"
+regex = '.*(?:abcd)+'
+input = "abcdabcd"
+matches = [[0, 8]]
+
+[[tests]]
+name = "suffix-400"
+regex = '.*(?:abcd)+'
+input = "abcdxabcd"
+matches = [[0, 9]]
+
+[[tests]]
+name = "suffix-500"
+regex = '.*x(?:abcd)+'
+input = "abcdxabcd"
+matches = [[0, 9]]
+
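+# Here the match can only start at offset 4: '[^abcd]*' must match the empty
+# string just before 'x', since every earlier byte is in [abcd].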
+[[tests]]
+name = "suffix-600"
+regex = '[^abcd]*x(?:abcd)+'
+input = "abcdxabcd"
+matches = [[4, 9]]
diff --git a/vendor/regex-automata/tests/data/multiline.toml b/vendor/regex-automata/tests/data/multiline.toml
new file mode 100644
index 000000000..cefdb2629
--- /dev/null
+++ b/vendor/regex-automata/tests/data/multiline.toml
@@ -0,0 +1,275 @@
+[[tests]]
+name = "basic1"
+regex = '(?m)^[a-z]+$'
+input = "abc\ndef\nxyz"
+matches = [[0, 3], [4, 7], [8, 11]]
+
+[[tests]]
+name = "basic2"
+regex = '(?m)^$'
+input = "abc\ndef\nxyz"
+matches = []
+
+[[tests]]
+name = "basic3"
+regex = '(?m)^'
+input = "abc\ndef\nxyz"
+matches = [[0, 0], [4, 4], [8, 8]]
+
+[[tests]]
+name = "basic4"
+regex = '(?m)$'
+input = "abc\ndef\nxyz"
+matches = [[3, 3], [7, 7], [11, 11]]
+
+[[tests]]
+name = "basic5"
+regex = '(?m)^[a-z]'
+input = "abc\ndef\nxyz"
+matches = [[0, 1], [4, 5], [8, 9]]
+
+[[tests]]
+name = "basic6"
+regex = '(?m)[a-z]^'
+input = "abc\ndef\nxyz"
+matches = []
+
+[[tests]]
+name = "basic7"
+regex = '(?m)[a-z]$'
+input = "abc\ndef\nxyz"
+matches = [[2, 3], [6, 7], [10, 11]]
+
+[[tests]]
+name = "basic8"
+regex = '(?m)$[a-z]'
+input = "abc\ndef\nxyz"
+matches = []
+
+[[tests]]
+name = "basic9"
+regex = '(?m)^$'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "repeat1"
+regex = '(?m)(?:^$)*'
+input = "a\nb\nc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+[[tests]]
+name = "repeat1-no-multi"
+regex = '(?:^$)*'
+input = "a\nb\nc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+[[tests]]
+name = "repeat2"
+regex = '(?m)(?:^|a)+'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat100"
+regex = '(?m)(?:^|a)+'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat2-no-multi"
+regex = '(?:^|a)+'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 5]]
+
+[[tests]]
+name = "repeat3"
+regex = '(?m)(?:^|a)*'
+input = "a\naaa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat3-no-multi"
+regex = '(?:^|a)*'
+input = "a\naaa\n"
+matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
+
+[[tests]]
+name = "repeat4"
+regex = '(?m)(?:^|a+)'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat4-no-multi"
+regex = '(?:^|a+)'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 5]]
+
+[[tests]]
+name = "repeat5"
+regex = '(?m)(?:^|a*)'
+input = "a\naaa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat5-no-multi"
+regex = '(?:^|a*)'
+input = "a\naaa\n"
+matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
+
+[[tests]]
+name = "repeat6"
+regex = '(?m)(?:^[a-z])+'
+input = "abc\ndef\nxyz"
+matches = [[0, 1], [4, 5], [8, 9]]
+
+[[tests]]
+name = "repeat6-no-multi"
+regex = '(?:^[a-z])+'
+input = "abc\ndef\nxyz"
+matches = [[0, 1]]
+
+[[tests]]
+name = "repeat7"
+regex = '(?m)(?:^[a-z]{3}\n?)+'
+input = "abc\ndef\nxyz"
+matches = [[0, 11]]
+
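+# Without multi-line mode, '^' only matches at offset 0, so the repetition
+# can match at most once and consumes just "abc\n".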
+[[tests]]
+name = "repeat7-no-multi"
+regex = '(?:^[a-z]{3}\n?)+'
+input = "abc\ndef\nxyz"
+matches = [[0, 4]]
+
+[[tests]]
+name = "repeat8"
+regex = '(?m)(?:^[a-z]{3}\n?)*'
+input = "abc\ndef\nxyz"
+matches = [[0, 11]]
+
+[[tests]]
+name = "repeat8-no-multi"
+regex = '(?:^[a-z]{3}\n?)*'
+input = "abc\ndef\nxyz"
+matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]]
+
+[[tests]]
+name = "repeat9"
+regex = '(?m)(?:\n?[a-z]{3}$)+'
+input = "abc\ndef\nxyz"
+matches = [[0, 11]]
+
+[[tests]]
+name = "repeat9-no-multi"
+regex = '(?:\n?[a-z]{3}$)+'
+input = "abc\ndef\nxyz"
+matches = [[7, 11]]
+
+[[tests]]
+name = "repeat10"
+regex = '(?m)(?:\n?[a-z]{3}$)*'
+input = "abc\ndef\nxyz"
+matches = [[0, 11]]
+
+[[tests]]
+name = "repeat10-no-multi"
+regex = '(?:\n?[a-z]{3}$)*'
+input = "abc\ndef\nxyz"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]]
+
+[[tests]]
+name = "repeat11"
+regex = '(?m)^*'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat11-no-multi"
+regex = '^*'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat12"
+regex = '(?m)^+'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [4, 4]]
+
+[[tests]]
+name = "repeat12-no-multi"
+regex = '^+'
+input = "\naa\n"
+matches = [[0, 0]]
+
+[[tests]]
+name = "repeat13"
+regex = '(?m)$*'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat13-no-multi"
+regex = '$*'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat14"
+regex = '(?m)$+'
+input = "\naa\n"
+matches = [[0, 0], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat14-no-multi"
+regex = '$+'
+input = "\naa\n"
+matches = [[4, 4]]
+
+[[tests]]
+name = "repeat15"
+regex = '(?m)(?:$\n)+'
+input = "\n\naaa\n\n"
+matches = [[0, 2], [5, 7]]
+
+[[tests]]
+name = "repeat15-no-multi"
+regex = '(?:$\n)+'
+input = "\n\naaa\n\n"
+matches = []
+
+[[tests]]
+name = "repeat16"
+regex = '(?m)(?:$\n)*'
+input = "\n\naaa\n\n"
+matches = [[0, 2], [3, 3], [4, 4], [5, 7]]
+
+[[tests]]
+name = "repeat16-no-multi"
+regex = '(?:$\n)*'
+input = "\n\naaa\n\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]
+
+[[tests]]
+name = "repeat17"
+regex = '(?m)(?:$\n^)+'
+input = "\n\naaa\n\n"
+matches = [[0, 2], [5, 7]]
+
+[[tests]]
+name = "repeat17-no-multi"
+regex = '(?:$\n^)+'
+input = "\n\naaa\n\n"
+matches = []
+
+[[tests]]
+name = "repeat18"
+regex = '(?m)(?:^|$)+'
+input = "\n\naaa\n\n"
+matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]]
+
+[[tests]]
+name = "repeat18-no-multi"
+regex = '(?:^|$)+'
+input = "\n\naaa\n\n"
+matches = [[0, 0], [7, 7]]
diff --git a/vendor/regex-automata/tests/data/no-unicode.toml b/vendor/regex-automata/tests/data/no-unicode.toml
new file mode 100644
index 000000000..c7fc9664f
--- /dev/null
+++ b/vendor/regex-automata/tests/data/no-unicode.toml
@@ -0,0 +1,158 @@
+[[tests]]
+name = "invalid-utf8-literal1"
+regex = '\xFF'
+input = '\xFF'
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+unescape = true
+
+
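+# The first '.+' is Unicode-aware while the second is not, so the second can
+# consume the trailing \xFF byte even though it isn't valid UTF-8.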
+[[tests]]
+name = "mixed"
+regex = '(.+)(?-u)(.+)'
+input = '\xCE\x93\xCE\x94\xFF'
+matches = [[0, 5]]
+utf8 = false
+unescape = true
+
+
+[[tests]]
+name = "case1"
+regex = "a"
+input = "A"
+matches = [[0, 1]]
+case_insensitive = true
+unicode = false
+
+[[tests]]
+name = "case2"
+regex = "[a-z]+"
+input = "AaAaA"
+matches = [[0, 5]]
+case_insensitive = true
+unicode = false
+
+[[tests]]
+name = "case3"
+regex = "[a-z]+"
+input = "aA\u212AaA"
+matches = [[0, 7]]
+case_insensitive = true
+
+[[tests]]
+name = "case4"
+regex = "[a-z]+"
+input = "aA\u212AaA"
+matches = [[0, 2], [5, 7]]
+case_insensitive = true
+unicode = false
+
+
+[[tests]]
+name = "negate1"
+regex = "[^a]"
+input = "δ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "negate2"
+regex = "[^a]"
+input = "δ"
+matches = [[0, 1], [1, 2]]
+unicode = false
+utf8 = false
+
+
+[[tests]]
+name = "dotstar-prefix1"
+regex = "a"
+input = '\xFFa'
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+unescape = true
+
+[[tests]]
+name = "dotstar-prefix2"
+regex = "a"
+input = '\xFFa'
+matches = [[1, 2]]
+utf8 = false
+unescape = true
+
+
+[[tests]]
+name = "null-bytes1"
+regex = '[^\x00]+\x00'
+input = 'foo\x00'
+matches = [[0, 4]]
+unicode = false
+utf8 = false
+unescape = true
+
+
+[[tests]]
+name = "word-ascii"
+regex = '\w+'
+input = "aδ"
+matches = [[0, 1]]
+unicode = false
+
+[[tests]]
+name = "word-unicode"
+regex = '\w+'
+input = "aδ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "decimal-ascii"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 1], [7, 8]]
+unicode = false
+
+[[tests]]
+name = "decimal-unicode"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 8]]
+
+[[tests]]
+name = "space-ascii"
+regex = '\s+'
+input = " \u1680"
+matches = [[0, 1]]
+unicode = false
+
+[[tests]]
+name = "space-unicode"
+regex = '\s+'
+input = " \u1680"
+matches = [[0, 4]]
+
+
+[[tests]]
+# See: https://github.com/rust-lang/regex/issues/484
+name = "iter1-bytes"
+regex = ''
+input = "☃"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+utf8 = false
+
+[[tests]]
+# See: https://github.com/rust-lang/regex/issues/484
+name = "iter1-utf8"
+regex = ''
+input = "☃"
+matches = [[0, 0], [3, 3]]
+
+[[tests]]
+# See: https://github.com/rust-lang/regex/issues/484
+# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8.
+name = "iter2-bytes"
+regex = ''
+input = 'b\xFFr'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+unescape = true
+utf8 = false
diff --git a/vendor/regex-automata/tests/data/overlapping.toml b/vendor/regex-automata/tests/data/overlapping.toml
new file mode 100644
index 000000000..6662876b4
--- /dev/null
+++ b/vendor/regex-automata/tests/data/overlapping.toml
@@ -0,0 +1,126 @@
+[[tests]]
+name = "repetition-plus-leftmost-first-100"
+regex = 'a+'
+input = "aaa"
+matches = [[0, 1], [0, 2], [0, 3]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-plus-all-100"
+regex = 'a+'
+input = "aaa"
+matches = [[0, 1], [0, 2], [0, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-plus-leftmost-first-200"
+regex = '(abc)+'
+input = "zzabcabczzabc"
+matches = [[2, 5], [2, 8]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-plus-all-200"
+regex = '(abc)+'
+input = "zzabcabczzabc"
+matches = [[2, 5], [2, 8], [10, 13]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-star-leftmost-first-100"
+regex = 'a*'
+input = "aaa"
+matches = [[0, 0], [0, 1], [0, 2], [0, 3]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-star-all-100"
+regex = 'a*'
+input = "aaa"
+matches = [[0, 0], [0, 1], [0, 2], [0, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-star-leftmost-first-200"
+regex = '(abc)*'
+input = "zzabcabczzabc"
+matches = [[0, 0]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
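+# The matches below appear to be reported in order of their ending offset,
+# which is why the non-empty matches are interleaved with the empty ones.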
+[[tests]]
+name = "repetition-star-all-200"
+regex = '(abc)*'
+input = "zzabcabczzabc"
+matches = [
+ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4],
+ [2, 5],
+ [6, 6], [7, 7],
+ [2, 8],
+ [9, 9], [10, 10], [11, 11], [12, 12],
+ [10, 13],
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "start-end-rep-leftmost-first"
+regex = '(^$)*'
+input = "abc"
+matches = [[0, 0]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "start-end-rep-all"
+regex = '(^$)*'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "alt-leftmost-first-100"
+regex = 'abc|a'
+input = "zzabcazzaabc"
+matches = [[2, 3], [2, 5]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "alt-all-100"
+regex = 'abc|a'
+input = "zzabcazzaabc"
+matches = [[2, 3], [2, 5], [5, 6], [8, 9], [9, 10], [9, 12]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty-000"
+regex = ""
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty-alt-000"
+regex = "|b"
+input = "abc"
+matches = [[0, 0], [1, 1], [1, 2], [3, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty-alt-010"
+regex = "b|"
+input = "abc"
+matches = [[0, 0], [1, 1], [1, 2], [3, 3]]
+match_kind = "all"
+search_kind = "overlapping"
diff --git a/vendor/regex-automata/tests/data/regression.toml b/vendor/regex-automata/tests/data/regression.toml
new file mode 100644
index 000000000..6a4dbb151
--- /dev/null
+++ b/vendor/regex-automata/tests/data/regression.toml
@@ -0,0 +1,423 @@
+# See: https://github.com/rust-lang/regex/issues/48
+[[tests]]
+name = "invalid-regex-no-crash-100"
+regex = '(*)'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/48
+[[tests]]
+name = "invalid-regex-no-crash-200"
+regex = '(?:?)'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/48
+[[tests]]
+name = "invalid-regex-no-crash-300"
+regex = '(?)'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/48
+[[tests]]
+name = "invalid-regex-no-crash-400"
+regex = '*'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/75
+[[tests]]
+name = "unsorted-binary-search-100"
+regex = '(?i-u)[a_]+'
+input = "A_"
+matches = [[0, 2]]
+
+# See: https://github.com/rust-lang/regex/issues/75
+[[tests]]
+name = "unsorted-binary-search-200"
+regex = '(?i-u)[A_]+'
+input = "a_"
+matches = [[0, 2]]
+
+# See: https://github.com/rust-lang/regex/issues/76
+[[tests]]
+name = "unicode-case-lower-nocase-flag"
+regex = '(?i)\p{Ll}+'
+input = "ΛΘΓΔα"
+matches = [[0, 10]]
+
+# See: https://github.com/rust-lang/regex/issues/99
+[[tests]]
+name = "negated-char-class-100"
+regex = '(?i)[^x]'
+input = "x"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/99
+[[tests]]
+name = "negated-char-class-200"
+regex = '(?i)[^x]'
+input = "X"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/101
+[[tests]]
+name = "ascii-word-underscore"
+regex = '[[:word:]]'
+input = "_"
+matches = [[0, 1]]
+
+# See: https://github.com/rust-lang/regex/issues/129
+[[tests]]
+name = "captures-repeat"
+regex = '([a-f]){2}(?P<foo>[x-z])'
+input = "abx"
+captures = [
+ [[0, 3], [0, 2], [2, 3]],
+]
+
+# See: https://github.com/rust-lang/regex/issues/153
+[[tests]]
+name = "alt-in-alt-100"
+regex = 'ab?|$'
+input = "az"
+matches = [[0, 1], [2, 2]]
+
+# See: https://github.com/rust-lang/regex/issues/153
+[[tests]]
+name = "alt-in-alt-200"
+regex = '^(.*?)(\n|\r\n?|$)'
+input = "ab\rcd"
+matches = [[0, 3]]
+
+# See: https://github.com/rust-lang/regex/issues/169
+[[tests]]
+name = "leftmost-first-prefix"
+regex = 'z*azb'
+input = "azb"
+matches = [[0, 3]]
+
+# See: https://github.com/rust-lang/regex/issues/191
+[[tests]]
+name = "many-alternates"
+regex = '1|2|3|4|5|6|7|8|9|10|int'
+input = "int"
+matches = [[0, 3]]
+
+# See: https://github.com/rust-lang/regex/issues/204
+[[tests]]
+name = "word-boundary-alone-100"
+regex = '\b'
+input = "Should this (work?)"
+matches = [[0, 0], [6, 6], [7, 7], [11, 11], [13, 13], [17, 17]]
+
+# See: https://github.com/rust-lang/regex/issues/204
+[[tests]]
+name = "word-boundary-alone-200"
+regex = '\b'
+input = "a b c"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+# See: https://github.com/rust-lang/regex/issues/264
+[[tests]]
+name = "word-boundary-ascii-no-capture"
+regex = '\B'
+input = "\U00028F3E"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/264
+[[tests]]
+name = "word-boundary-ascii-capture"
+regex = '(\B)'
+input = "\U00028F3E"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/268
+[[tests]]
+name = "partial-anchor"
+regex = '^a|b'
+input = "ba"
+matches = [[0, 1]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "endl-or-word-boundary"
+regex = '(?m:$)|(?-u:\b)'
+input = "\U0006084E"
+matches = [[4, 4]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "zero-or-end"
+regex = '(?i-u:\x00)|$'
+input = "\U000E682F"
+matches = [[4, 4]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "y-or-endl"
+regex = '(?i-u:y)|(?m:$)'
+input = "\U000B4331"
+matches = [[4, 4]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "word-boundary-start-x"
+regex = '(?u:\b)^(?-u:X)'
+input = "X"
+matches = [[0, 1]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "word-boundary-ascii-start-x"
+regex = '(?-u:\b)^(?-u:X)'
+input = "X"
+matches = [[0, 1]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "end-not-word-boundary"
+regex = '$\B'
+input = "\U0005C124\U000B576C"
+matches = [[8, 8]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/280
+[[tests]]
+name = "partial-anchor-alternate-begin"
+regex = '^a|z'
+input = "yyyyya"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/280
+[[tests]]
+name = "partial-anchor-alternate-end"
+regex = 'a$|z'
+input = "ayyyyy"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/289
+[[tests]]
+name = "lits-unambiguous-100"
+regex = '(ABC|CDA|BC)X'
+input = "CDAX"
+matches = [[0, 4]]
+
+# See: https://github.com/rust-lang/regex/issues/291
+[[tests]]
+name = "lits-unambiguous-200"
+regex = '((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$'
+input = "CIMG2341"
+captures = [
+ [[0, 8], [0, 4], [], [0, 4], [4, 8]],
+]
+
+# See: https://github.com/rust-lang/regex/issues/303
+[[tests]]
+name = "negated-full-byte-range"
+regex = '[^\x00-\xFF]'
+input = ""
+matches = []
+compiles = false
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/321
+[[tests]]
+name = "strange-anchor-non-complete-prefix"
+regex = 'a^{2}'
+input = ""
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/321
+[[tests]]
+name = "strange-anchor-non-complete-suffix"
+regex = '${2}a'
+input = ""
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/334
+# See: https://github.com/rust-lang/regex/issues/557
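+#
+# The optional group can't participate in the match here: after 'a', 'b*'
+# must be followed immediately by 'X' or the end of the haystack, but a 'c'
+# intervenes before the 'X'. So the overall match is just 'a' at [0, 1].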
+[[tests]]
+name = "captures-after-dfa-premature-end-100"
+regex = 'a(b*(X|$))?'
+input = "abcbX"
+captures = [
+ [[0, 1], [], []],
+]
+
+# See: https://github.com/rust-lang/regex/issues/334
+# See: https://github.com/rust-lang/regex/issues/557
+[[tests]]
+name = "captures-after-dfa-premature-end-200"
+regex = 'a(bc*(X|$))?'
+input = "abcbX"
+captures = [
+ [[0, 1], [], []],
+]
+
+# See: https://github.com/rust-lang/regex/issues/334
+# See: https://github.com/rust-lang/regex/issues/557
+[[tests]]
+name = "captures-after-dfa-premature-end-300"
+regex = '(aa$)?'
+input = "aaz"
+captures = [
+ [[0, 0]],
+ [[1, 1]],
+ [[2, 2]],
+ [[3, 3]],
+]
+
+# See: https://github.com/rust-lang/regex/issues/437
+[[tests]]
+name = "literal-panic"
+regex = 'typename type\-parameter\-[0-9]+\-[0-9]+::.+'
+input = "test"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/527
+[[tests]]
+name = "empty-flag-expr"
+regex = '(((?x)))'
+input = ""
+matches = [[0, 0]]
+
+# See: https://github.com/rust-lang/regex/issues/533
+[[tests]]
+name = "blank-matches-nothing-between-space-and-tab"
+regex = '[[:blank:]]'
+input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
+match = false
+unescape = true
+
+# See: https://github.com/rust-lang/regex/issues/533
+[[tests]]
+name = "blank-matches-nothing-between-space-and-tab-inverted"
+regex = '^[[:^blank:]]+$'
+input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
+match = true
+unescape = true
+
+# See: https://github.com/rust-lang/regex/issues/555
+[[tests]]
+name = "invalid-repetition"
+regex = '(?m){1,1}'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/640
+[[tests]]
+name = "flags-are-unset"
+regex = '((?i)foo)|Bar'
+input = "foo Foo bar Bar"
+matches = [[0, 3], [4, 7], [12, 15]]
+
+# Note that 'Ј' is not 'j', but Cyrillic Je
+# https://en.wikipedia.org/wiki/Je_(Cyrillic)
+#
+# See: https://github.com/rust-lang/regex/issues/659
+[[tests]]
+name = "empty-group-with-unicode"
+regex = '()Ј01'
+input = 'zЈ01'
+matches = [[1, 5]]
+
+# See: https://github.com/rust-lang/regex/issues/579
+[[tests]]
+name = "word-boundary-weird"
+regex = '\b..\b'
+input = "I have 12, he has 2!"
+matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
+
+# See: https://github.com/rust-lang/regex/issues/579
+[[tests]]
+name = "word-boundary-weird-ascii"
+regex = '\b..\b'
+input = "I have 12, he has 2!"
+matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/579
+[[tests]]
+name = "word-boundary-weird-minimal-ascii"
+regex = '\b..\b'
+input = "az,,b"
+matches = [[0, 2], [2, 4]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/BurntSushi/ripgrep/issues/1203
+[[tests]]
+name = "reverse-suffix-100"
+regex = '[0-4][0-4][0-4]000'
+input = "153.230000"
+matches = [[4, 10]]
+
+# See: https://github.com/BurntSushi/ripgrep/issues/1203
+[[tests]]
+name = "reverse-suffix-200"
+regex = '[0-9][0-9][0-9]000'
+input = "153.230000\n"
+matches = [[4, 10]]
+
+# See: https://github.com/BurntSushi/ripgrep/issues/1247
+[[tests]]
+name = "stops"
+regex = '\bs(?:[ab])'
+input = 's\xE4'
+matches = []
+unescape = true
+
+# See: https://github.com/BurntSushi/ripgrep/issues/1247
+[[tests]]
+name = "stops-ascii"
+regex = '(?-u:\b)s(?:[ab])'
+input = 's\xE4'
+matches = []
+unescape = true
+
+# There is no issue for this bug.
+[[tests]]
+name = "anchored-prefix-100"
+regex = '^a[[:^space:]]'
+input = "a "
+matches = []
+
+# There is no issue for this bug.
+[[tests]]
+name = "anchored-prefix-200"
+regex = '^a[[:^space:]]'
+input = "foo boo a"
+matches = []
+
+# There is no issue for this bug.
+[[tests]]
+name = "anchored-prefix-300"
+regex = '^-[a-z]'
+input = "r-f"
+matches = []
+
+# Tests that a possible Aho-Corasick optimization works correctly. It only
+# kicks in when we have a lot of literals. By "works correctly," we mean that
+# leftmost-first match semantics are properly respected. That is, samwise
+# should match, not sam.
+#
+# There is no issue for this bug.
+[[tests]]
+name = "aho-corasick-100"
+regex = 'samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z'
+input = "samwise"
+matches = [[0, 7]]
diff --git a/vendor/regex-automata/tests/data/set.toml b/vendor/regex-automata/tests/data/set.toml
new file mode 100644
index 000000000..e0eb0583e
--- /dev/null
+++ b/vendor/regex-automata/tests/data/set.toml
@@ -0,0 +1,523 @@
+[[tests]]
+name = "basic10"
+regexes = ["a", "a"]
+input = "a"
+matches = [
+ { id = 0, offsets = [0, 1] },
+ { id = 1, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic10-leftmost-first"
+regexes = ["a", "a"]
+input = "a"
+matches = [
+ { id = 0, offsets = [0, 1] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "basic20"
+regexes = ["a", "a"]
+input = "ba"
+matches = [
+ { id = 0, offsets = [1, 2] },
+ { id = 1, offsets = [1, 2] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic30"
+regexes = ["a", "b"]
+input = "a"
+matches = [
+ { id = 0, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic40"
+regexes = ["a", "b"]
+input = "b"
+matches = [
+ { id = 1, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic50"
+regexes = ["a|b", "b|a"]
+input = "b"
+matches = [
+ { id = 0, offsets = [0, 1] },
+ { id = 1, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic60"
+regexes = ["foo", "oo"]
+input = "foo"
+matches = [
+ { id = 0, offsets = [0, 3] },
+ { id = 1, offsets = [1, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic60-leftmost-first"
+regexes = ["foo", "oo"]
+input = "foo"
+matches = [
+ { id = 0, offsets = [0, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "basic61"
+regexes = ["oo", "foo"]
+input = "foo"
+matches = [
+ { id = 1, offsets = [0, 3] },
+ { id = 0, offsets = [1, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic61-leftmost-first"
+regexes = ["oo", "foo"]
+input = "foo"
+matches = [
+ { id = 1, offsets = [0, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "basic70"
+regexes = ["abcd", "bcd", "cd", "d"]
+input = "abcd"
+matches = [
+ { id = 0, offsets = [0, 4] },
+ { id = 1, offsets = [1, 4] },
+ { id = 2, offsets = [2, 4] },
+ { id = 3, offsets = [3, 4] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic71"
+regexes = ["bcd", "cd", "d", "abcd"]
+input = "abcd"
+matches = [
+ { id = 3, offsets = [0, 4] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "basic80"
+regexes = ["^foo", "bar$"]
+input = "foo"
+matches = [
+ { id = 0, offsets = [0, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic81"
+regexes = ["^foo", "bar$"]
+input = "foo bar"
+matches = [
+ { id = 0, offsets = [0, 3] },
+ { id = 1, offsets = [4, 7] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic82"
+regexes = ["^foo", "bar$"]
+input = "bar"
+matches = [
+ { id = 1, offsets = [0, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic90"
+regexes = ["[a-z]+$", "foo"]
+input = "01234 foo"
+matches = [
+ { id = 0, offsets = [6, 9] },
+ { id = 1, offsets = [6, 9] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic91"
+regexes = ["[a-z]+$", "foo"]
+input = "foo 01234"
+matches = [
+ { id = 1, offsets = [0, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic100"
+regexes = [".*?", "a"]
+input = "zzza"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [0, 1] },
+ { id = 0, offsets = [0, 2] },
+ { id = 0, offsets = [0, 3] },
+ { id = 0, offsets = [0, 4] },
+ { id = 1, offsets = [3, 4] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
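+# basic101 is identical to basic100, except the lazy '.*?' is replaced with
+# the greedy '.*'. Under "all" match semantics, greediness has no effect, so
+# the expected matches are the same.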
+[[tests]]
+name = "basic101"
+regexes = [".*", "a"]
+input = "zzza"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [0, 1] },
+ { id = 0, offsets = [0, 2] },
+ { id = 0, offsets = [0, 3] },
+ { id = 0, offsets = [0, 4] },
+ { id = 1, offsets = [3, 4] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic102"
+regexes = [".*", "a"]
+input = "zzz"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [0, 1] },
+ { id = 0, offsets = [0, 2] },
+ { id = 0, offsets = [0, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic110"
+regexes = ['\ba\b']
+input = "hello a bye"
+matches = [
+ { id = 0, offsets = [6, 7] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic111"
+regexes = ['\ba\b', '\be\b']
+input = "hello a bye e"
+matches = [
+ { id = 0, offsets = [6, 7] },
+ { id = 1, offsets = [12, 13] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic120"
+regexes = ["a"]
+input = "a"
+matches = [
+ { id = 0, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic121"
+regexes = [".*a"]
+input = "a"
+matches = [
+ { id = 0, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic122"
+regexes = [".*a", "β"]
+input = "β"
+matches = [
+ { id = 1, offsets = [0, 2] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic130"
+regexes = ["ab", "b"]
+input = "ba"
+matches = [
+ { id = 1, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty10"
+regexes = ["", "a"]
+input = "abc"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 1, offsets = [0, 1] },
+ { id = 0, offsets = [1, 1] },
+ { id = 0, offsets = [2, 2] },
+ { id = 0, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty10-leftmost-first"
+regexes = ["", "a"]
+input = "abc"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [1, 1] },
+ { id = 0, offsets = [2, 2] },
+ { id = 0, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty11"
+regexes = ["a", ""]
+input = "abc"
+matches = [
+ { id = 1, offsets = [0, 0] },
+ { id = 0, offsets = [0, 1] },
+ { id = 1, offsets = [1, 1] },
+ { id = 1, offsets = [2, 2] },
+ { id = 1, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty11-leftmost-first"
+regexes = ["a", ""]
+input = "abc"
+matches = [
+ { id = 0, offsets = [0, 1] },
+ { id = 1, offsets = [2, 2] },
+ { id = 1, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty20"
+regexes = ["", "b"]
+input = "abc"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [1, 1] },
+ { id = 1, offsets = [1, 2] },
+ { id = 0, offsets = [2, 2] },
+ { id = 0, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty20-leftmost-first"
+regexes = ["", "b"]
+input = "abc"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [1, 1] },
+ { id = 0, offsets = [2, 2] },
+ { id = 0, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty21"
+regexes = ["b", ""]
+input = "abc"
+matches = [
+ { id = 1, offsets = [0, 0] },
+ { id = 1, offsets = [1, 1] },
+ { id = 0, offsets = [1, 2] },
+ { id = 1, offsets = [2, 2] },
+ { id = 1, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty21-leftmost-first"
+regexes = ["b", ""]
+input = "abc"
+matches = [
+ { id = 1, offsets = [0, 0] },
+ { id = 0, offsets = [1, 2] },
+ { id = 1, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty22"
+regexes = ["(?:)", "b"]
+input = "abc"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [1, 1] },
+ { id = 1, offsets = [1, 2] },
+ { id = 0, offsets = [2, 2] },
+ { id = 0, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty23"
+regexes = ["b", "(?:)"]
+input = "abc"
+matches = [
+ { id = 1, offsets = [0, 0] },
+ { id = 1, offsets = [1, 1] },
+ { id = 0, offsets = [1, 2] },
+ { id = 1, offsets = [2, 2] },
+ { id = 1, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty30"
+regexes = ["", "z"]
+input = "abc"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [1, 1] },
+ { id = 0, offsets = [2, 2] },
+ { id = 0, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty30-leftmost-first"
+regexes = ["", "z"]
+input = "abc"
+matches = [
+ { id = 0, offsets = [0, 0] },
+ { id = 0, offsets = [1, 1] },
+ { id = 0, offsets = [2, 2] },
+ { id = 0, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty31"
+regexes = ["z", ""]
+input = "abc"
+matches = [
+ { id = 1, offsets = [0, 0] },
+ { id = 1, offsets = [1, 1] },
+ { id = 1, offsets = [2, 2] },
+ { id = 1, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty31-leftmost-first"
+regexes = ["z", ""]
+input = "abc"
+matches = [
+ { id = 1, offsets = [0, 0] },
+ { id = 1, offsets = [1, 1] },
+ { id = 1, offsets = [2, 2] },
+ { id = 1, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty40"
+regexes = ["c(?:)", "b"]
+input = "abc"
+matches = [
+ { id = 1, offsets = [1, 2] },
+ { id = 0, offsets = [2, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty40-leftmost-first"
+regexes = ["c(?:)", "b"]
+input = "abc"
+matches = [
+ { id = 1, offsets = [1, 2] },
+ { id = 0, offsets = [2, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "nomatch10"
+regexes = ["a", "a"]
+input = "b"
+matches = []
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "nomatch20"
+regexes = ["^foo", "bar$"]
+input = "bar foo"
+matches = []
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "nomatch30"
+regexes = []
+input = "a"
+matches = []
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "nomatch40"
+regexes = ["^rooted$", '\.log$']
+input = "notrooted"
+matches = []
+match_kind = "all"
+search_kind = "overlapping"
diff --git a/vendor/regex-automata/tests/data/unicode.toml b/vendor/regex-automata/tests/data/unicode.toml
new file mode 100644
index 000000000..016bbfd9b
--- /dev/null
+++ b/vendor/regex-automata/tests/data/unicode.toml
@@ -0,0 +1,514 @@
+# Basic Unicode literal support.
+[[tests]]
+name = "literal1"
+regex = '☃'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "literal2"
+regex = '☃+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "literal3"
+regex = '(?i)☃+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "literal4"
+regex = '(?i)Δ'
+input = "δ"
+matches = [[0, 2]]
+
+# Unicode word boundaries.
+[[tests]]
+name = "wb-100"
+regex = '\d\b'
+input = "6δ"
+matches = []
+
+[[tests]]
+name = "wb-200"
+regex = '\d\b'
+input = "6 "
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb-300"
+regex = '\d\B'
+input = "6δ"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb-400"
+regex = '\d\B'
+input = "6 "
+matches = []
+
+# Unicode character class support.
+[[tests]]
+name = "class1"
+regex = '[☃Ⅰ]+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class2"
+regex = '\pN'
+input = "Ⅰ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class3"
+regex = '\pN+'
+input = "Ⅰ1Ⅱ2"
+matches = [[0, 8]]
+
+[[tests]]
+name = "class4"
+regex = '\PN+'
+input = "abⅠ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class5"
+regex = '[\PN]+'
+input = "abⅠ"
+matches = [[0, 2]]
+
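+# Double negation: '[^\PN]' is equivalent to '\pN', so only the Roman numeral
+# matches here.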
+[[tests]]
+name = "class6"
+regex = '[^\PN]+'
+input = "abⅠ"
+matches = [[2, 5]]
+
+[[tests]]
+name = "class7"
+regex = '\p{Lu}+'
+input = "ΛΘΓΔα"
+matches = [[0, 8]]
+
+[[tests]]
+name = "class8"
+regex = '(?i)\p{Lu}+'
+input = "ΛΘΓΔα"
+matches = [[0, 10]]
+
+[[tests]]
+name = "class9"
+regex = '\pL+'
+input = "ΛΘΓΔα"
+matches = [[0, 10]]
+
+[[tests]]
+name = "class10"
+regex = '\p{Ll}+'
+input = "ΛΘΓΔα"
+matches = [[8, 10]]
+
+# Unicode aware "Perl" character classes.
+[[tests]]
+name = "perl1"
+regex = '\w+'
+input = "dδd"
+matches = [[0, 4]]
+
+[[tests]]
+name = "perl2"
+regex = '\w+'
+input = "⥡"
+matches = []
+
+[[tests]]
+name = "perl3"
+regex = '\W+'
+input = "⥡"
+matches = [[0, 3]]
+
+[[tests]]
+name = "perl4"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 8]]
+
+[[tests]]
+name = "perl5"
+regex = '\d+'
+input = "Ⅱ"
+matches = []
+
+[[tests]]
+name = "perl6"
+regex = '\D+'
+input = "Ⅱ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "perl7"
+regex = '\s+'
+input = " "
+matches = [[0, 3]]
+
+[[tests]]
+name = "perl8"
+regex = '\s+'
+input = "☃"
+matches = []
+
+[[tests]]
+name = "perl9"
+regex = '\S+'
+input = "☃"
+matches = [[0, 3]]
+
+# Specific tests for Unicode general category classes.
+[[tests]]
+name = "class-gencat1"
+regex = '\p{Cased_Letter}'
+input = "A"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat2"
+regex = '\p{Close_Punctuation}'
+input = "❯"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat3"
+regex = '\p{Connector_Punctuation}'
+input = "⁀"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat4"
+regex = '\p{Control}'
+input = "\u009F"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-gencat5"
+regex = '\p{Currency_Symbol}'
+input = "£"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat6"
+regex = '\p{Dash_Punctuation}'
+input = "〰"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat7"
+regex = '\p{Decimal_Number}'
+input = "𑓙"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat8"
+regex = '\p{Enclosing_Mark}'
+input = "\uA672"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat9"
+regex = '\p{Final_Punctuation}'
+input = "⸡"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat10"
+regex = '\p{Format}'
+input = "\U000E007F"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat11"
+regex = '\p{Initial_Punctuation}'
+input = "⸜"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat12"
+regex = '\p{Letter}'
+input = "Έ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-gencat13"
+regex = '\p{Letter_Number}'
+input = "ↂ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat14"
+regex = '\p{Line_Separator}'
+input = "\u2028"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat15"
+regex = '\p{Lowercase_Letter}'
+input = "ϛ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-gencat16"
+regex = '\p{Mark}'
+input = "\U000E01EF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat17"
+regex = '\p{Math}'
+input = "⋿"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat18"
+regex = '\p{Modifier_Letter}'
+input = "𖭃"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat19"
+regex = '\p{Modifier_Symbol}'
+input = "🏿"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat20"
+regex = '\p{Nonspacing_Mark}'
+input = "\U0001E94A"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat21"
+regex = '\p{Number}'
+input = "⓿"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat22"
+regex = '\p{Open_Punctuation}'
+input = "⦅"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat23"
+regex = '\p{Other}'
+input = "\u0BC9"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat24"
+regex = '\p{Other_Letter}'
+input = "ꓷ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat25"
+regex = '\p{Other_Number}'
+input = "㉏"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat26"
+regex = '\p{Other_Punctuation}'
+input = "𞥞"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat27"
+regex = '\p{Other_Symbol}'
+input = "⅌"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat28"
+regex = '\p{Paragraph_Separator}'
+input = "\u2029"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat29"
+regex = '\p{Private_Use}'
+input = "\U0010FFFD"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat30"
+regex = '\p{Punctuation}'
+input = "𑁍"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat31"
+regex = '\p{Separator}'
+input = "\u3000"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat32"
+regex = '\p{Space_Separator}'
+input = "\u205F"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat33"
+regex = '\p{Spacing_Mark}'
+input = "\U00016F7E"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat34"
+regex = '\p{Symbol}'
+input = "⯈"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat35"
+regex = '\p{Titlecase_Letter}'
+input = "ῼ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat36"
+regex = '\p{Unassigned}'
+input = "\U0010FFFF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat37"
+regex = '\p{Uppercase_Letter}'
+input = "Ꝋ"
+matches = [[0, 3]]
+
+
+# Tests for Unicode emoji properties.
+[[tests]]
+name = "class-emoji1"
+regex = '\p{Emoji}'
+input = "\u23E9"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-emoji2"
+regex = '\p{emoji}'
+input = "\U0001F21A"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-emoji3"
+regex = '\p{extendedpictographic}'
+input = "\U0001FA6E"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-emoji4"
+regex = '\p{extendedpictographic}'
+input = "\U0001FFFD"
+matches = [[0, 4]]
+
+
+# Tests for Unicode grapheme cluster properties.
+[[tests]]
+name = "class-gcb1"
+regex = '\p{grapheme_cluster_break=prepend}'
+input = "\U00011D46"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gcb2"
+regex = '\p{gcb=regional_indicator}'
+input = "\U0001F1E6"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gcb3"
+regex = '\p{gcb=ri}'
+input = "\U0001F1E7"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gcb4"
+regex = '\p{regionalindicator}'
+input = "\U0001F1FF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gcb5"
+regex = '\p{gcb=lvt}'
+input = "\uC989"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gcb6"
+regex = '\p{gcb=zwj}'
+input = "\u200D"
+matches = [[0, 3]]
+
+# Tests for Unicode word boundary properties.
+[[tests]]
+name = "class-word-break1"
+regex = '\p{word_break=Hebrew_Letter}'
+input = "\uFB46"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-word-break2"
+regex = '\p{wb=hebrewletter}'
+input = "\uFB46"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-word-break3"
+regex = '\p{wb=ExtendNumLet}'
+input = "\uFF3F"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-word-break4"
+regex = '\p{wb=WSegSpace}'
+input = "\u3000"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-word-break5"
+regex = '\p{wb=numeric}'
+input = "\U0001E950"
+matches = [[0, 4]]
+
+# Tests for Unicode sentence boundary properties.
+[[tests]]
+name = "class-sentence-break1"
+regex = '\p{sentence_break=Lower}'
+input = "\u0469"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-sentence-break2"
+regex = '\p{sb=lower}'
+input = "\u0469"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-sentence-break3"
+regex = '\p{sb=Close}'
+input = "\uFF60"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-sentence-break4"
+regex = '\p{sb=Close}'
+input = "\U0001F677"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-sentence-break5"
+regex = '\p{sb=SContinue}'
+input = "\uFF64"
+matches = [[0, 3]]
diff --git a/vendor/regex-automata/tests/data/word-boundary.toml b/vendor/regex-automata/tests/data/word-boundary.toml
new file mode 100644
index 000000000..e84b25c2a
--- /dev/null
+++ b/vendor/regex-automata/tests/data/word-boundary.toml
@@ -0,0 +1,771 @@
+# Some of these are cribbed from RE2's test suite.
+
+# These test \b. Below are tests for \B.
+[[tests]]
+name = "wb1"
+regex = '\b'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb2"
+regex = '\b'
+input = "a"
+matches = [[0, 0], [1, 1]]
+unicode = false
+
+[[tests]]
+name = "wb3"
+regex = '\b'
+input = "ab"
+matches = [[0, 0], [2, 2]]
+unicode = false
+
+[[tests]]
+name = "wb4"
+regex = '^\b'
+input = "ab"
+matches = [[0, 0]]
+unicode = false
+
+[[tests]]
+name = "wb5"
+regex = '\b$'
+input = "ab"
+matches = [[2, 2]]
+unicode = false
+
+[[tests]]
+name = "wb6"
+regex = '^\b$'
+input = "ab"
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb7"
+regex = '\bbar\b'
+input = "nobar bar foo bar"
+matches = [[6, 9], [14, 17]]
+unicode = false
+
+[[tests]]
+name = "wb8"
+regex = 'a\b'
+input = "faoa x"
+matches = [[3, 4]]
+unicode = false
+
+[[tests]]
+name = "wb9"
+regex = '\bbar'
+input = "bar x"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb10"
+regex = '\bbar'
+input = "foo\nbar x"
+matches = [[4, 7]]
+unicode = false
+
+[[tests]]
+name = "wb11"
+regex = 'bar\b'
+input = "foobar"
+matches = [[3, 6]]
+unicode = false
+
+[[tests]]
+name = "wb12"
+regex = 'bar\b'
+input = "foobar\nxxx"
+matches = [[3, 6]]
+unicode = false
+
+[[tests]]
+name = "wb13"
+regex = '(foo|bar|[A-Z])\b'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb14"
+regex = '(foo|bar|[A-Z])\b'
+input = "foo\n"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb15"
+regex = '\b(foo|bar|[A-Z])'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb16"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "X"
+matches = [[0, 1]]
+unicode = false
+
+[[tests]]
+name = "wb17"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "XY"
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb18"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "bar"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb19"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb20"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "foo\n"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb21"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "ffoo bbar N x"
+matches = [[10, 11]]
+unicode = false
+
+[[tests]]
+name = "wb22"
+regex = '\b(fo|foo)\b'
+input = "fo"
+matches = [[0, 2]]
+unicode = false
+
+[[tests]]
+name = "wb23"
+regex = '\b(fo|foo)\b'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb24"
+regex = '\b\b'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb25"
+regex = '\b\b'
+input = "a"
+matches = [[0, 0], [1, 1]]
+unicode = false
+
+[[tests]]
+name = "wb26"
+regex = '\b$'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb27"
+regex = '\b$'
+input = "x"
+matches = [[1, 1]]
+unicode = false
+
+[[tests]]
+name = "wb28"
+regex = '\b$'
+input = "y x"
+matches = [[3, 3]]
+unicode = false
+
+[[tests]]
+name = "wb29"
+regex = '(?-u:\b).$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb30"
+regex = '^\b(fo|foo)\b'
+input = "fo"
+matches = [[0, 2]]
+unicode = false
+
+[[tests]]
+name = "wb31"
+regex = '^\b(fo|foo)\b'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb32"
+regex = '^\b$'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb33"
+regex = '^\b$'
+input = "x"
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb34"
+regex = '^(?-u:\b).$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb35"
+regex = '^(?-u:\b).(?-u:\b)$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb36"
+regex = '^^^^^\b$$$$$'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb37"
+regex = '^^^^^(?-u:\b).$$$$$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb38"
+regex = '^^^^^\b$$$$$'
+input = "x"
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb39"
+regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb40"
+regex = '(?-u:\b).+(?-u:\b)'
+input = "$$abc$$"
+matches = [[2, 5]]
+
+[[tests]]
+name = "wb41"
+regex = '\b'
+input = "a b c"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+unicode = false
+
+[[tests]]
+name = "wb42"
+regex = '\bfoo\b'
+input = "zzz foo zzz"
+matches = [[4, 7]]
+unicode = false
+
+[[tests]]
+name = "wb43"
+regex = '\b^'
+input = "ab"
+matches = [[0, 0]]
+unicode = false
+
+[[tests]]
+name = "wb44"
+regex = '$\b'
+input = "ab"
+matches = [[2, 2]]
+unicode = false
+
+
+# Tests for \B. Note that \B is not allowed when UTF-8 mode is enabled, so we
+# have to disable it for most of these tests. This is because \B can match at
+# positions that split a codepoint, i.e., at non-UTF-8 boundaries.
+[[tests]]
+name = "nb1"
+regex = '\Bfoo\B'
+input = "n foo xfoox that"
+matches = [[7, 10]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb2"
+regex = 'a\B'
+input = "faoa x"
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb3"
+regex = '\Bbar'
+input = "bar x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb4"
+regex = '\Bbar'
+input = "foo\nbar x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb5"
+regex = 'bar\B'
+input = "foobar"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb6"
+regex = 'bar\B'
+input = "foobar\nxxx"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb7"
+regex = '(foo|bar|[A-Z])\B'
+input = "foox"
+matches = [[0, 3]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb8"
+regex = '(foo|bar|[A-Z])\B'
+input = "foo\n"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb9"
+regex = '\B'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb10"
+regex = '\B'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb11"
+regex = '\B(foo|bar|[A-Z])'
+input = "foo"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb12"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "xXy"
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb13"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "XY"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb14"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "XYZ"
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb15"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "abara"
+matches = [[1, 4]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb16"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "xfoo_"
+matches = [[1, 4]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb17"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "xfoo\n"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb18"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "foo bar vNX"
+matches = [[9, 10]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb19"
+regex = '\B(fo|foo)\B'
+input = "xfoo"
+matches = [[1, 3]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb20"
+regex = '\B(foo|fo)\B'
+input = "xfooo"
+matches = [[1, 4]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb21"
+regex = '\B\B'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb22"
+regex = '\B\B'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb23"
+regex = '\B$'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb24"
+regex = '\B$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb25"
+regex = '\B$'
+input = "y x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb26"
+regex = '\B.$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb27"
+regex = '^\B(fo|foo)\B'
+input = "fo"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb28"
+regex = '^\B(fo|foo)\B'
+input = "fo"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb29"
+regex = '^\B'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb30"
+regex = '^\B'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb31"
+regex = '^\B\B'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb32"
+regex = '^\B\B'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb33"
+regex = '^\B$'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb34"
+regex = '^\B$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb35"
+regex = '^\B.$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb36"
+regex = '^\B.\B$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb37"
+regex = '^^^^^\B$$$$$'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb38"
+regex = '^^^^^\B.$$$$$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb39"
+regex = '^^^^^\B$$$$$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+
+# unicode1* and unicode2* work for both Unicode and ASCII because all matches
+# are reported as byte offsets, and « and » are not word characters at
+# either the codepoint or byte level.
+[[tests]]
+name = "unicode1"
+regex = '\bx\b'
+input = "«x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "unicode1-only-ascii"
+regex = '\bx\b'
+input = "«x"
+matches = [[2, 3]]
+unicode = false
+
+[[tests]]
+name = "unicode2"
+regex = '\bx\b'
+input = "x»"
+matches = [[0, 1]]
+
+[[tests]]
+name = "unicode2-only-ascii"
+regex = '\bx\b'
+input = "x»"
+matches = [[0, 1]]
+unicode = false
+
+# ASCII word boundaries are completely oblivious to Unicode characters, so
+# even though β is a letter, an ASCII \b treats its bytes as non-word bytes
+# and thus sees a word boundary next to 'x'. (The ASCII \b only ever looks at
+# β's leading byte.) For Unicode \b and \B, the results are precisely
+# inverted.
+[[tests]]
+name = "unicode3"
+regex = '\bx\b'
+input = 'áxβ'
+matches = []
+
+[[tests]]
+name = "unicode3-only-ascii"
+regex = '\bx\b'
+input = 'áxβ'
+matches = [[2, 3]]
+unicode = false
+
+[[tests]]
+name = "unicode4"
+regex = '\Bx\B'
+input = 'áxβ'
+matches = [[2, 3]]
+
+[[tests]]
+name = "unicode4-only-ascii"
+regex = '\Bx\B'
+input = 'áxβ'
+matches = []
+unicode = false
+utf8 = false
+
+# The same as above, but with \b instead of \B as a sanity check.
+[[tests]]
+name = "unicode5"
+regex = '\b'
+input = "0\U0007EF5E"
+matches = [[0, 0], [1, 1]]
+
+[[tests]]
+name = "unicode5-only-ascii"
+regex = '\b'
+input = "0\U0007EF5E"
+matches = [[0, 0], [1, 1]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "unicode5-noutf8"
+regex = '\b'
+input = '0\xFF\xFF\xFF\xFF'
+matches = [[0, 0], [1, 1]]
+unescape = true
+utf8 = false
+
+[[tests]]
+name = "unicode5-noutf8-only-ascii"
+regex = '\b'
+input = '0\xFF\xFF\xFF\xFF'
+matches = [[0, 0], [1, 1]]
+unescape = true
+unicode = false
+utf8 = false
+
+# Weird special case to ensure that ASCII \B treats each individual code unit
+# as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary
+# codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the
+# \w character class.)
+[[tests]]
+name = "unicode5-not"
+regex = '\B'
+input = "0\U0007EF5E"
+matches = [[5, 5]]
+
+[[tests]]
+name = "unicode5-not-only-ascii"
+regex = '\B'
+input = "0\U0007EF5E"
+matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
+unicode = false
+utf8 = false
+
+# This gets no matches since a Unicode-aware \B only matches at positions
+# surrounded by valid UTF-8, even when UTF-8 mode is disabled.
+[[tests]]
+name = "unicode5-not-noutf8"
+regex = '\B'
+input = '0\xFF\xFF\xFF\xFF'
+matches = []
+unescape = true
+utf8 = false
+
+# But this DOES get matches since \B in ASCII mode only looks at individual
+# bytes.
+[[tests]]
+name = "unicode5-not-noutf8-only-ascii"
+regex = '\B'
+input = '0\xFF\xFF\xFF\xFF'
+matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
+unescape = true
+unicode = false
+utf8 = false
+
+# Some tests of no particular significance.
+[[tests]]
+name = "unicode6"
+regex = '\b[0-9]+\b'
+input = "foo 123 bar 456 quux 789"
+matches = [[4, 7], [12, 15], [21, 24]]
+
+[[tests]]
+name = "unicode7"
+regex = '\b[0-9]+\b'
+input = "foo 123 bar a456 quux 789"
+matches = [[4, 7], [22, 25]]
+
+[[tests]]
+name = "unicode8"
+regex = '\b[0-9]+\b'
+input = "foo 123 bar 456a quux 789"
+matches = [[4, 7], [22, 25]]
diff --git a/vendor/regex-automata/tests/dfa/api.rs b/vendor/regex-automata/tests/dfa/api.rs
new file mode 100644
index 000000000..80d7d704c
--- /dev/null
+++ b/vendor/regex-automata/tests/dfa/api.rs
@@ -0,0 +1,133 @@
+use std::error::Error;
+
+use regex_automata::{
+ dfa::{dense, regex::Regex, Automaton, OverlappingState},
+ nfa::thompson,
+ HalfMatch, MatchError, MatchKind, MultiMatch,
+};
+
+use crate::util::{BunkPrefilter, SubstringPrefilter};
+
+// Tests that quit bytes in the forward direction work correctly.
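+//
+// A quit byte causes the DFA to give up and return an error carrying the
+// offset at which the quit byte was seen, which is why each assertion below
+// reports the position of 'x'.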
+#[test]
+fn quit_fwd() -> Result<(), Box<dyn Error>> {
+ let dfa = dense::Builder::new()
+ .configure(dense::Config::new().quit(b'x', true))
+ .build("[[:word:]]+$")?;
+
+ assert_eq!(
+ dfa.find_earliest_fwd(b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_leftmost_fwd(b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_overlapping_fwd(b"abcxyz", &mut OverlappingState::start()),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+
+ Ok(())
+}
+
+// Tests that quit bytes in the reverse direction work correctly.
+#[test]
+fn quit_rev() -> Result<(), Box<dyn Error>> {
+ let dfa = dense::Builder::new()
+ .configure(dense::Config::new().quit(b'x', true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build("^[[:word:]]+")?;
+
+ assert_eq!(
+ dfa.find_earliest_rev(b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_leftmost_rev(b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+
+ Ok(())
+}
+
+// Tests that if we heuristically enable Unicode word boundaries but then
+// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
+// will panic.
+#[test]
+#[should_panic]
+fn quit_panics() {
+ dense::Config::new().unicode_word_boundary(true).quit(b'\xFF', false);
+}
+
+// Tests that if we attempt an overlapping search using a regex without a
+// reverse DFA compiled with 'starts_for_each_pattern', then we get a panic.
+#[test]
+#[should_panic]
+fn incorrect_config_overlapping_search_panics() {
+ let forward = dense::DFA::new(r"abca").unwrap();
+ let reverse = dense::Builder::new()
+ .configure(
+ dense::Config::new()
+ .anchored(true)
+ .match_kind(MatchKind::All)
+ .starts_for_each_pattern(false),
+ )
+ .thompson(thompson::Config::new().reverse(true))
+ .build(r"abca")
+ .unwrap();
+
+ let re = Regex::builder().build_from_dfas(forward, reverse);
+ let haystack = "bar abcabcabca abca foo".as_bytes();
+ re.find_overlapping(haystack, &mut OverlappingState::start());
+}
+
+// This tests an interesting case where even if the Unicode word boundary
+// option is disabled, setting all non-ASCII bytes to be quit bytes will
+// cause Unicode word boundaries to be enabled.
+#[test]
+fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
+ let mut config = dense::Config::new();
+ for b in 0x80..=0xFF {
+ config = config.quit(b, true);
+ }
+ let dfa = dense::Builder::new().configure(config).build(r"\b")?;
+ let expected = HalfMatch::must(0, 1);
+ assert_eq!(dfa.find_leftmost_fwd(b" a"), Ok(Some(expected)));
+ Ok(())
+}
+
+// Tests that we can provide a prefilter to a Regex, and the search reports
+// correct results.
+#[test]
+fn prefilter_works() -> Result<(), Box<dyn Error>> {
+ let re = Regex::new(r"a[0-9]+")
+ .unwrap()
+ .with_prefilter(SubstringPrefilter::new("a"));
+ let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
+ let matches: Vec<(usize, usize)> =
+ re.find_leftmost_iter(text).map(|m| (m.start(), m.end())).collect();
+ assert_eq!(
+ matches,
+ vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
+ );
+ Ok(())
+}
+
+// This test confirms that a prefilter is active by using a prefilter that
+// reports false negatives.
+#[test]
+fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
+ let text = b"za123";
+ let re = Regex::new(r"a[0-9]+")
+ .unwrap()
+ .with_prefilter(SubstringPrefilter::new("a"));
+ assert_eq!(re.find_leftmost(b"za123"), Some(MultiMatch::must(0, 1, 5)));
+ assert_eq!(re.find_leftmost(b"a123"), Some(MultiMatch::must(0, 0, 4)));
+ let re = re.with_prefilter(BunkPrefilter::new());
+ assert_eq!(re.find_leftmost(b"za123"), None);
+ // This checks that the prefilter is used when first starting the search,
+ // instead of waiting until at least one transition has occurred.
+ assert_eq!(re.find_leftmost(b"a123"), None);
+ Ok(())
+}
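Quit bytes turn otherwise-infallible DFA searches into fallible ones, which is why every routine above returns a Result. A minimal sketch of how a caller might consume that error, using the same dense API the tests exercise; the pattern and haystack here are illustrative, not from the diff:

    use regex_automata::{
        dfa::{dense, Automaton},
        MatchError,
    };

    fn search_with_quit() -> Result<(), Box<dyn std::error::Error>> {
        // Treat 'x' as a quit byte: any search that encounters it errors out
        // instead of reporting a possibly incorrect result.
        let dfa = dense::Builder::new()
            .configure(dense::Config::new().quit(b'x', true))
            .build("[[:word:]]+$")?;
        match dfa.find_leftmost_fwd(b"abcxyz") {
            Ok(m) => println!("match: {:?}", m),
            // 'offset' reports where the quit byte was seen, as asserted above.
            Err(MatchError::Quit { byte, offset }) => {
                println!("quit on byte {:?} at offset {}", byte, offset);
            }
            Err(err) => return Err(err.into()),
        }
        Ok(())
    }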
diff --git a/vendor/regex-automata/tests/dfa/mod.rs b/vendor/regex-automata/tests/dfa/mod.rs
new file mode 100644
index 000000000..f4299510c
--- /dev/null
+++ b/vendor/regex-automata/tests/dfa/mod.rs
@@ -0,0 +1,2 @@
+mod api;
+mod suite;
diff --git a/vendor/regex-automata/tests/dfa/suite.rs b/vendor/regex-automata/tests/dfa/suite.rs
new file mode 100644
index 000000000..426ae346d
--- /dev/null
+++ b/vendor/regex-automata/tests/dfa/suite.rs
@@ -0,0 +1,280 @@
+use regex_automata::{
+ dfa::{self, dense, regex::Regex, sparse, Automaton},
+ nfa::thompson,
+ MatchKind, SyntaxConfig,
+};
+use regex_syntax as syntax;
+
+use regex_test::{
+ bstr::{BString, ByteSlice},
+ CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
+ SearchKind as TestSearchKind, TestResult, TestRunner,
+};
+
+use crate::{suite, Result};
+
+/// Runs the test suite with the default configuration.
+#[test]
+fn unminimized_default() -> Result<()> {
+ let builder = Regex::builder();
+ TestRunner::new()?
+ .test_iter(suite()?.iter(), dense_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Runs the test suite with byte classes disabled.
+#[test]
+fn unminimized_no_byte_class() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dense(dense::Config::new().byte_classes(false));
+
+ TestRunner::new()?
+ .test_iter(suite()?.iter(), dense_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Runs the test suite with NFA shrinking disabled.
+#[test]
+fn unminimized_no_nfa_shrink() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.thompson(thompson::Config::new().shrink(false));
+
+ TestRunner::new()?
+ .test_iter(suite()?.iter(), dense_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Runs the test suite on a minimized DFA with an otherwise default
+/// configuration.
+#[test]
+fn minimized_default() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dense(dense::Config::new().minimize(true));
+ TestRunner::new()?
+ // These regexes tend to be too big. Minimization takes... forever.
+ .blacklist("expensive")
+ .test_iter(suite()?.iter(), dense_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Runs the test suite on a minimized DFA with byte classes disabled.
+#[test]
+fn minimized_no_byte_class() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dense(dense::Config::new().minimize(true).byte_classes(false));
+
+ TestRunner::new()?
+ // These regexes tend to be too big. Minimization takes... forever.
+ .blacklist("expensive")
+ .test_iter(suite()?.iter(), dense_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Runs the test suite on a sparse unminimized DFA.
+#[test]
+fn sparse_unminimized_default() -> Result<()> {
+ let builder = Regex::builder();
+ TestRunner::new()?
+ .test_iter(suite()?.iter(), sparse_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// A basic sanity test that checks we can serialize and then deserialize
+/// a regex, and that the resulting regex can be used for searching correctly.
+#[test]
+fn serialization_unminimized_default() -> Result<()> {
+ let builder = Regex::builder();
+ let my_compiler = |builder| {
+ compiler(builder, |builder, re| {
+ let builder = builder.clone();
+ let (fwd_bytes, _) = re.forward().to_bytes_native_endian();
+ let (rev_bytes, _) = re.reverse().to_bytes_native_endian();
+ Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ let fwd: dense::DFA<&[u32]> =
+ dense::DFA::from_bytes(&fwd_bytes).unwrap().0;
+ let rev: dense::DFA<&[u32]> =
+ dense::DFA::from_bytes(&rev_bytes).unwrap().0;
+ let re = builder.build_from_dfas(fwd, rev);
+
+ run_test(&re, test)
+ }))
+ })
+ };
+ TestRunner::new()?
+ .test_iter(suite()?.iter(), my_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// A basic sanity test that checks we can serialize and then deserialize a
+/// regex using sparse DFAs, and that the resulting regex can be used for
+/// searching correctly.
+#[test]
+fn sparse_serialization_unminimized_default() -> Result<()> {
+ let builder = Regex::builder();
+ let my_compiler = |builder| {
+ compiler(builder, |builder, re| {
+ let builder = builder.clone();
+ let fwd_bytes = re.forward().to_sparse()?.to_bytes_native_endian();
+ let rev_bytes = re.reverse().to_sparse()?.to_bytes_native_endian();
+ Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ let fwd: sparse::DFA<&[u8]> =
+ sparse::DFA::from_bytes(&fwd_bytes).unwrap().0;
+ let rev: sparse::DFA<&[u8]> =
+ sparse::DFA::from_bytes(&rev_bytes).unwrap().0;
+ let re = builder.build_from_dfas(fwd, rev);
+ run_test(&re, test)
+ }))
+ })
+ };
+ TestRunner::new()?
+ .test_iter(suite()?.iter(), my_compiler(builder))
+ .assert();
+ Ok(())
+}
+
+fn dense_compiler(
+ builder: dfa::regex::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+ compiler(builder, |_, re| {
+ Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ run_test(&re, test)
+ }))
+ })
+}
+
+fn sparse_compiler(
+ builder: dfa::regex::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+ compiler(builder, |builder, re| {
+ let fwd = re.forward().to_sparse()?;
+ let rev = re.reverse().to_sparse()?;
+ let re = builder.build_from_dfas(fwd, rev);
+ Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ run_test(&re, test)
+ }))
+ })
+}
+
+fn compiler(
+ mut builder: dfa::regex::Builder,
+ mut create_matcher: impl FnMut(
+ &dfa::regex::Builder,
+ Regex,
+ ) -> Result<CompiledRegex>,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+ move |test, regexes| {
+ let regexes = regexes
+ .iter()
+ .map(|r| r.to_str().map(|s| s.to_string()))
+ .collect::<std::result::Result<Vec<String>, _>>()?;
+
+ // Check if our regex contains things that aren't supported by DFAs.
+ // That is, Unicode word boundaries when searching non-ASCII text.
+ let mut thompson = thompson::Builder::new();
+ thompson.configure(config_thompson(test));
+ // TODO: Modify Hir to report facts like this, instead of needing to
+ // build an NFA to do it.
+ if let Ok(nfa) = thompson.build_many(&regexes) {
+ let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
+ if nfa.has_word_boundary_unicode() && non_ascii {
+ return Ok(CompiledRegex::skip());
+ }
+ }
+ if !configure_regex_builder(test, &mut builder) {
+ return Ok(CompiledRegex::skip());
+ }
+ create_matcher(&builder, builder.build_many(&regexes)?)
+ }
+}
+
+fn run_test<A: Automaton>(re: &Regex<A>, test: &RegexTest) -> Vec<TestResult> {
+ let is_match = if re.is_match(test.input()) {
+ TestResult::matched()
+ } else {
+ TestResult::no_match()
+ };
+ let is_match = is_match.name("is_match");
+
+ let find_matches = match test.search_kind() {
+ TestSearchKind::Earliest => {
+ let it = re
+ .find_earliest_iter(test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_earliest_iter")
+ }
+ TestSearchKind::Leftmost => {
+ let it = re
+ .find_leftmost_iter(test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_leftmost_iter")
+ }
+ TestSearchKind::Overlapping => {
+ let it = re
+ .find_overlapping_iter(test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_overlapping_iter")
+ }
+ };
+
+ vec![is_match, find_matches]
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_regex_builder(
+ test: &RegexTest,
+ builder: &mut dfa::regex::Builder,
+) -> bool {
+ let match_kind = match test.match_kind() {
+ TestMatchKind::All => MatchKind::All,
+ TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
+ TestMatchKind::LeftmostLongest => return false,
+ };
+
+ let syntax_config = SyntaxConfig::new()
+ .case_insensitive(test.case_insensitive())
+ .unicode(test.unicode())
+ .utf8(test.utf8());
+ let dense_config = dense::Config::new()
+ .anchored(test.anchored())
+ .match_kind(match_kind)
+ .unicode_word_boundary(true);
+ let regex_config = Regex::config().utf8(test.utf8());
+
+ builder
+ .configure(regex_config)
+ .syntax(syntax_config)
+ .thompson(config_thompson(test))
+ .dense(dense_config);
+ true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+ thompson::Config::new().utf8(test.utf8())
+}
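Stripped of the harness's compiler plumbing, the round trip exercised by the serialization tests reduces to a handful of calls. A hedged sketch mirroring those calls; the pattern and haystack are illustrative:

    use regex_automata::dfa::{dense, regex::Regex};

    fn roundtrip() -> Result<(), Box<dyn std::error::Error>> {
        let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
        // Serialize both underlying DFAs to raw bytes...
        let (fwd_bytes, _) = re.forward().to_bytes_native_endian();
        let (rev_bytes, _) = re.reverse().to_bytes_native_endian();
        // ...then rebuild a Regex whose DFAs borrow those bytes directly.
        let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes)?.0;
        let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes)?.0;
        let re2 = Regex::builder().build_from_dfas(fwd, rev);
        assert!(re2.is_match(b"2010-03-14"));
        Ok(())
    }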
diff --git a/vendor/regex-automata/tests/hybrid/api.rs b/vendor/regex-automata/tests/hybrid/api.rs
new file mode 100644
index 000000000..9a834dbb8
--- /dev/null
+++ b/vendor/regex-automata/tests/hybrid/api.rs
@@ -0,0 +1,195 @@
+use std::error::Error;
+
+use regex_automata::{
+ hybrid::{
+ dfa::{self, DFA},
+ regex::Regex,
+ OverlappingState,
+ },
+ nfa::thompson,
+ HalfMatch, MatchError, MatchKind, MultiMatch,
+};
+
+use crate::util::{BunkPrefilter, SubstringPrefilter};
+
+// Tests that too many cache resets cause the lazy DFA to quit.
+//
+// We only test this on 64-bit because the test is carefully tuned to
+// implementation details of cache sizes. It's not a great test because of
+// that, but it does check some interesting properties around how positions are
+// reported when a search "gives up."
+#[test]
+#[cfg(target_pointer_width = "64")]
+fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
+ // This is a carefully chosen regex. The idea is to pick one that requires
+ // some decent number of states (hence the bounded repetition). But we
+ // specifically choose to create a class with an ASCII letter and a
+ // non-ASCII letter so that we can check that no new states are created
+ // once the cache is full. Namely, if we fill up the cache on a haystack
+ // of 'a's, then in order to match one 'β', a new state will need to be
+ // created since a 'β' is encoded with multiple bytes. Since there's no
+ // room for this state, the search should quit at the very first position.
+ let pattern = r"[aβ]{100}";
+ let dfa = DFA::builder()
+ .configure(
+ // Configure it so that we have the minimum cache capacity
+ // possible. And that if any resets occur, the search quits.
+ DFA::config()
+ .skip_cache_capacity_check(true)
+ .cache_capacity(0)
+ .minimum_cache_clear_count(Some(0)),
+ )
+ .build(pattern)?;
+ let mut cache = dfa.create_cache();
+
+ let haystack = "a".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 25 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
+ assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
+ assert_eq!(
+ dfa.find_overlapping_fwd(
+ &mut cache,
+ &haystack,
+ &mut OverlappingState::start()
+ ),
+ Err(err.clone())
+ );
+
+ let haystack = "β".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 0 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+ // no need to test that other find routines quit, since we did that above
+
+ // OK, if we reset the cache, then we should be able to create more states
+ // and make more progress with searching for betas.
+ cache.reset(&dfa);
+ let err = MatchError::GaveUp { offset: 26 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+ // ... switching back to ASCII still makes progress since it just needs to
+ // set transitions on existing states!
+ let haystack = "a".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 13 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+ Ok(())
+}
+
+// Tests that quit bytes in the forward direction work correctly.
+#[test]
+fn quit_fwd() -> Result<(), Box<dyn Error>> {
+ let dfa = DFA::builder()
+ .configure(DFA::config().quit(b'x', true))
+ .build("[[:word:]]+$")?;
+ let mut cache = dfa.create_cache();
+
+ assert_eq!(
+ dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_overlapping_fwd(
+ &mut cache,
+ b"abcxyz",
+ &mut OverlappingState::start()
+ ),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+
+ Ok(())
+}
+
+// Tests that quit bytes in the reverse direction work correctly.
+#[test]
+fn quit_rev() -> Result<(), Box<dyn Error>> {
+ let dfa = DFA::builder()
+ .configure(DFA::config().quit(b'x', true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build("^[[:word:]]+")?;
+ let mut cache = dfa.create_cache();
+
+ assert_eq!(
+ dfa.find_earliest_rev(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+
+ Ok(())
+}
+
+// Tests that if we heuristically enable Unicode word boundaries but then
+// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
+// will panic.
+#[test]
+#[should_panic]
+fn quit_panics() {
+ DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
+}
+
+// This tests an interesting case where even if the Unicode word boundary option
+// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
+// word boundaries to be enabled.
+#[test]
+fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
+ let mut config = DFA::config();
+ for b in 0x80..=0xFF {
+ config = config.quit(b, true);
+ }
+ let dfa = DFA::builder().configure(config).build(r"\b")?;
+ let mut cache = dfa.create_cache();
+ let expected = HalfMatch::must(0, 1);
+ assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
+ Ok(())
+}
+
+// Tests that we can provide a prefilter to a Regex, and the search reports
+// correct results.
+#[test]
+fn prefilter_works() -> Result<(), Box<dyn Error>> {
+ let mut re = Regex::new(r"a[0-9]+").unwrap();
+ re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+ let mut cache = re.create_cache();
+
+ let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
+ let matches: Vec<(usize, usize)> = re
+ .find_leftmost_iter(&mut cache, text)
+ .map(|m| (m.start(), m.end()))
+ .collect();
+ assert_eq!(
+ matches,
+ vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
+ );
+ Ok(())
+}
+
+// This test confirms that a prefilter is active by using a prefilter that
+// reports false negatives.
+#[test]
+fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
+ let text = b"za123";
+ let mut re = Regex::new(r"a[0-9]+").unwrap();
+ let mut cache = re.create_cache();
+
+ re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+ assert_eq!(
+ re.find_leftmost(&mut cache, b"za123"),
+ Some(MultiMatch::must(0, 1, 5))
+ );
+ assert_eq!(
+ re.find_leftmost(&mut cache, b"a123"),
+ Some(MultiMatch::must(0, 0, 4))
+ );
+ re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
+ assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
+ // This checks that the prefilter is used when first starting the search,
+ // instead of waiting until at least one transition has occurred.
+ assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
+ Ok(())
+}
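The cache tests above exercise the degenerate path on purpose; the ordinary lifecycle is simpler. A sketch with a default-capacity cache, where no GaveUp errors are expected (pattern and haystack illustrative):

    use regex_automata::hybrid::dfa::DFA;

    fn lazy_search() -> Result<(), Box<dyn std::error::Error>> {
        let dfa = DFA::new(r"[0-9]+")?;
        // One cache per search context; it owns the lazily built states.
        let mut cache = dfa.create_cache();
        let m = dfa.find_leftmost_fwd(&mut cache, b"abc123")?.unwrap();
        // A forward HalfMatch only carries the end offset of the match.
        assert_eq!(m.offset(), 6);
        // The cache is reusable across searches; reset it only to repoint it
        // at a different DFA or to drop accumulated states.
        cache.reset(&dfa);
        Ok(())
    }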
diff --git a/vendor/regex-automata/tests/hybrid/mod.rs b/vendor/regex-automata/tests/hybrid/mod.rs
new file mode 100644
index 000000000..f4299510c
--- /dev/null
+++ b/vendor/regex-automata/tests/hybrid/mod.rs
@@ -0,0 +1,2 @@
+mod api;
+mod suite;
diff --git a/vendor/regex-automata/tests/hybrid/suite.rs b/vendor/regex-automata/tests/hybrid/suite.rs
new file mode 100644
index 000000000..d60570d84
--- /dev/null
+++ b/vendor/regex-automata/tests/hybrid/suite.rs
@@ -0,0 +1,212 @@
+use regex_automata::{
+ hybrid::{
+ dfa::DFA,
+ regex::{self, Regex},
+ },
+ nfa::thompson,
+ MatchKind, SyntaxConfig,
+};
+use regex_syntax as syntax;
+
+use regex_test::{
+ bstr::{BString, ByteSlice},
+ CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
+ SearchKind as TestSearchKind, TestResult, TestRunner,
+};
+
+use crate::{suite, Result};
+
+/// Tests the default configuration of the hybrid NFA/DFA.
+#[test]
+fn default() -> Result<()> {
+ let builder = Regex::builder();
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA with NFA shrinking disabled.
+///
+/// This is actually the typical configuration one wants for a lazy DFA. NFA
+/// shrinking is mostly only advantageous when building a full DFA since it
+/// can sharply decrease the amount of time determinization takes. But NFA
+/// shrinking is itself otherwise fairly expensive. Since a lazy DFA has
+/// no compilation time (other than for building the NFA of course) before
+/// executing a search, it's usually worth it to forgo NFA shrinking.
+#[test]
+fn no_nfa_shrink() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.thompson(thompson::Config::new().shrink(false));
+ TestRunner::new()?
+ // Without NFA shrinking, this test blows the default cache capacity.
+ .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+ .test_iter(suite()?.iter(), compiler(builder))
+ .assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled.
+#[test]
+fn starts_for_each_pattern() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dfa(DFA::config().starts_for_each_pattern(true));
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when byte classes are disabled.
+///
+/// N.B. Disabling byte classes doesn't avoid any indirection at search time.
+/// All it does is cause every byte value to be its own distinct equivalence
+/// class.
+#[test]
+fn no_byte_classes() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dfa(DFA::config().byte_classes(false));
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests that hybrid NFA/DFA never clears its cache for any test with the
+/// default capacity.
+///
+/// N.B. If a regex suite test is added that causes the cache to be cleared,
+/// then this should just skip that test. (Which can be done by calling the
+/// 'blacklist' method on 'TestRunner'.)
+#[test]
+fn no_cache_clearing() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder.dfa(DFA::config().minimum_cache_clear_count(Some(0)));
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when the minimum cache capacity is set.
+#[test]
+fn min_cache_capacity() -> Result<()> {
+ let mut builder = Regex::builder();
+ builder
+ .dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true));
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+fn compiler(
+ mut builder: regex::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+ move |test, regexes| {
+ let regexes = regexes
+ .iter()
+ .map(|r| r.to_str().map(|s| s.to_string()))
+ .collect::<std::result::Result<Vec<String>, _>>()?;
+
+ // Check if our regex contains things that aren't supported by DFAs.
+ // That is, Unicode word boundaries when searching non-ASCII text.
+ let mut thompson = thompson::Builder::new();
+ thompson.syntax(config_syntax(test)).configure(config_thompson(test));
+ if let Ok(nfa) = thompson.build_many(&regexes) {
+ let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
+ if nfa.has_word_boundary_unicode() && non_ascii {
+ return Ok(CompiledRegex::skip());
+ }
+ }
+ if !configure_regex_builder(test, &mut builder) {
+ return Ok(CompiledRegex::skip());
+ }
+ let re = builder.build_many(&regexes)?;
+ let mut cache = re.create_cache();
+ Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ run_test(&re, &mut cache, test)
+ }))
+ }
+}
+
+fn run_test(
+ re: &Regex,
+ cache: &mut regex::Cache,
+ test: &RegexTest,
+) -> Vec<TestResult> {
+ let is_match = if re.is_match(cache, test.input()) {
+ TestResult::matched()
+ } else {
+ TestResult::no_match()
+ };
+ let is_match = is_match.name("is_match");
+
+ let find_matches = match test.search_kind() {
+ TestSearchKind::Earliest => {
+ let it = re
+ .find_earliest_iter(cache, test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_earliest_iter")
+ }
+ TestSearchKind::Leftmost => {
+ let it = re
+ .find_leftmost_iter(cache, test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_leftmost_iter")
+ }
+ TestSearchKind::Overlapping => {
+ let it = re
+ .find_overlapping_iter(cache, test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_overlapping_iter")
+ }
+ };
+ vec![is_match, find_matches]
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_regex_builder(
+ test: &RegexTest,
+ builder: &mut regex::Builder,
+) -> bool {
+ let match_kind = match test.match_kind() {
+ TestMatchKind::All => MatchKind::All,
+ TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
+ TestMatchKind::LeftmostLongest => return false,
+ };
+
+ let dense_config = DFA::config()
+ .anchored(test.anchored())
+ .match_kind(match_kind)
+ .unicode_word_boundary(true);
+ let regex_config = Regex::config().utf8(test.utf8());
+ builder
+ .configure(regex_config)
+ .syntax(config_syntax(test))
+ .thompson(config_thompson(test))
+ .dfa(dense_config);
+ true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+ thompson::Config::new().utf8(test.utf8())
+}
+
+/// Configuration of the regex parser from a regex test.
+fn config_syntax(test: &RegexTest) -> SyntaxConfig {
+ SyntaxConfig::new()
+ .case_insensitive(test.case_insensitive())
+ .unicode(test.unicode())
+ .utf8(test.utf8())
+}
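configure_regex_builder shows the layering that recurs across these suites: parser settings, then NFA settings, then lazy-DFA settings, all on one builder. A standalone sketch of the same layering with hand-picked, illustrative values:

    use regex_automata::{
        hybrid::{dfa::DFA, regex::Regex},
        nfa::thompson,
        MatchKind, SyntaxConfig,
    };

    fn build_configured() -> Result<(), Box<dyn std::error::Error>> {
        let re = Regex::builder()
            .syntax(SyntaxConfig::new().case_insensitive(true))
            .thompson(thompson::Config::new().utf8(false))
            .dfa(DFA::config().match_kind(MatchKind::LeftmostFirst))
            .build(r"foo[0-9]+")?;
        let mut cache = re.create_cache();
        assert!(re.is_match(&mut cache, b"quux FOO123"));
        Ok(())
    }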
diff --git a/vendor/regex-automata/tests/nfa/mod.rs b/vendor/regex-automata/tests/nfa/mod.rs
new file mode 100644
index 000000000..326862147
--- /dev/null
+++ b/vendor/regex-automata/tests/nfa/mod.rs
@@ -0,0 +1 @@
+mod thompson;
diff --git a/vendor/regex-automata/tests/nfa/thompson/mod.rs b/vendor/regex-automata/tests/nfa/thompson/mod.rs
new file mode 100644
index 000000000..3a03f52ce
--- /dev/null
+++ b/vendor/regex-automata/tests/nfa/thompson/mod.rs
@@ -0,0 +1 @@
+mod pikevm;
diff --git a/vendor/regex-automata/tests/nfa/thompson/pikevm/api.rs b/vendor/regex-automata/tests/nfa/thompson/pikevm/api.rs
new file mode 100644
index 000000000..c8199f709
--- /dev/null
+++ b/vendor/regex-automata/tests/nfa/thompson/pikevm/api.rs
@@ -0,0 +1,191 @@
+/*
+use std::error::Error;
+
+use regex_automata::{
+ hybrid::{
+ dfa::{self, DFA},
+ regex::Regex,
+ OverlappingState,
+ },
+ nfa::thompson,
+ HalfMatch, MatchError, MatchKind, MultiMatch,
+};
+
+use crate::util::{BunkPrefilter, SubstringPrefilter};
+
+// Tests that too many cache resets cause the lazy DFA to quit.
+#[test]
+fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
+ // This is a carefully chosen regex. The idea is to pick one that requires
+ // some decent number of states (hence the bounded repetition). But we
+ // specifically choose to create a class with an ASCII letter and a
+ // non-ASCII letter so that we can check that no new states are created
+ // once the cache is full. Namely, if we fill up the cache on a haystack
+ // of 'a's, then in order to match one 'β', a new state will need to be
+ // created since a 'β' is encoded with multiple bytes. Since there's no
+ // room for this state, the search should quit at the very first position.
+ let pattern = r"[aβ]{100}";
+ let dfa = DFA::builder()
+ .configure(
+ // Configure it so that we have the minimum cache capacity
+ // possible. And that if any resets occur, the search quits.
+ DFA::config()
+ .skip_cache_capacity_check(true)
+ .cache_capacity(0)
+ .minimum_cache_clear_count(Some(0)),
+ )
+ .build(pattern)?;
+ let mut cache = dfa.create_cache();
+
+ let haystack = "a".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 25 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
+ assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
+ assert_eq!(
+ dfa.find_overlapping_fwd(
+ &mut cache,
+ &haystack,
+ &mut OverlappingState::start()
+ ),
+ Err(err.clone())
+ );
+
+ let haystack = "β".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 0 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+ // no need to test that other find routines quit, since we did that above
+
+ // OK, if we reset the cache, then we should be able to create more states
+ // and make more progress with searching for betas.
+ cache.reset(&dfa);
+ let err = MatchError::GaveUp { offset: 26 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+ // ... switching back to ASCII still makes progress since it just needs to
+ // set transitions on existing states!
+ let haystack = "a".repeat(101).into_bytes();
+ let err = MatchError::GaveUp { offset: 13 };
+ assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+ Ok(())
+}
+
+// Tests that quit bytes in the forward direction work correctly.
+#[test]
+fn quit_fwd() -> Result<(), Box<dyn Error>> {
+ let dfa = DFA::builder()
+ .configure(DFA::config().quit(b'x', true))
+ .build("[[:word:]]+$")?;
+ let mut cache = dfa.create_cache();
+
+ assert_eq!(
+ dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_overlapping_fwd(
+ &mut cache,
+ b"abcxyz",
+ &mut OverlappingState::start()
+ ),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+
+ Ok(())
+}
+
+// Tests that quit bytes in the reverse direction work correctly.
+#[test]
+fn quit_rev() -> Result<(), Box<dyn Error>> {
+ let dfa = DFA::builder()
+ .configure(DFA::config().quit(b'x', true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build("^[[:word:]]+")?;
+ let mut cache = dfa.create_cache();
+
+ assert_eq!(
+ dfa.find_earliest_rev(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+ assert_eq!(
+ dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
+ Err(MatchError::Quit { byte: b'x', offset: 3 })
+ );
+
+ Ok(())
+}
+
+// Tests that if we heuristically enable Unicode word boundaries but then
+// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
+// will panic.
+#[test]
+#[should_panic]
+fn quit_panics() {
+ DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
+}
+
+// This tests an interesting case where even if the Unicode word boundary option
+// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
+// word boundaries to be enabled.
+#[test]
+fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
+ let mut config = DFA::config();
+ for b in 0x80..=0xFF {
+ config = config.quit(b, true);
+ }
+ let dfa = DFA::builder().configure(config).build(r"\b")?;
+ let mut cache = dfa.create_cache();
+ let expected = HalfMatch::must(0, 1);
+ assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
+ Ok(())
+}
+
+// Tests that we can provide a prefilter to a Regex, and the search reports
+// correct results.
+#[test]
+fn prefilter_works() -> Result<(), Box<dyn Error>> {
+ let mut re = Regex::new(r"a[0-9]+").unwrap();
+ re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+ let mut cache = re.create_cache();
+
+ let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
+ let matches: Vec<(usize, usize)> = re
+ .find_leftmost_iter(&mut cache, text)
+ .map(|m| (m.start(), m.end()))
+ .collect();
+ assert_eq!(
+ matches,
+ vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
+ );
+ Ok(())
+}
+
+// This test confirms that a prefilter is active by using a prefilter that
+// reports false negatives.
+#[test]
+fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
+ let text = b"za123";
+ let mut re = Regex::new(r"a[0-9]+").unwrap();
+ let mut cache = re.create_cache();
+
+ re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+ assert_eq!(
+ re.find_leftmost(&mut cache, b"za123"),
+ Some(MultiMatch::must(0, 1, 5))
+ );
+ assert_eq!(
+ re.find_leftmost(&mut cache, b"a123"),
+ Some(MultiMatch::must(0, 0, 4))
+ );
+ re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
+ assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
+ // This checks that the prefilter is used when first starting the search,
+ // instead of waiting until at least one transition has occurred.
+ assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
+ Ok(())
+}
+*/
diff --git a/vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs b/vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs
new file mode 100644
index 000000000..f4299510c
--- /dev/null
+++ b/vendor/regex-automata/tests/nfa/thompson/pikevm/mod.rs
@@ -0,0 +1,2 @@
+mod api;
+mod suite;
diff --git a/vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs b/vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs
new file mode 100644
index 000000000..e5505d59a
--- /dev/null
+++ b/vendor/regex-automata/tests/nfa/thompson/pikevm/suite.rs
@@ -0,0 +1,109 @@
+use regex_automata::{
+ nfa::thompson::{
+ self,
+ pikevm::{self, PikeVM},
+ },
+ MatchKind, SyntaxConfig,
+};
+use regex_syntax as syntax;
+
+use regex_test::{
+ bstr::{BString, ByteSlice},
+ CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
+ SearchKind as TestSearchKind, TestResult, TestRunner,
+};
+
+use crate::{suite, Result};
+
+/// Tests the default configuration of the PikeVM.
+#[test]
+fn default() -> Result<()> {
+ let builder = PikeVM::builder();
+ TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+ Ok(())
+}
+
+fn compiler(
+ mut builder: pikevm::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+ move |test, regexes| {
+ let regexes = regexes
+ .iter()
+ .map(|r| r.to_str().map(|s| s.to_string()))
+ .collect::<std::result::Result<Vec<String>, _>>()?;
+ if !configure_pikevm_builder(test, &mut builder) {
+ return Ok(CompiledRegex::skip());
+ }
+ let re = builder.build_many(&regexes)?;
+ let mut cache = re.create_cache();
+ Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+ run_test(&re, &mut cache, test)
+ }))
+ }
+}
+
+fn run_test(
+ re: &PikeVM,
+ cache: &mut pikevm::Cache,
+ test: &RegexTest,
+) -> Vec<TestResult> {
+ // let is_match = if re.is_match(cache, test.input()) {
+ // TestResult::matched()
+ // } else {
+ // TestResult::no_match()
+ // };
+ // let is_match = is_match.name("is_match");
+
+ let find_matches = match test.search_kind() {
+ TestSearchKind::Earliest => {
+ TestResult::skip().name("find_earliest_iter")
+ }
+ TestSearchKind::Leftmost => {
+ let it = re
+ .find_leftmost_iter(cache, test.input())
+ .take(test.match_limit().unwrap_or(std::usize::MAX))
+ .map(|m| Match {
+ id: m.pattern().as_usize(),
+ start: m.start(),
+ end: m.end(),
+ });
+ TestResult::matches(it).name("find_leftmost_iter")
+ }
+ TestSearchKind::Overlapping => {
+ TestResult::skip().name("find_overlapping_iter")
+ }
+ };
+ // vec![is_match, find_matches]
+ vec![find_matches]
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_pikevm_builder(
+ test: &RegexTest,
+ builder: &mut pikevm::Builder,
+) -> bool {
+ let pikevm_config =
+ PikeVM::config().anchored(test.anchored()).utf8(test.utf8());
+ builder
+ .configure(pikevm_config)
+ .syntax(config_syntax(test))
+ .thompson(config_thompson(test));
+ true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+ thompson::Config::new().utf8(test.utf8())
+}
+
+/// Configuration of the regex parser from a regex test.
+fn config_syntax(test: &RegexTest) -> SyntaxConfig {
+ SyntaxConfig::new()
+ .case_insensitive(test.case_insensitive())
+ .unicode(test.unicode())
+ .utf8(test.utf8())
+}
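Outside the harness, direct PikeVM use reduces to the same three steps the compiler closure performs: build, create a cache, iterate. A hedged sketch (pattern and haystack illustrative):

    use regex_automata::nfa::thompson::pikevm::PikeVM;

    fn pikevm_search() -> Result<(), Box<dyn std::error::Error>> {
        let re = PikeVM::new(r"[a-z]+[0-9]")?;
        let mut cache = re.create_cache();
        let matches: Vec<(usize, usize)> = re
            .find_leftmost_iter(&mut cache, b"abc1 xyz2")
            .map(|m| (m.start(), m.end()))
            .collect();
        assert_eq!(matches, vec![(0, 4), (5, 9)]);
        Ok(())
    }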
diff --git a/vendor/regex-automata/tests/regression.rs b/vendor/regex-automata/tests/regression.rs
index c2d2c1226..e5355fed7 100644
--- a/vendor/regex-automata/tests/regression.rs
+++ b/vendor/regex-automata/tests/regression.rs
@@ -1,4 +1,7 @@
-use regex_automata::{dense, DFA};
+use regex_automata::{
+ dfa::{dense, Automaton},
+ MatchError,
+};
// A regression test for checking that minimization correctly translates
// whether a state is a match state or not. Previously, it was possible for
@@ -34,9 +37,8 @@ fn minimize_sets_correct_match_states() {
";
let dfa = dense::Builder::new()
- .minimize(true)
- .anchored(true)
+ .configure(dense::Config::new().anchored(true).minimize(true))
.build(pattern)
.unwrap();
- assert_eq!(None, dfa.find(b"\xE2"));
+ assert_eq!(Ok(None), dfa.find_leftmost_fwd(b"\xE2"));
}
diff --git a/vendor/regex-automata/tests/suite.rs b/vendor/regex-automata/tests/suite.rs
deleted file mode 100644
index 839719403..000000000
--- a/vendor/regex-automata/tests/suite.rs
+++ /dev/null
@@ -1,250 +0,0 @@
-use regex_automata::{DenseDFA, Regex, RegexBuilder, SparseDFA};
-
-use collection::{RegexTester, SUITE};
-
-#[test]
-fn unminimized_standard() {
- let mut builder = RegexBuilder::new();
- builder.minimize(false).premultiply(false).byte_classes(false);
-
- let mut tester = RegexTester::new().skip_expensive();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn unminimized_premultiply() {
- let mut builder = RegexBuilder::new();
- builder.minimize(false).premultiply(true).byte_classes(false);
-
- let mut tester = RegexTester::new().skip_expensive();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn unminimized_byte_class() {
- let mut builder = RegexBuilder::new();
- builder.minimize(false).premultiply(false).byte_classes(true);
-
- let mut tester = RegexTester::new();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn unminimized_premultiply_byte_class() {
- let mut builder = RegexBuilder::new();
- builder.minimize(false).premultiply(true).byte_classes(true);
-
- let mut tester = RegexTester::new();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn unminimized_standard_no_nfa_shrink() {
- let mut builder = RegexBuilder::new();
- builder
- .minimize(false)
- .premultiply(false)
- .byte_classes(false)
- .shrink(false);
-
- let mut tester = RegexTester::new().skip_expensive();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn minimized_standard() {
- let mut builder = RegexBuilder::new();
- builder.minimize(true).premultiply(false).byte_classes(false);
-
- let mut tester = RegexTester::new().skip_expensive();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn minimized_premultiply() {
- let mut builder = RegexBuilder::new();
- builder.minimize(true).premultiply(true).byte_classes(false);
-
- let mut tester = RegexTester::new().skip_expensive();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn minimized_byte_class() {
- let mut builder = RegexBuilder::new();
- builder.minimize(true).premultiply(false).byte_classes(true);
-
- let mut tester = RegexTester::new();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn minimized_premultiply_byte_class() {
- let mut builder = RegexBuilder::new();
- builder.minimize(true).premultiply(true).byte_classes(true);
-
- let mut tester = RegexTester::new();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-#[test]
-fn minimized_standard_no_nfa_shrink() {
- let mut builder = RegexBuilder::new();
- builder
- .minimize(true)
- .premultiply(false)
- .byte_classes(false)
- .shrink(false);
-
- let mut tester = RegexTester::new().skip_expensive();
- tester.test_all(builder, SUITE.tests());
- tester.assert();
-}
-
-// A basic sanity test that checks we can convert a regex to a smaller
-// representation and that the resulting regex still passes our tests.
-//
-// If tests grow minimal regexes that cannot be represented in 16 bits, then
-// we'll either want to skip those or increase the size to test to u32.
-#[test]
-fn u16() {
- let mut builder = RegexBuilder::new();
- builder.minimize(true).premultiply(false).byte_classes(true);
-
- let mut tester = RegexTester::new().skip_expensive();
- for test in SUITE.tests() {
- let builder = builder.clone();
- let re: Regex = match tester.build_regex(builder, test) {
- None => continue,
- Some(re) => re,
- };
- let small_re = Regex::from_dfas(
- re.forward().to_u16().unwrap(),
- re.reverse().to_u16().unwrap(),
- );
-
- tester.test(test, &small_re);
- }
- tester.assert();
-}
-
-// Test that sparse DFAs work using the standard configuration.
-#[test]
-fn sparse_unminimized_standard() {
- let mut builder = RegexBuilder::new();
- builder.minimize(false).premultiply(false).byte_classes(false);
-
- let mut tester = RegexTester::new().skip_expensive();
- for test in SUITE.tests() {
- let builder = builder.clone();
- let re: Regex = match tester.build_regex(builder, test) {
- None => continue,
- Some(re) => re,
- };
- let fwd = re.forward().to_sparse().unwrap();
- let rev = re.reverse().to_sparse().unwrap();
- let sparse_re = Regex::from_dfas(fwd, rev);
-
- tester.test(test, &sparse_re);
- }
- tester.assert();
-}
-
-// Test that sparse DFAs work after converting them to a different state ID
-// representation.
-#[test]
-fn sparse_u16() {
- let mut builder = RegexBuilder::new();
- builder.minimize(true).premultiply(false).byte_classes(false);
-
- let mut tester = RegexTester::new().skip_expensive();
- for test in SUITE.tests() {
- let builder = builder.clone();
- let re: Regex = match tester.build_regex(builder, test) {
- None => continue,
- Some(re) => re,
- };
- let fwd = re.forward().to_sparse().unwrap().to_u16().unwrap();
- let rev = re.reverse().to_sparse().unwrap().to_u16().unwrap();
- let sparse_re = Regex::from_dfas(fwd, rev);
-
- tester.test(test, &sparse_re);
- }
- tester.assert();
-}
-
-// Another basic sanity test that checks we can serialize and then deserialize
-// a regex, and that the resulting regex can be used for searching correctly.
-#[test]
-fn serialization_roundtrip() {
- let mut builder = RegexBuilder::new();
- builder.premultiply(false).byte_classes(true);
-
- let mut tester = RegexTester::new().skip_expensive();
- for test in SUITE.tests() {
- let builder = builder.clone();
- let re: Regex = match tester.build_regex(builder, test) {
- None => continue,
- Some(re) => re,
- };
-
- let fwd_bytes = re.forward().to_bytes_native_endian().unwrap();
- let rev_bytes = re.reverse().to_bytes_native_endian().unwrap();
- let fwd: DenseDFA<&[usize], usize> =
- unsafe { DenseDFA::from_bytes(&fwd_bytes) };
- let rev: DenseDFA<&[usize], usize> =
- unsafe { DenseDFA::from_bytes(&rev_bytes) };
- let re = Regex::from_dfas(fwd, rev);
-
- tester.test(test, &re);
- }
- tester.assert();
-}
-
-// A basic sanity test that checks we can serialize and then deserialize a
-// regex using sparse DFAs, and that the resulting regex can be used for
-// searching correctly.
-#[test]
-fn sparse_serialization_roundtrip() {
- let mut builder = RegexBuilder::new();
- builder.byte_classes(true);
-
- let mut tester = RegexTester::new().skip_expensive();
- for test in SUITE.tests() {
- let builder = builder.clone();
- let re: Regex = match tester.build_regex(builder, test) {
- None => continue,
- Some(re) => re,
- };
-
- let fwd_bytes = re
- .forward()
- .to_sparse()
- .unwrap()
- .to_bytes_native_endian()
- .unwrap();
- let rev_bytes = re
- .reverse()
- .to_sparse()
- .unwrap()
- .to_bytes_native_endian()
- .unwrap();
- let fwd: SparseDFA<&[u8], usize> =
- unsafe { SparseDFA::from_bytes(&fwd_bytes) };
- let rev: SparseDFA<&[u8], usize> =
- unsafe { SparseDFA::from_bytes(&rev_bytes) };
- let re = Regex::from_dfas(fwd, rev);
-
- tester.test(test, &re);
- }
- tester.assert();
-}
diff --git a/vendor/regex-automata/tests/tests.rs b/vendor/regex-automata/tests/tests.rs
index fb4cd7717..e4728470c 100644
--- a/vendor/regex-automata/tests/tests.rs
+++ b/vendor/regex-automata/tests/tests.rs
@@ -1,25 +1,44 @@
-#[cfg(feature = "std")]
-#[macro_use]
-extern crate lazy_static;
-#[cfg(feature = "std")]
-extern crate regex;
-#[cfg(feature = "std")]
-extern crate regex_automata;
-#[cfg(feature = "std")]
-extern crate serde;
-#[cfg(feature = "std")]
-extern crate serde_bytes;
-#[cfg(feature = "std")]
-#[macro_use]
-extern crate serde_derive;
-#[cfg(feature = "std")]
-extern crate toml;
+#![allow(warnings)]
-#[cfg(feature = "std")]
-mod collection;
-#[cfg(feature = "std")]
+use regex_test::RegexTests;
+
+mod dfa;
+mod hybrid;
+mod nfa;
mod regression;
-#[cfg(feature = "std")]
-mod suite;
-#[cfg(feature = "std")]
-mod unescape;
+mod util;
+
+type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
+
+fn suite() -> Result<RegexTests> {
+ let mut tests = RegexTests::new();
+ macro_rules! load {
+ ($name:expr) => {{
+ const DATA: &[u8] =
+ include_bytes!(concat!("data/", $name, ".toml"));
+ tests.load_slice($name, DATA)?;
+ }};
+ }
+
+ load!("bytes");
+ load!("crazy");
+ load!("earliest");
+ load!("empty");
+ load!("expensive");
+ load!("flags");
+ load!("iter");
+ load!("misc");
+ load!("multiline");
+ load!("no-unicode");
+ load!("overlapping");
+ load!("regression");
+ load!("set");
+ load!("unicode");
+ load!("word-boundary");
+ load!("fowler/basic");
+ load!("fowler/nullsubexpr");
+ load!("fowler/repetition");
+ load!("fowler/repetition-expensive");
+
+ Ok(tests)
+}
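The load! macro is plain compile-time inclusion: each invocation embeds the named TOML file and registers it under that name, with the path resolved relative to tests/tests.rs. For instance, load!("bytes") expands to roughly:

    const DATA: &[u8] = include_bytes!(concat!("data/", "bytes", ".toml"));
    tests.load_slice("bytes", DATA)?;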
diff --git a/vendor/regex-automata/tests/unescape.rs b/vendor/regex-automata/tests/unescape.rs
deleted file mode 100644
index 43fe04e71..000000000
--- a/vendor/regex-automata/tests/unescape.rs
+++ /dev/null
@@ -1,84 +0,0 @@
-#[derive(Clone, Copy, Eq, PartialEq)]
-enum State {
- /// The state after seeing a `\`.
- Escape,
- /// The state after seeing a `\x`.
- HexFirst,
- /// The state after seeing a `\x[0-9A-Fa-f]`.
- HexSecond(char),
- /// Default state.
- Literal,
-}
-
-pub fn unescape(s: &str) -> Vec<u8> {
- use self::State::*;
-
- let mut bytes = vec![];
- let mut state = Literal;
- for c in s.chars() {
- match state {
- Escape => match c {
- '\\' => {
- bytes.push(b'\\');
- state = Literal;
- }
- 'n' => {
- bytes.push(b'\n');
- state = Literal;
- }
- 'r' => {
- bytes.push(b'\r');
- state = Literal;
- }
- 't' => {
- bytes.push(b'\t');
- state = Literal;
- }
- 'x' => {
- state = HexFirst;
- }
- c => {
- bytes.extend(format!(r"\{}", c).into_bytes());
- state = Literal;
- }
- },
- HexFirst => match c {
- '0'..='9' | 'A'..='F' | 'a'..='f' => {
- state = HexSecond(c);
- }
- c => {
- bytes.extend(format!(r"\x{}", c).into_bytes());
- state = Literal;
- }
- },
- HexSecond(first) => match c {
- '0'..='9' | 'A'..='F' | 'a'..='f' => {
- let ordinal = format!("{}{}", first, c);
- let byte = u8::from_str_radix(&ordinal, 16).unwrap();
- bytes.push(byte);
- state = Literal;
- }
- c => {
- let original = format!(r"\x{}{}", first, c);
- bytes.extend(original.into_bytes());
- state = Literal;
- }
- },
- Literal => match c {
- '\\' => {
- state = Escape;
- }
- c => {
- bytes.extend(c.to_string().as_bytes());
- }
- },
- }
- }
- match state {
- Escape => bytes.push(b'\\'),
- HexFirst => bytes.extend(b"\\x"),
- HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()),
- Literal => {}
- }
- bytes
-}
diff --git a/vendor/regex-automata/tests/util.rs b/vendor/regex-automata/tests/util.rs
new file mode 100644
index 000000000..499aa8c6d
--- /dev/null
+++ b/vendor/regex-automata/tests/util.rs
@@ -0,0 +1,57 @@
+use regex_automata::util::prefilter::{self, Candidate, Prefilter};
+
+#[derive(Clone, Debug)]
+pub struct SubstringPrefilter(bstr::Finder<'static>);
+
+impl SubstringPrefilter {
+ pub fn new<B: AsRef<[u8]>>(needle: B) -> SubstringPrefilter {
+ SubstringPrefilter(bstr::Finder::new(needle.as_ref()).into_owned())
+ }
+}
+
+impl Prefilter for SubstringPrefilter {
+ #[inline]
+ fn next_candidate(
+ &self,
+ state: &mut prefilter::State,
+ haystack: &[u8],
+ at: usize,
+ ) -> Candidate {
+ self.0
+ .find(&haystack[at..])
+ .map(|i| Candidate::PossibleStartOfMatch(at + i))
+ .unwrap_or(Candidate::None)
+ }
+
+ fn heap_bytes(&self) -> usize {
+ self.0.needle().len()
+ }
+}
+
+/// A prefilter that always returns `Candidate::None`, even if it's a false
+/// negative. This is useful for confirming that a prefilter is actually
+/// active by asserting an incorrect result.
+#[derive(Clone, Debug)]
+pub struct BunkPrefilter(());
+
+impl BunkPrefilter {
+ pub fn new() -> BunkPrefilter {
+ BunkPrefilter(())
+ }
+}
+
+impl Prefilter for BunkPrefilter {
+ #[inline]
+ fn next_candidate(
+ &self,
+ _state: &mut prefilter::State,
+ _haystack: &[u8],
+ _at: usize,
+ ) -> Candidate {
+ Candidate::None
+ }
+
+ fn heap_bytes(&self) -> usize {
+ 0
+ }
+}
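Both helpers only do work once they are attached to an engine, and the two engines take them differently, as the api tests above show. A sketch, usable only inside this test crate since both types live in this module:

    use regex_automata::{dfa, hybrid};

    fn wire_up_prefilters() {
        // The fully compiled engine takes the prefilter by value...
        let _dense = dfa::regex::Regex::new(r"a[0-9]+")
            .unwrap()
            .with_prefilter(SubstringPrefilter::new("a"));
        // ...while the lazy engine takes an optional boxed trait object.
        let mut lazy = hybrid::regex::Regex::new(r"a[0-9]+").unwrap();
        lazy.set_prefilter(Some(Box::new(BunkPrefilter::new())));
    }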