summaryrefslogtreecommitdiffstats
path: root/vendor/regex
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 03:57:19 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 03:57:19 +0000
commita0b8f38ab54ac451646aa00cd5e91b6c76f22a84 (patch)
treefc451898ccaf445814e26b46664d78702178101d /vendor/regex
parentAdding debian version 1.71.1+dfsg1-2. (diff)
downloadrustc-a0b8f38ab54ac451646aa00cd5e91b6c76f22a84.tar.xz
rustc-a0b8f38ab54ac451646aa00cd5e91b6c76f22a84.zip
Merging upstream version 1.72.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex')
-rw-r--r--vendor/regex/.cargo-checksum.json2
-rw-r--r--vendor/regex/CHANGELOG.md191
-rw-r--r--vendor/regex/Cargo.lock18
-rw-r--r--vendor/regex/Cargo.toml17
-rw-r--r--vendor/regex/README.md4
-rw-r--r--vendor/regex/record/README.md4
-rw-r--r--vendor/regex/record/compile-test/2023-04-19_1.7.3.csv11
-rw-r--r--vendor/regex/record/compile-test/2023-04-20_master.csv11
-rw-r--r--vendor/regex/record/compile-test/README.md27
-rw-r--r--vendor/regex/src/compile.rs297
-rw-r--r--vendor/regex/src/dfa.rs2
-rw-r--r--vendor/regex/src/error.rs22
-rw-r--r--vendor/regex/src/exec.rs196
-rw-r--r--vendor/regex/src/expand.rs10
-rw-r--r--vendor/regex/src/lib.rs40
-rw-r--r--vendor/regex/src/literal/imp.rs81
-rw-r--r--vendor/regex/src/literal/mod.rs6
-rw-r--r--vendor/regex/src/prog.rs4
-rw-r--r--vendor/regex/src/re_bytes.rs128
-rw-r--r--vendor/regex/src/re_set.rs11
-rw-r--r--vendor/regex/src/re_trait.rs2
-rw-r--r--vendor/regex/src/re_unicode.rs128
-rw-r--r--vendor/regex/tests/regression.rs41
-rw-r--r--vendor/regex/tests/regression_fuzz.rs9
-rw-r--r--vendor/regex/tests/replace.rs16
-rw-r--r--vendor/regex/tests/set.rs7
-rw-r--r--vendor/regex/tests/unicode.rs3
27 files changed, 1032 insertions, 256 deletions
diff --git a/vendor/regex/.cargo-checksum.json b/vendor/regex/.cargo-checksum.json
index d6ea1df9a..a85152d35 100644
--- a/vendor/regex/.cargo-checksum.json
+++ b/vendor/regex/.cargo-checksum.json
@@ -1 +1 @@
-{"files":{"CHANGELOG.md":"c66cc76a297a1068a3aa81d08326175be887b60fb6330c4b7922f4ad286bd144","Cargo.lock":"fa78ce7955999185e017ef812db215124b65ae8aef67b4ea9ca5bd18c50b0d35","Cargo.toml":"566dca60827f0cbafd1d920457d6d7fad9bd5b85630832207de8cc9c35cab274","HACKING.md":"17818f7a17723608f6bdbe6388ad0a913d4f96f76a16649aaf4e274b1fa0ea97","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","PERFORMANCE.md":"0d5ef3866386918dfdefb1aa9a28cfe33cb3c8ceeb79f3f8ba5b88253dd95991","README.md":"f69204a0f446047d8f4d1f3d84b75f235adb5c26477f3a37b671411bc954d14c","UNICODE.md":"a8a8399540eed000d19420135a527f400247a04572e44d124c786b870f518776","examples/regexdna-input.txt":"156a49710bb3e1ed4bc2bbb0af0f383b747b3d0281453cfff39c296124c598f8","examples/regexdna-output.txt":"35e85b19b70a893d752fd43e54e1e9da08bac43559191cea85b33387c24c4cc1","examples/shootout-regex-dna-bytes.rs":"fa2daedb4e0a05f64f33f4af62fbb0176db998e3676f8637ab684b725367a7b4","examples/shootout-regex-dna-cheat.rs":"1f871a6eaaf8372299fa3c762051112fa89a14235b03f734fc50ebd51ecaee72","examples/shootout-regex-dna-replace.rs":"32ffdf13ac6c4ce3fc32116a048e9cc682aa34cdb8e5beaf565a22addbdcd9ab","examples/shootout-regex-dna-single-cheat.rs":"809f75bf1e1917a53623eb6f1a3ce3b7d2ed98a6a1dbc0bd4853bec49a0c6f94","examples/shootout-regex-dna-single.rs":"1ab14f5703cd4be2e75a2e792e0ba1d322b9e4b14535d396805a4316d577f5bb","examples/shootout-regex-dna.rs":"20ea46ab63f91e3ac6a64e997eadd436a9cbc2f1bdade28e4512052f0e25bc34","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/backtrack.rs":"52987d80448f3d7f5d4e3545ddfc09f1f30de7602d9b5489961db4b215a377fd","src/compile.rs":"79a59be2d2db650b5a322e15e9bf1d3227944410bc780fc6089da8f4d2609b77","src/dfa.rs":"10273980d1f08aaff495e11efa240249a2b2c08a4db7c49c8d6759bc65a3b174","src/error.rs":"71c85db839514f26ee024a689061743ea94a34eb7a3291e6c2b69b45a9682d09","src/exec.
rs":"4726e2c210c6adb91b25390cbc864ab71deabb17ca87a46e83acef26a194bfc2","src/expand.rs":"71220309a3bac797f55129f49e79c03e96efec894ea338c735b78695367e04ca","src/find_byte.rs":"b387247b77e3269f057c3399aefe5a815032c3af918c876f80eb4b282e4eb95e","src/freqs.rs":"255555f3d95b08a5bb3bc2f38d5a06cc100a39c0f0127fe4f50c33afa1cadc65","src/input.rs":"13f49c1bce2fadd04a45b421d374cd0f8b72bef83f7e8fda958962aaccbe799a","src/lib.rs":"982fadba415c4c5b93f4d7d4a73a23ec88e2d96daaa03b679d14490ea0f63197","src/literal/imp.rs":"b7f63a861c299bea4baaab17353a420ee339c2cf76d3858c95f39342bd4463e7","src/literal/mod.rs":"533f1d68af088e9485170145e27518368e541a0337fdb44f63249ebf97310300","src/pattern.rs":"993d8b6b4bcea5e02bee3c76e17c356a5a47f8fc53c5555edfd1ebb71c0878bf","src/pikevm.rs":"6c0eaa7e878c945ac4c3c545c98f5706ad04846fc432a5086c8ee78eb030dfa7","src/pool.rs":"942e991ae31ef349bd76efd78b2a712c01166dec965bf93742977ed0870d5a10","src/prog.rs":"bebb3e50745bbc05d6c8240d972ba55a1818c51b1161dc1c21f3fe13c11d4884","src/re_builder.rs":"943344bf6e2fc90902ee04b11b741c32418ac6814b21b7982cc0a3a817713f3e","src/re_bytes.rs":"63ee1db1637a3764addb10e27248129acffaf78bb0a69624add4d9d6f1e97040","src/re_set.rs":"7921ac4a919b7a5deffe82d099a9ccaf5487aebd890dfb7a661e602c6ad3f1a9","src/re_trait.rs":"d237121b6f6b606836c72305cbcb3bbdbc54d1f6827d19a19cd0fbb4372e0145","src/re_unicode.rs":"4ca66d6e835df7c0f570c8cde52667ef90ba1687d5285f12fedef2e38ae925b4","src/sparse.rs":"0da3ddb7972109869248a764dbb10254555f4bb51c375e89fb3fab9cafa47320","src/testdata/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","src/testdata/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","src/testdata/basic.dat":"b5b33aa89d48a61cd67cb1fbfd8f70e62c83e30b86256f9f915a5190dd38ff06","src/testdata/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","src/testdata/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","src/utf8.rs":"f85a356ff5d5b19e417b
73ce1dd84581b21d283f6dddd195547c30af9c60bd1a","test":"0d62fdca7da12fc19ea5306b5de1d83e68d9365a029c043d524334da138b0304","tests/api.rs":"7b2a0ef75e99b9776094967bd66e9cdeaa8e11359f5f0a12bd08ef0e8d0c11fc","tests/api_str.rs":"2ae38c04e7e8fac008b609a820d0b1561ba75f39b0edc0987d6d3d06132da77f","tests/bytes.rs":"edc50f526c5fee43df89d639ef18b237e4eb91e9d533bfc43f3cbab7417d38ba","tests/consistent.rs":"d69435154c09478076497216e43081a835ac65147181a4fbddad7bff469605b2","tests/crates_regex.rs":"91a59d470e0700b4bcb3ff735d06799f3107b8ef4875a2e9904607b164be0326","tests/crazy.rs":"c0d56380dff19bdd5d7a3eb731d0e2dc564e169a1b73c81e1879b1e87f5f5f77","tests/flags.rs":"05caace2c81a99d2168037f3a38035d4dffe9f85ef3ebd7ef18b1bc6612f1ea8","tests/fowler.rs":"d78cf914de40b1e125cc92b65ccb444d462586bd07b5e05de4e4a1b5de16aa76","tests/macros.rs":"6db70c16fc90df13e6b30d2b606f8b6dd4dc976697967f6ee001b15aab6d0b19","tests/macros_bytes.rs":"a049f528a93173a1bb176cd46932dce1880679f4a1752e099be920f0e4546fd0","tests/macros_str.rs":"e585b1461374c45a2eca44ca045bc3c1fe984b2b4212e432b0c695b420e708b7","tests/misc.rs":"395f52793fa022e4cdda78675b6a6fba1a3106b4b99c834c39f7801574054bd1","tests/multiline.rs":"1b1a3326ed976437c1357f01d81833ece7ea244f38826246eab55cacd5d0862a","tests/noparse.rs":"12b6be0eff3d80779d33c6459396c74c0f6ebf4ddc9f1d33c3e747ea9e3bf268","tests/regression.rs":"1c965fefb8c7a2b1dfdab3e3fdeebaf47846555c50c8005e5537f96a52a3e252","tests/regression_fuzz.rs":"a504ec563e0d23bd2039493b7b1767fe1f831d7d668f6f4b2ecd124fc7899bcd","tests/replace.rs":"66f97532e40697934e2a77605b9002dfd22c46b6033ccb755e7660d855229f41","tests/searcher.rs":"ce35e47b0a276a7e8c9060c6a0b225ffba163aebc61fbc15555a6897fa0e552c","tests/set.rs":"f1e2af6baeeaed3cc99ed347ff516fe7b2eb0027ef64b891502e1486598eaf8a","tests/shortest_match.rs":"a2c94390c0d61bc24796b4c1288c924e90c8c9c6156fdebb858175177a194a42","tests/suffix_reverse.rs":"b95f89397404871227d9efe6df23b9ded147f183db81597e608f693955c668b5","tests/test_backtrack.rs":"b70c5e5f1241efd76dd9f9
dd4a4df8a7b38113bd407d1f5f56867f1176177a59","tests/test_backtrack_bytes.rs":"b8a111d4b4109c8bba7e2afb650572c495a14d357fb1f743c1076fb001f704b5","tests/test_backtrack_utf8bytes.rs":"c0c279785d18beac2b4e178e7bf6c14ed235d65f00ca467cfd9c333d79487649","tests/test_crates_regex.rs":"fd9525c2eef0e2f8cb7f787bc2b721bcd0b5d84f3bca49adfe48d657a99c721a","tests/test_default.rs":"b32c11a43da4379a3717dd7a5f152c811257c7d6595c9d3c51f2de102e320c87","tests/test_default_bytes.rs":"831d3e6bfb882feb15f700e30304bd34328f888fb4c15c7169371e25024ce9a7","tests/test_nfa.rs":"f119fc43a018249c39c813d57096b0654ff69f337345f2bbd9b0e61cc9137285","tests/test_nfa_bytes.rs":"89eae3bef6a1d0bcea6b5de5be35ad72f613f2ceb8b58fe82a6c6ef2ccdc07d0","tests/test_nfa_utf8bytes.rs":"7d830b4aa401887d7cf098b62fed4cd8017ef8b61f625c7c9a2159a6b4cfeb71","tests/unicode.rs":"1af9db7f09a6b0113b8a64733e06c8415fef720b2fdef227ae398d94332287cd","tests/word_boundary.rs":"7081317ddcec1e82dd4a2090a571c6abf2ff4bbfa8cd10395e1eb3f386157fae","tests/word_boundary_ascii.rs":"cd0be5b5b485de0ba7994b42e2864585556c3d2d8bf5eab05b58931d9aaf4b87","tests/word_boundary_unicode.rs":"75dbcc35d3abc0f9795c2ea99e216dc227b0a5b58e9ca5eef767815ff0513921"},"package":"8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"} \ No newline at end of file
+{"files":{"CHANGELOG.md":"8839af2eea6a564b11b5a8cf5023f121de9b5edd02829448670b2d880ded3bb4","Cargo.lock":"d8f5dca5987e52303b325d99fd65dea8e75202b2dab766dcf9be38e64fd73f46","Cargo.toml":"1bfd9b78fc3842caa6ec435ddc6a4f81123d26d0cf4c1e66c1f1ef05a3ec3e72","HACKING.md":"17818f7a17723608f6bdbe6388ad0a913d4f96f76a16649aaf4e274b1fa0ea97","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","PERFORMANCE.md":"0d5ef3866386918dfdefb1aa9a28cfe33cb3c8ceeb79f3f8ba5b88253dd95991","README.md":"cdcafba78fda99c94f9ea3290ad521fbdbb12043ca6925b10cef801eb4e5e223","UNICODE.md":"a8a8399540eed000d19420135a527f400247a04572e44d124c786b870f518776","examples/regexdna-input.txt":"156a49710bb3e1ed4bc2bbb0af0f383b747b3d0281453cfff39c296124c598f8","examples/regexdna-output.txt":"35e85b19b70a893d752fd43e54e1e9da08bac43559191cea85b33387c24c4cc1","examples/shootout-regex-dna-bytes.rs":"fa2daedb4e0a05f64f33f4af62fbb0176db998e3676f8637ab684b725367a7b4","examples/shootout-regex-dna-cheat.rs":"1f871a6eaaf8372299fa3c762051112fa89a14235b03f734fc50ebd51ecaee72","examples/shootout-regex-dna-replace.rs":"32ffdf13ac6c4ce3fc32116a048e9cc682aa34cdb8e5beaf565a22addbdcd9ab","examples/shootout-regex-dna-single-cheat.rs":"809f75bf1e1917a53623eb6f1a3ce3b7d2ed98a6a1dbc0bd4853bec49a0c6f94","examples/shootout-regex-dna-single.rs":"1ab14f5703cd4be2e75a2e792e0ba1d322b9e4b14535d396805a4316d577f5bb","examples/shootout-regex-dna.rs":"20ea46ab63f91e3ac6a64e997eadd436a9cbc2f1bdade28e4512052f0e25bc34","record/README.md":"02e6f85f8a43f18540e4a52a75d1001494df7aceac3873e9a13e3ceba190206d","record/compile-test/2023-04-19_1.7.3.csv":"460059ba2f10456175ff92bd75d4a365b14a1843e2b46e7b285d58da59e6d3ca","record/compile-test/2023-04-20_master.csv":"6b94df278e4ed82a3fd0d4bfe92a4614714e00435e983c7649ee9f54925f906e","record/compile-test/README.md":"ba2b606993edd8d705ad1677ec954862614e52b028407e1908bb5dfb07767f2d","rustfmt.toml":"1c
a600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/backtrack.rs":"52987d80448f3d7f5d4e3545ddfc09f1f30de7602d9b5489961db4b215a377fd","src/compile.rs":"65b9f083d28a79bcef9584b8da3d87f21b561912de29c6c6053546e1e752980f","src/dfa.rs":"405f24adbf775b0902fd830cc5a5446de80da1a2a5eb950fca357aff5b19163f","src/error.rs":"476a86da4bb115cb85e9327aee6f423c1dade524517178186c747a3baa9be71d","src/exec.rs":"72693556149e1347283ff2499bf624e01fb917076bf4d103a6723f4ecfa9cf65","src/expand.rs":"59e459a9bbd0ae60478a6cbe48203091245e39bbd064e04b50d848d75f6de920","src/find_byte.rs":"b387247b77e3269f057c3399aefe5a815032c3af918c876f80eb4b282e4eb95e","src/freqs.rs":"255555f3d95b08a5bb3bc2f38d5a06cc100a39c0f0127fe4f50c33afa1cadc65","src/input.rs":"13f49c1bce2fadd04a45b421d374cd0f8b72bef83f7e8fda958962aaccbe799a","src/lib.rs":"7cb5ea7fbb41b71d6a9d0692442b8bdfccd10199dd1340e534202f988cfad493","src/literal/imp.rs":"26239f37d7c79a88f154ffb864be282598486d9df9363c918ac3106537119b3d","src/literal/mod.rs":"59fd8bc37784906d729167b69bd14f91094c4c82749984ee5ffd41ae62c38af2","src/pattern.rs":"993d8b6b4bcea5e02bee3c76e17c356a5a47f8fc53c5555edfd1ebb71c0878bf","src/pikevm.rs":"6c0eaa7e878c945ac4c3c545c98f5706ad04846fc432a5086c8ee78eb030dfa7","src/pool.rs":"942e991ae31ef349bd76efd78b2a712c01166dec965bf93742977ed0870d5a10","src/prog.rs":"8ab44101bb2aaf51f00872798f3d926ef150744898538b4ceb5f3d38fbf861f0","src/re_builder.rs":"943344bf6e2fc90902ee04b11b741c32418ac6814b21b7982cc0a3a817713f3e","src/re_bytes.rs":"15a53cccd7d573f668ac38158b140c0e0e51a31ac16de800f24e72c8d364561e","src/re_set.rs":"8b9b9b78fc1dbd8731298269f410c67689baedf4116fb617afd309fd4bfe116c","src/re_trait.rs":"df29beedc00933e34e7f89b4db645cba18db7f7e4cf3f1d48328bddada5191d5","src/re_unicode.rs":"940be2629a8176065f821b419693135fdfdb016b573e8e00a10d963712bf1fa8","src/sparse.rs":"0da3ddb7972109869248a764dbb10254555f4bb51c375e89fb3fab9cafa47320","src/testdata/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","src/
testdata/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","src/testdata/basic.dat":"b5b33aa89d48a61cd67cb1fbfd8f70e62c83e30b86256f9f915a5190dd38ff06","src/testdata/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","src/testdata/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","src/utf8.rs":"f85a356ff5d5b19e417b73ce1dd84581b21d283f6dddd195547c30af9c60bd1a","test":"0d62fdca7da12fc19ea5306b5de1d83e68d9365a029c043d524334da138b0304","tests/api.rs":"7b2a0ef75e99b9776094967bd66e9cdeaa8e11359f5f0a12bd08ef0e8d0c11fc","tests/api_str.rs":"2ae38c04e7e8fac008b609a820d0b1561ba75f39b0edc0987d6d3d06132da77f","tests/bytes.rs":"edc50f526c5fee43df89d639ef18b237e4eb91e9d533bfc43f3cbab7417d38ba","tests/consistent.rs":"d69435154c09478076497216e43081a835ac65147181a4fbddad7bff469605b2","tests/crates_regex.rs":"91a59d470e0700b4bcb3ff735d06799f3107b8ef4875a2e9904607b164be0326","tests/crazy.rs":"c0d56380dff19bdd5d7a3eb731d0e2dc564e169a1b73c81e1879b1e87f5f5f77","tests/flags.rs":"05caace2c81a99d2168037f3a38035d4dffe9f85ef3ebd7ef18b1bc6612f1ea8","tests/fowler.rs":"d78cf914de40b1e125cc92b65ccb444d462586bd07b5e05de4e4a1b5de16aa76","tests/macros.rs":"6db70c16fc90df13e6b30d2b606f8b6dd4dc976697967f6ee001b15aab6d0b19","tests/macros_bytes.rs":"a049f528a93173a1bb176cd46932dce1880679f4a1752e099be920f0e4546fd0","tests/macros_str.rs":"e585b1461374c45a2eca44ca045bc3c1fe984b2b4212e432b0c695b420e708b7","tests/misc.rs":"395f52793fa022e4cdda78675b6a6fba1a3106b4b99c834c39f7801574054bd1","tests/multiline.rs":"1b1a3326ed976437c1357f01d81833ece7ea244f38826246eab55cacd5d0862a","tests/noparse.rs":"12b6be0eff3d80779d33c6459396c74c0f6ebf4ddc9f1d33c3e747ea9e3bf268","tests/regression.rs":"3b15568d79ae7d9845fda737a93cd518db01a7ed388b2ac437389a2a1d50f129","tests/regression_fuzz.rs":"3c99498af578044159336c63c8ac81d65bfc611a0aa80217400544d5caa66827","tests/replace.rs":"5f1bbf3f89de8cd021406a4affd0d07484ba194ac791ac307efd66f2792a2366
","tests/searcher.rs":"ce35e47b0a276a7e8c9060c6a0b225ffba163aebc61fbc15555a6897fa0e552c","tests/set.rs":"a69fab05adabdbf27e788d51d7cea06acfd9017182e2f201d592b45c4fec5618","tests/shortest_match.rs":"a2c94390c0d61bc24796b4c1288c924e90c8c9c6156fdebb858175177a194a42","tests/suffix_reverse.rs":"b95f89397404871227d9efe6df23b9ded147f183db81597e608f693955c668b5","tests/test_backtrack.rs":"b70c5e5f1241efd76dd9f9dd4a4df8a7b38113bd407d1f5f56867f1176177a59","tests/test_backtrack_bytes.rs":"b8a111d4b4109c8bba7e2afb650572c495a14d357fb1f743c1076fb001f704b5","tests/test_backtrack_utf8bytes.rs":"c0c279785d18beac2b4e178e7bf6c14ed235d65f00ca467cfd9c333d79487649","tests/test_crates_regex.rs":"fd9525c2eef0e2f8cb7f787bc2b721bcd0b5d84f3bca49adfe48d657a99c721a","tests/test_default.rs":"b32c11a43da4379a3717dd7a5f152c811257c7d6595c9d3c51f2de102e320c87","tests/test_default_bytes.rs":"831d3e6bfb882feb15f700e30304bd34328f888fb4c15c7169371e25024ce9a7","tests/test_nfa.rs":"f119fc43a018249c39c813d57096b0654ff69f337345f2bbd9b0e61cc9137285","tests/test_nfa_bytes.rs":"89eae3bef6a1d0bcea6b5de5be35ad72f613f2ceb8b58fe82a6c6ef2ccdc07d0","tests/test_nfa_utf8bytes.rs":"7d830b4aa401887d7cf098b62fed4cd8017ef8b61f625c7c9a2159a6b4cfeb71","tests/unicode.rs":"d0a2fec28cb28910a5ec1a51849dcf7923673a2c3bc0ffc24025f7c37667add2","tests/word_boundary.rs":"7081317ddcec1e82dd4a2090a571c6abf2ff4bbfa8cd10395e1eb3f386157fae","tests/word_boundary_ascii.rs":"cd0be5b5b485de0ba7994b42e2864585556c3d2d8bf5eab05b58931d9aaf4b87","tests/word_boundary_unicode.rs":"75dbcc35d3abc0f9795c2ea99e216dc227b0a5b58e9ca5eef767815ff0513921"},"package":"d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f"} \ No newline at end of file
diff --git a/vendor/regex/CHANGELOG.md b/vendor/regex/CHANGELOG.md
index 44274acac..e019afb2f 100644
--- a/vendor/regex/CHANGELOG.md
+++ b/vendor/regex/CHANGELOG.md
@@ -1,3 +1,192 @@
+1.8.4 (2023-06-05)
+==================
+This is a patch release that fixes a bug where `(?-u:\B)` was allowed in
+Unicode regexes, despite the fact that the current matching engines can report
+match offsets between the code units of a single UTF-8 encoded codepoint. That
+in turn means that match offsets that split a codepoint could be reported,
+which in turn results in panicking when one uses them to slice a `&str`.
+
+This bug occurred in the transition to `regex 1.8` because the underlying
+syntactical error that prevented this regex from compiling was intentionally
+removed. That's because `(?-u:\B)` will be permitted in Unicode regexes in
+`regex 1.9`, but the matching engines will guarantee to never report match
+offsets that split a codepoint. When the underlying syntactical error was
+removed, no code was added to ensure that `(?-u:\B)` didn't compile in the
+`regex 1.8` transition release. This release, `regex 1.8.4`, adds that code
+such that `Regex::new(r"(?-u:\B)")` returns to the `regex <1.8` behavior of
+not compiling. (A `bytes::Regex` can still of course compile it.)
+
+Bug fixes:
+
+* [BUG #1006](https://github.com/rust-lang/regex/issues/1006):
+Fix a bug where `(?-u:\B)` was allowed in Unicode regexes, and in turn could
+lead to match offsets that split a codepoint in `&str`.
+
+
+1.8.3 (2023-05-25)
+==================
+This is a patch release that fixes a bug where the regex would report a
+match at every position even when it shouldn't. This could occur in a very
+small subset of regexes, usually an alternation of simple literals that
+have particular properties. (See the issue linked below for a more precise
+description.)
+
+Bug fixes:
+
+* [BUG #999](https://github.com/rust-lang/regex/issues/999):
+Fix a bug where a match at every position is erroneously reported.
+
+
+1.8.2 (2023-05-22)
+==================
+This is a patch release that fixes a bug where regex compilation could panic
+in debug mode for regexes with large counted repetitions. For example,
+`a{2147483516}{2147483416}{5}` resulted in an integer overflow that wrapped
+in release mode but panicking in debug mode. Despite the unintended wrapping
+arithmetic in release mode, it didn't cause any other logical bugs since the
+errant code was for new analysis that wasn't used yet.
+
+Bug fixes:
+
+* [BUG #995](https://github.com/rust-lang/regex/issues/995):
+Fix a bug where regex compilation with large counted repetitions could panic.
+
+
+1.8.1 (2023-04-21)
+==================
+This is a patch release that fixes a bug where a regex match could be reported
+where none was found. Specifically, the bug occurs when a pattern contains some
+literal prefixes that could be extracted _and_ an optional word boundary in the
+prefix.
+
+Bug fixes:
+
+* [BUG #981](https://github.com/rust-lang/regex/issues/981):
+Fix a bug where a word boundary could interact with prefix literal
+optimizations and lead to a false positive match.
+
+
+1.8.0 (2023-04-20)
+==================
+This is a sizeable release that will be soon followed by another sizeable
+release. Both of them combined will close over 40 existing issues and PRs.
+
+This first release, despite its size, essentially represents preparatory work
+for the second release, which will be even bigger. Namely, this release:
+
+* Increases the MSRV to Rust 1.60.0, which was released about 1 year ago.
+* Upgrades its dependency on `aho-corasick` to the recently released 1.0
+version.
+* Upgrades its dependency on `regex-syntax` to the simultaneously released
+`0.7` version. The changes to `regex-syntax` principally revolve around a
+rewrite of its literal extraction code and a number of simplifications and
+optimizations to its high-level intermediate representation (HIR).
+
+The second release, which will follow ~shortly after the release above, will
+contain a soup-to-nuts rewrite of every regex engine. This will be done by
+bringing [`regex-automata`](https://github.com/BurntSushi/regex-automata) into
+this repository, and then changing the `regex` crate to be nothing but an API
+shim layer on top of `regex-automata`'s API.
+
+These tandem releases are the culmination of about 3
+years of on-and-off work that [began in earnest in March
+2020](https://github.com/rust-lang/regex/issues/656).
+
+Because of the scale of changes involved in these releases, I would love to
+hear about your experience. Especially if you notice undocumented changes in
+behavior or performance changes (positive *or* negative).
+
+Most changes in the first release are listed below. For more details, please
+see the commit log, which reflects a linear and decently documented history
+of all changes.
+
+New features:
+
+* [FEATURE #501](https://github.com/rust-lang/regex/issues/501):
+Permit many more characters to be escaped, even if they have no significance.
+More specifically, any ASCII character except for `[0-9A-Za-z<>]` can now be
+escaped. Also, a new routine, `is_escapeable_character`, has been added to
+`regex-syntax` to query whether a character is escapeable or not.
+* [FEATURE #547](https://github.com/rust-lang/regex/issues/547):
+Add `Regex::captures_at`. This fills a hole in the API, but doesn't otherwise
+introduce any new expressive power.
+* [FEATURE #595](https://github.com/rust-lang/regex/issues/595):
+Capture group names are now Unicode-aware. They can now begin with either a `_`
+or any "alphabetic" codepoint. After the first codepoint, subsequent codepoints
+can be any sequence of alpha-numeric codepoints, along with `_`, `.`, `[` and
+`]`. Note that replacement syntax has not changed.
+* [FEATURE #810](https://github.com/rust-lang/regex/issues/810):
+Add `Match::is_empty` and `Match::len` APIs.
+* [FEATURE #905](https://github.com/rust-lang/regex/issues/905):
+Add an `impl Default for RegexSet`, with the default being the empty set.
+* [FEATURE #908](https://github.com/rust-lang/regex/issues/908):
+A new method, `Regex::static_captures_len`, has been added which returns the
+number of capture groups in the pattern if and only if every possible match
+always contains the same number of matching groups.
+* [FEATURE #955](https://github.com/rust-lang/regex/issues/955):
+Named captures can now be written as `(?<name>re)` in addition to
+`(?P<name>re)`.
+* FEATURE: `regex-syntax` now supports empty character classes.
+* FEATURE: `regex-syntax` now has an optional `std` feature. (This will come
+to `regex` in the second release.)
+* FEATURE: The `Hir` type in `regex-syntax` has had a number of simplifications
+made to it.
+* FEATURE: `regex-syntax` has support for a new `R` flag for enabling CRLF
+mode. This will be supported in `regex` proper in the second release.
+* FEATURE: `regex-syntax` now has proper support for "regex that never
+matches" via `Hir::fail()`.
+* FEATURE: The `hir::literal` module of `regex-syntax` has been completely
+re-worked. It now has more documentation, examples and advice.
+* FEATURE: The `allow_invalid_utf8` option in `regex-syntax` has been renamed
+to `utf8`, and the meaning of the boolean has been flipped.
+
+Performance improvements:
+
+* PERF: The upgrade to `aho-corasick 1.0` may improve performance in some
+cases. It's difficult to characterize exactly which patterns this might impact,
+but if there are a small number of longish (>= 4 bytes) prefix literals, then
+it might be faster than before.
+
+Bug fixes:
+
+* [BUG #514](https://github.com/rust-lang/regex/issues/514):
+Improve `Debug` impl for `Match` so that it doesn't show the entire haystack.
+* BUGS [#516](https://github.com/rust-lang/regex/issues/516),
+[#731](https://github.com/rust-lang/regex/issues/731):
+Fix a number of issues with printing `Hir` values as regex patterns.
+* [BUG #610](https://github.com/rust-lang/regex/issues/610):
+Add explicit example of `foo|bar` in the regex syntax docs.
+* [BUG #625](https://github.com/rust-lang/regex/issues/625):
+Clarify that `SetMatches::len` does not (regrettably) refer to the number of
+matches in the set.
+* [BUG #660](https://github.com/rust-lang/regex/issues/660):
+Clarify "verbose mode" in regex syntax documentation.
+* BUG [#738](https://github.com/rust-lang/regex/issues/738),
+[#950](https://github.com/rust-lang/regex/issues/950):
+Fix `CaptureLocations::get` so that it never panics.
+* [BUG #747](https://github.com/rust-lang/regex/issues/747):
+Clarify documentation for `Regex::shortest_match`.
+* [BUG #835](https://github.com/rust-lang/regex/issues/835):
+Fix `\p{Sc}` so that it is equivalent to `\p{Currency_Symbol}`.
+* [BUG #846](https://github.com/rust-lang/regex/issues/846):
+Add more clarifying documentation to the `CompiledTooBig` error variant.
+* [BUG #854](https://github.com/rust-lang/regex/issues/854):
+Clarify that `regex::Regex` searches as if the haystack is a sequence of
+Unicode scalar values.
+* [BUG #884](https://github.com/rust-lang/regex/issues/884):
+Replace `__Nonexhaustive` variants with `#[non_exhaustive]` attribute.
+* [BUG #893](https://github.com/rust-lang/regex/pull/893):
+Optimize case folding since it can get quite slow in some pathological cases.
+* [BUG #895](https://github.com/rust-lang/regex/issues/895):
+Reject `(?-u:\W)` in `regex::Regex` APIs.
+* [BUG #942](https://github.com/rust-lang/regex/issues/942):
+Add a missing `void` keyword to indicate "no parameters" in C API.
+* [BUG #965](https://github.com/rust-lang/regex/issues/965):
+Fix `\p{Lc}` so that it is equivalent to `\p{Cased_Letter}`.
+* [BUG #975](https://github.com/rust-lang/regex/issues/975):
+Clarify documentation for `\pX` syntax.
+
+
1.7.3 (2023-03-24)
==================
This is a small release that fixes a bug in `Regex::shortest_match_at` that
@@ -743,7 +932,7 @@ Bug gixes:
==================
This release includes a ground-up rewrite of the regex-syntax crate, which has
been in development for over a year.
-
+731
New features:
* Error messages for invalid regexes have been greatly improved. You get these
diff --git a/vendor/regex/Cargo.lock b/vendor/regex/Cargo.lock
index f91c8879b..6cf8da756 100644
--- a/vendor/regex/Cargo.lock
+++ b/vendor/regex/Cargo.lock
@@ -4,9 +4,9 @@ version = 3
[[package]]
name = "aho-corasick"
-version = "0.7.20"
+version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
+checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04"
dependencies = [
"memchr",
]
@@ -19,9 +19,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "getrandom"
-version = "0.2.8"
+version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
+checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
dependencies = [
"cfg-if",
"libc",
@@ -36,9 +36,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
-version = "0.2.139"
+version = "0.2.142"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317"
[[package]]
name = "memchr"
@@ -75,7 +75,7 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.7.3"
+version = "1.8.4"
dependencies = [
"aho-corasick",
"lazy_static",
@@ -87,9 +87,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
-version = "0.6.29"
+version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
[[package]]
name = "wasi"
diff --git a/vendor/regex/Cargo.toml b/vendor/regex/Cargo.toml
index 37e44fb3b..b4371c4b9 100644
--- a/vendor/regex/Cargo.toml
+++ b/vendor/regex/Cargo.toml
@@ -10,9 +10,10 @@
# See Cargo.toml.orig for the original contents.
[package]
-edition = "2018"
+edition = "2021"
+rust-version = "1.60.0"
name = "regex"
-version = "1.7.3"
+version = "1.8.4"
authors = ["The Rust Project Developers"]
exclude = [
"/scripts/*",
@@ -31,13 +32,13 @@ license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-lang/regex"
[profile.bench]
-debug = true
+debug = 2
[profile.release]
-debug = true
+debug = 2
[profile.test]
-debug = true
+debug = 2
[lib]
doctest = false
@@ -80,15 +81,15 @@ name = "crates-regex"
path = "tests/test_crates_regex.rs"
[dependencies.aho-corasick]
-version = "0.7.18"
+version = "1.0.0"
optional = true
[dependencies.memchr]
-version = "2.4.0"
+version = "2.5.0"
optional = true
[dependencies.regex-syntax]
-version = "0.6.29"
+version = "0.7.2"
default-features = false
[dev-dependencies.lazy_static]
diff --git a/vendor/regex/README.md b/vendor/regex/README.md
index 861417da6..020b35395 100644
--- a/vendor/regex/README.md
+++ b/vendor/regex/README.md
@@ -9,7 +9,7 @@ by [RE2](https://github.com/google/re2).
[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex)
-[![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
+[![Rust](https://img.shields.io/badge/rust-1.60.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
### Documentation
@@ -218,7 +218,7 @@ The full set of features one can disable are
### Minimum Rust version policy
-This crate's minimum supported `rustc` version is `1.41.1`.
+This crate's minimum supported `rustc` version is `1.60.0`.
The current **tentative** policy is that the minimum Rust version required
to use this crate can be increased in minor version updates. For example, if
diff --git a/vendor/regex/record/README.md b/vendor/regex/record/README.md
new file mode 100644
index 000000000..432b06ab9
--- /dev/null
+++ b/vendor/regex/record/README.md
@@ -0,0 +1,4 @@
+This directory contains various recordings of results. These are committed to
+the repository so that they can be compared over time. (At the time of writing,
+there is no tooling for facilitating this comparison. It has to be done
+manually.)
diff --git a/vendor/regex/record/compile-test/2023-04-19_1.7.3.csv b/vendor/regex/record/compile-test/2023-04-19_1.7.3.csv
new file mode 100644
index 000000000..af62da10a
--- /dev/null
+++ b/vendor/regex/record/compile-test/2023-04-19_1.7.3.csv
@@ -0,0 +1,11 @@
+name,crate,revision,profile,duration,size,relative-size
+regex__dev__std_perf_unicode,regex,9582040009,dev,1.824209152s,3434992,3113064
+regex__dev__std,regex,9582040009,dev,1.206314935s,1362392,1040464
+regex__dev__std_perf,regex,9582040009,dev,1.543583435s,2726384,2404456
+regex__dev__std_unicode,regex,9582040009,dev,1.490095643s,2066904,1744976
+regex__dev__std_unicode-case_unicode-perl,regex,9582040009,dev,1.292011694s,1812952,1491024
+regex__release__std_perf_unicode,regex,9582040009,release,2.398133563s,1616216,1294368
+regex__release__std,regex,9582040009,release,1.413680252s,694592,372744
+regex__release__std_perf,regex,9582040009,release,2.341496191s,1124696,802848
+regex__release__std_unicode,regex,9582040009,release,1.671407822s,1190208,868360
+regex__release__std_unicode-case_unicode-perl,regex,9582040009,release,1.441712198s,932160,610312
diff --git a/vendor/regex/record/compile-test/2023-04-20_master.csv b/vendor/regex/record/compile-test/2023-04-20_master.csv
new file mode 100644
index 000000000..4c3e91674
--- /dev/null
+++ b/vendor/regex/record/compile-test/2023-04-20_master.csv
@@ -0,0 +1,11 @@
+name,crate,revision,profile,duration,size,relative-size
+regex__dev__std_perf_unicode,regex,f1f99af2bc,dev,1.834267609s,3799536,3477608
+regex__dev__std,regex,f1f99af2bc,dev,1.263958602s,1427928,1106000
+regex__dev__std_perf,regex,f1f99af2bc,dev,1.631302845s,3234288,2912360
+regex__dev__std_unicode,regex,f1f99af2bc,dev,1.550536696s,1997272,1675344
+regex__dev__std_unicode-case_unicode-perl,regex,f1f99af2bc,dev,1.341622852s,1739224,1417296
+regex__release__std_perf_unicode,regex,f1f99af2bc,release,2.475080323s,1755480,1433632
+regex__release__std,regex,f1f99af2bc,release,1.45990031s,731456,409608
+regex__release__std_perf,regex,f1f99af2bc,release,2.421787211s,1259864,938016
+regex__release__std_unicode,regex,f1f99af2bc,release,1.693972619s,1227072,905224
+regex__release__std_unicode-case_unicode-perl,regex,f1f99af2bc,release,1.528003306s,969024,647176
diff --git a/vendor/regex/record/compile-test/README.md b/vendor/regex/record/compile-test/README.md
new file mode 100644
index 000000000..7291d5d37
--- /dev/null
+++ b/vendor/regex/record/compile-test/README.md
@@ -0,0 +1,27 @@
+This directory contains the results of compilation tests. Specifically,
+the results are from testing both the from scratch compilation time and
+relative binary size increases of various features for both the `regex` and
+`regex-automata` crates.
+
+Here's an example of how to run these tests for just the `regex` crate. You'll
+need the `regex-cli` command installed, which can be found in the `regex-cli`
+directory in the root of this repository.
+
+This must be run in the root of a checkout of this repository.
+
+```
+$ mkdir /tmp/regex-compile-test
+$ regex-cli compile-test ./ /tmp/regex-compile-test | tee record/compile-test/2023-04-19_1.7.3.csv
+```
+
+You can then look at the results using a tool like [`xsv`][xsv]:
+
+```
+$ xsv table record/compile-test/2023-04-19_1.7.3.csv
+```
+
+Note that the relative binary size is computed by building a "baseline" hello
+world program, and then subtracting that from the size of a binary that uses
+the regex crate.
+
+[xsv]: https://github.com/BurntSushi/xsv
diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs
index 90ca25015..23e63ec89 100644
--- a/vendor/regex/src/compile.rs
+++ b/vendor/regex/src/compile.rs
@@ -4,7 +4,7 @@ use std::iter;
use std::result;
use std::sync::Arc;
-use regex_syntax::hir::{self, Hir};
+use regex_syntax::hir::{self, Hir, Look};
use regex_syntax::is_word_byte;
use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
@@ -137,13 +137,24 @@ impl Compiler {
}
fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
+ if self.compiled.only_utf8
+ && expr.properties().look_set().contains(Look::WordAsciiNegate)
+ {
+ return Err(Error::Syntax(
+ "ASCII-only \\B is not allowed in Unicode regexes \
+ because it may result in invalid UTF-8 matches"
+ .to_string(),
+ ));
+ }
// If we're compiling a forward DFA and we aren't anchored, then
// add a `.*?` before the first capture group.
// Other matching engines handle this by baking the logic into the
// matching engine itself.
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
- self.compiled.is_anchored_start = expr.is_anchored_start();
- self.compiled.is_anchored_end = expr.is_anchored_end();
+ self.compiled.is_anchored_start =
+ expr.properties().look_set_prefix().contains(Look::Start);
+ self.compiled.is_anchored_end =
+ expr.properties().look_set_suffix().contains(Look::End);
if self.compiled.needs_dotstar() {
dotstar_patch = self.c_dotstar()?;
self.compiled.start = dotstar_patch.entry;
@@ -159,6 +170,8 @@ impl Compiler {
self.fill_to_next(patch.hole);
self.compiled.matches = vec![self.insts.len()];
self.push_compiled(Inst::Match(0));
+ self.compiled.static_captures_len =
+ expr.properties().static_explicit_captures_len();
self.compile_finish()
}
@@ -168,10 +181,12 @@ impl Compiler {
) -> result::Result<Program, Error> {
debug_assert!(exprs.len() > 1);
- self.compiled.is_anchored_start =
- exprs.iter().all(|e| e.is_anchored_start());
- self.compiled.is_anchored_end =
- exprs.iter().all(|e| e.is_anchored_end());
+ self.compiled.is_anchored_start = exprs
+ .iter()
+ .all(|e| e.properties().look_set_prefix().contains(Look::Start));
+ self.compiled.is_anchored_end = exprs
+ .iter()
+ .all(|e| e.properties().look_set_suffix().contains(Look::End));
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
if self.compiled.needs_dotstar() {
dotstar_patch = self.c_dotstar()?;
@@ -272,17 +287,21 @@ impl Compiler {
self.check_size()?;
match *expr.kind() {
Empty => self.c_empty(),
- Literal(hir::Literal::Unicode(c)) => self.c_char(c),
- Literal(hir::Literal::Byte(b)) => {
- assert!(self.compiled.uses_bytes());
- self.c_byte(b)
+ Literal(hir::Literal(ref bytes)) => {
+ if self.compiled.is_reverse {
+ let mut bytes = bytes.to_vec();
+ bytes.reverse();
+ self.c_literal(&bytes)
+ } else {
+ self.c_literal(bytes)
+ }
}
Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
Class(hir::Class::Bytes(ref cls)) => {
if self.compiled.uses_bytes() {
self.c_class_bytes(cls.ranges())
} else {
- assert!(cls.is_all_ascii());
+ assert!(cls.is_ascii());
let mut char_ranges = vec![];
for r in cls.iter() {
let (s, e) = (r.start() as char, r.end() as char);
@@ -291,92 +310,94 @@ impl Compiler {
self.c_class(&char_ranges)
}
}
- Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::EndLine)
- }
- Anchor(hir::Anchor::StartLine) => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::StartLine)
- }
- Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::StartLine)
- }
- Anchor(hir::Anchor::EndLine) => {
- self.byte_classes.set_range(b'\n', b'\n');
- self.c_empty_look(prog::EmptyLook::EndLine)
- }
- Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
- self.c_empty_look(prog::EmptyLook::EndText)
- }
- Anchor(hir::Anchor::StartText) => {
- self.c_empty_look(prog::EmptyLook::StartText)
- }
- Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
- self.c_empty_look(prog::EmptyLook::StartText)
- }
- Anchor(hir::Anchor::EndText) => {
- self.c_empty_look(prog::EmptyLook::EndText)
- }
- WordBoundary(hir::WordBoundary::Unicode) => {
- if !cfg!(feature = "unicode-perl") {
- return Err(Error::Syntax(
- "Unicode word boundaries are unavailable when \
- the unicode-perl feature is disabled"
- .to_string(),
- ));
+ Look(ref look) => match *look {
+ hir::Look::Start if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::EndText)
}
- self.compiled.has_unicode_word_boundary = true;
- self.byte_classes.set_word_boundary();
- // We also make sure that all ASCII bytes are in a different
- // class from non-ASCII bytes. Otherwise, it's possible for
- // ASCII bytes to get lumped into the same class as non-ASCII
- // bytes. This in turn may cause the lazy DFA to falsely start
- // when it sees an ASCII byte that maps to a byte class with
- // non-ASCII bytes. This ensures that never happens.
- self.byte_classes.set_range(0, 0x7F);
- self.c_empty_look(prog::EmptyLook::WordBoundary)
- }
- WordBoundary(hir::WordBoundary::UnicodeNegate) => {
- if !cfg!(feature = "unicode-perl") {
+ hir::Look::Start => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ hir::Look::End if self.compiled.is_reverse => {
+ self.c_empty_look(prog::EmptyLook::StartText)
+ }
+ hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText),
+ hir::Look::StartLF if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ hir::Look::StartLF => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ hir::Look::EndLF if self.compiled.is_reverse => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::StartLine)
+ }
+ hir::Look::EndLF => {
+ self.byte_classes.set_range(b'\n', b'\n');
+ self.c_empty_look(prog::EmptyLook::EndLine)
+ }
+ hir::Look::StartCRLF | hir::Look::EndCRLF => {
return Err(Error::Syntax(
- "Unicode word boundaries are unavailable when \
- the unicode-perl feature is disabled"
+ "CRLF-aware line anchors are not supported yet"
.to_string(),
));
}
- self.compiled.has_unicode_word_boundary = true;
- self.byte_classes.set_word_boundary();
- // See comments above for why we set the ASCII range here.
- self.byte_classes.set_range(0, 0x7F);
- self.c_empty_look(prog::EmptyLook::NotWordBoundary)
- }
- WordBoundary(hir::WordBoundary::Ascii) => {
- self.byte_classes.set_word_boundary();
- self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
- }
- WordBoundary(hir::WordBoundary::AsciiNegate) => {
- self.byte_classes.set_word_boundary();
- self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
- }
- Group(ref g) => match g.kind {
- hir::GroupKind::NonCapturing => self.c(&g.hir),
- hir::GroupKind::CaptureIndex(index) => {
- if index as usize >= self.compiled.captures.len() {
- self.compiled.captures.push(None);
+ hir::Look::WordAscii => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
+ }
+ hir::Look::WordAsciiNegate => {
+ self.byte_classes.set_word_boundary();
+ self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
+ }
+ hir::Look::WordUnicode => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
}
- self.c_capture(2 * index as usize, &g.hir)
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // We also make sure that all ASCII bytes are in a different
+ // class from non-ASCII bytes. Otherwise, it's possible for
+ // ASCII bytes to get lumped into the same class as non-ASCII
+ // bytes. This in turn may cause the lazy DFA to falsely start
+ // when it sees an ASCII byte that maps to a byte class with
+ // non-ASCII bytes. This ensures that never happens.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::WordBoundary)
}
- hir::GroupKind::CaptureName { index, ref name } => {
- if index as usize >= self.compiled.captures.len() {
- let n = name.to_string();
- self.compiled.captures.push(Some(n.clone()));
- self.capture_name_idx.insert(n, index as usize);
+ hir::Look::WordUnicodeNegate => {
+ if !cfg!(feature = "unicode-perl") {
+ return Err(Error::Syntax(
+ "Unicode word boundaries are unavailable when \
+ the unicode-perl feature is disabled"
+ .to_string(),
+ ));
}
- self.c_capture(2 * index as usize, &g.hir)
+ self.compiled.has_unicode_word_boundary = true;
+ self.byte_classes.set_word_boundary();
+ // See comments above for why we set the ASCII range here.
+ self.byte_classes.set_range(0, 0x7F);
+ self.c_empty_look(prog::EmptyLook::NotWordBoundary)
}
},
+ Capture(hir::Capture { index, ref name, ref sub }) => {
+ if index as usize >= self.compiled.captures.len() {
+ let name = match *name {
+ None => None,
+ Some(ref boxed_str) => Some(boxed_str.to_string()),
+ };
+ self.compiled.captures.push(name.clone());
+ if let Some(name) = name {
+ self.capture_name_idx.insert(name, index as usize);
+ }
+ }
+ self.c_capture(2 * index as usize, sub)
+ }
Concat(ref es) => {
if self.compiled.is_reverse {
self.c_concat(es.iter().rev())
@@ -420,21 +441,19 @@ impl Compiler {
}
fn c_dotstar(&mut self) -> Result {
- Ok(if !self.compiled.only_utf8() {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- greedy: false,
- hir: Box::new(Hir::any(true)),
- }))?
- .unwrap()
+ let hir = if self.compiled.only_utf8() {
+ Hir::dot(hir::Dot::AnyChar)
} else {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
+ Hir::dot(hir::Dot::AnyByte)
+ };
+ Ok(self
+ .c(&Hir::repetition(hir::Repetition {
+ min: 0,
+ max: None,
greedy: false,
- hir: Box::new(Hir::any(false)),
+ sub: Box::new(hir),
}))?
- .unwrap()
- })
+ .unwrap())
}
fn c_char(&mut self, c: char) -> ResultOrEmpty {
@@ -457,7 +476,11 @@ impl Compiler {
fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
use std::mem::size_of;
- assert!(!ranges.is_empty());
+ if ranges.is_empty() {
+ return Err(Error::Syntax(
+ "empty character classes are not allowed".to_string(),
+ ));
+ }
if self.compiled.uses_bytes() {
Ok(Some(CompileClass { c: self, ranges }.compile()?))
} else {
@@ -482,7 +505,11 @@ impl Compiler {
&mut self,
ranges: &[hir::ClassBytesRange],
) -> ResultOrEmpty {
- debug_assert!(!ranges.is_empty());
+ if ranges.is_empty() {
+ return Err(Error::Syntax(
+ "empty character classes are not allowed".to_string(),
+ ));
+ }
let first_split_entry = self.insts.len();
let mut holes = vec![];
@@ -513,6 +540,52 @@ impl Compiler {
Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
}
+ fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty {
+ match core::str::from_utf8(bytes) {
+ Ok(string) => {
+ let mut it = string.chars();
+ let Patch { mut hole, entry } = loop {
+ match it.next() {
+ None => return self.c_empty(),
+ Some(ch) => {
+ if let Some(p) = self.c_char(ch)? {
+ break p;
+ }
+ }
+ }
+ };
+ for ch in it {
+ if let Some(p) = self.c_char(ch)? {
+ self.fill(hole, p.entry);
+ hole = p.hole;
+ }
+ }
+ Ok(Some(Patch { hole, entry }))
+ }
+ Err(_) => {
+ assert!(self.compiled.uses_bytes());
+ let mut it = bytes.iter().copied();
+ let Patch { mut hole, entry } = loop {
+ match it.next() {
+ None => return self.c_empty(),
+ Some(byte) => {
+ if let Some(p) = self.c_byte(byte)? {
+ break p;
+ }
+ }
+ }
+ };
+ for byte in it {
+ if let Some(p) = self.c_byte(byte)? {
+ self.fill(hole, p.entry);
+ hole = p.hole;
+ }
+ }
+ Ok(Some(Patch { hole, entry }))
+ }
+ }
+ }
+
fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
where
I: IntoIterator<Item = &'a Hir>,
@@ -587,19 +660,15 @@ impl Compiler {
}
fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
- use regex_syntax::hir::RepetitionKind::*;
- match rep.kind {
- ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
- ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
- OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
- Range(hir::RepetitionRange::Exactly(min_max)) => {
- self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
- }
- Range(hir::RepetitionRange::AtLeast(min)) => {
- self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
+ match (rep.min, rep.max) {
+ (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
+ (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
+ (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
+ (min, None) => {
+ self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
}
- Range(hir::RepetitionRange::Bounded(min, max)) => {
- self.c_repeat_range(&rep.hir, rep.greedy, min, max)
+ (min, Some(max)) => {
+ self.c_repeat_range(&rep.sub, rep.greedy, min, max)
}
}
}
diff --git a/vendor/regex/src/dfa.rs b/vendor/regex/src/dfa.rs
index dc9952120..78ed71021 100644
--- a/vendor/regex/src/dfa.rs
+++ b/vendor/regex/src/dfa.rs
@@ -1576,7 +1576,7 @@ impl<'a> Fsm<'a> {
/// inputs, a new state could be created for every byte of input. (This is
/// bad for memory use, so we bound it with a cache.)
fn approximate_size(&self) -> usize {
- self.cache.size + self.prog.approximate_size()
+ self.cache.size
}
}
diff --git a/vendor/regex/src/error.rs b/vendor/regex/src/error.rs
index 3e0ec7521..6c341f604 100644
--- a/vendor/regex/src/error.rs
+++ b/vendor/regex/src/error.rs
@@ -6,8 +6,26 @@ use std::iter::repeat;
pub enum Error {
/// A syntax error.
Syntax(String),
- /// The compiled program exceeded the set size limit.
- /// The argument is the size limit imposed.
+ /// The compiled program exceeded the set size
+ /// limit. The argument is the size limit imposed by
+ /// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit). Even
+ /// when not configured explicitly, it defaults to a reasonable limit.
+ ///
+ /// If you're getting this error, it occurred because your regex has been
+ /// compiled to an intermediate state that is too big. It is important to
+ /// note that exceeding this limit does _not_ mean the regex is too big to
+ /// _work_, but rather, the regex is big enough that it may wind up being
+ /// surprisingly slow when used in a search. In other words, this error is
+ /// meant to be a practical heuristic for avoiding a performance footgun,
+ /// and especially so for the case where the regex pattern is coming from
+ /// an untrusted source.
+ ///
+ /// There are generally two ways to move forward if you hit this error.
+ /// The first is to find some way to use a smaller regex. The second is to
+ /// increase the size limit via `RegexBuilder::size_limit`. However, if
+ /// your regex pattern is not from a trusted source, then neither of these
+ /// approaches may be appropriate. Instead, you'll have to determine just
+ /// how big of a regex you want to allow.
CompiledTooBig(usize),
/// Hints that destructuring should not be exhaustive.
///
diff --git a/vendor/regex/src/exec.rs b/vendor/regex/src/exec.rs
index b9abcdc04..ee8b589d2 100644
--- a/vendor/regex/src/exec.rs
+++ b/vendor/regex/src/exec.rs
@@ -4,9 +4,9 @@ use std::panic::AssertUnwindSafe;
use std::sync::Arc;
#[cfg(feature = "perf-literal")]
-use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
-use regex_syntax::hir::literal::Literals;
-use regex_syntax::hir::Hir;
+use aho_corasick::{AhoCorasick, MatchKind};
+use regex_syntax::hir::literal;
+use regex_syntax::hir::{Hir, Look};
use regex_syntax::ParserBuilder;
use crate::backtrack;
@@ -78,15 +78,18 @@ struct ExecReadOnly {
/// not supported.) Note that this program contains an embedded `.*?`
/// preceding the first capture group, unless the regex is anchored at the
/// beginning.
+ #[allow(dead_code)]
dfa: Program,
/// The same as above, except the program is reversed (and there is no
/// preceding `.*?`). This is used by the DFA to find the starting location
/// of matches.
+ #[allow(dead_code)]
dfa_reverse: Program,
/// A set of suffix literals extracted from the regex.
///
/// Prefix literals are stored on the `Program`, since they are used inside
/// the matching engines.
+ #[allow(dead_code)]
suffixes: LiteralSearcher,
/// An Aho-Corasick automaton with leftmost-first match semantics.
///
@@ -98,7 +101,7 @@ struct ExecReadOnly {
/// if we were to exhaust the ID space, we probably would have long
/// surpassed the compilation size limit.
#[cfg(feature = "perf-literal")]
- ac: Option<AhoCorasick<u32>>,
+ ac: Option<AhoCorasick>,
/// match_type encodes as much upfront knowledge about how we're going to
/// execute a search as possible.
match_type: MatchType,
@@ -121,8 +124,8 @@ pub struct ExecBuilder {
/// literals.
struct Parsed {
exprs: Vec<Hir>,
- prefixes: Literals,
- suffixes: Literals,
+ prefixes: literal::Seq,
+ suffixes: literal::Seq,
bytes: bool,
}
@@ -228,8 +231,8 @@ impl ExecBuilder {
/// Parse the current set of patterns into their AST and extract literals.
fn parse(&self) -> Result<Parsed, Error> {
let mut exprs = Vec::with_capacity(self.options.pats.len());
- let mut prefixes = Some(Literals::empty());
- let mut suffixes = Some(Literals::empty());
+ let mut prefixes = Some(literal::Seq::empty());
+ let mut suffixes = Some(literal::Seq::empty());
let mut bytes = false;
let is_set = self.options.pats.len() > 1;
// If we're compiling a regex set and that set has any anchored
@@ -243,54 +246,103 @@ impl ExecBuilder {
.swap_greed(self.options.swap_greed)
.ignore_whitespace(self.options.ignore_whitespace)
.unicode(self.options.unicode)
- .allow_invalid_utf8(!self.only_utf8)
+ .utf8(self.only_utf8)
.nest_limit(self.options.nest_limit)
.build();
let expr =
parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
- bytes = bytes || !expr.is_always_utf8();
+ let props = expr.properties();
+ // This used to just check whether the HIR matched valid UTF-8
+ // or not, but in regex-syntax 0.7, we changed our definition of
+ // "matches valid UTF-8" to exclude zero-width matches. And in
+ // particular, previously, we considered WordAsciiNegate (that
+ // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our
+ // matcher engines were built under this assumption and fixing
+ // them is not worth it with the imminent plan to switch over to
+ // regex-automata. So for now, we retain the previous behavior by
+ // just explicitly treating the presence of a negated ASCII word
+ // boundary as forcing use to use a byte oriented automaton.
+ bytes = bytes
+ || !props.is_utf8()
+ || props.look_set().contains(Look::WordAsciiNegate);
if cfg!(feature = "perf-literal") {
- if !expr.is_anchored_start() && expr.is_any_anchored_start() {
+ if !props.look_set_prefix().contains(Look::Start)
+ && props.look_set().contains(Look::Start)
+ {
// Partial anchors unfortunately make it hard to use
// prefixes, so disable them.
prefixes = None;
- } else if is_set && expr.is_anchored_start() {
+ } else if is_set
+ && props.look_set_prefix_any().contains(Look::Start)
+ {
// Regex sets with anchors do not go well with literal
// optimizations.
prefixes = None;
+ } else if props.look_set_prefix_any().contains_word() {
+ // The new literal extractor ignores look-around while
+ // the old one refused to extract prefixes from regexes
+ // that began with a \b. These old creaky regex internals
+ // can't deal with it, so we drop it.
+ prefixes = None;
+ } else if props.look_set_prefix_any().contains(Look::StartLF) {
+ // Similar to the reasoning for word boundaries, this old
+ // regex engine can't handle literal prefixes with '(?m:^)'
+ // at the beginning of a regex.
+ prefixes = None;
}
- prefixes = prefixes.and_then(|mut prefixes| {
- if !prefixes.union_prefixes(&expr) {
- None
- } else {
- Some(prefixes)
- }
- });
- if !expr.is_anchored_end() && expr.is_any_anchored_end() {
+ if !props.look_set_suffix().contains(Look::End)
+ && props.look_set().contains(Look::End)
+ {
// Partial anchors unfortunately make it hard to use
// suffixes, so disable them.
suffixes = None;
- } else if is_set && expr.is_anchored_end() {
+ } else if is_set
+ && props.look_set_suffix_any().contains(Look::End)
+ {
// Regex sets with anchors do not go well with literal
// optimizations.
suffixes = None;
+ } else if props.look_set_suffix_any().contains_word() {
+ // See the prefix case for reasoning here.
+ suffixes = None;
+ } else if props.look_set_suffix_any().contains(Look::EndLF) {
+ // See the prefix case for reasoning here.
+ suffixes = None;
}
- suffixes = suffixes.and_then(|mut suffixes| {
- if !suffixes.union_suffixes(&expr) {
- None
+
+ let (mut pres, mut suffs) =
+ if prefixes.is_none() && suffixes.is_none() {
+ (literal::Seq::infinite(), literal::Seq::infinite())
} else {
- Some(suffixes)
- }
+ literal_analysis(&expr)
+ };
+ // These old creaky regex internals can't handle cases where
+ // the literal sequences are exact but there are look-around
+ // assertions. So we make sure the sequences are inexact if
+ // there are look-around assertions anywhere. This forces the
+ // regex engines to run instead of assuming that a literal
+ // match implies an overall match.
+ if !props.look_set().is_empty() {
+ pres.make_inexact();
+ suffs.make_inexact();
+ }
+ prefixes = prefixes.and_then(|mut prefixes| {
+ prefixes.union(&mut pres);
+ Some(prefixes)
+ });
+ suffixes = suffixes.and_then(|mut suffixes| {
+ suffixes.union(&mut suffs);
+ Some(suffixes)
});
}
exprs.push(expr);
}
Ok(Parsed {
exprs,
- prefixes: prefixes.unwrap_or_else(Literals::empty),
- suffixes: suffixes.unwrap_or_else(Literals::empty),
+ prefixes: prefixes.unwrap_or_else(literal::Seq::empty),
+ suffixes: suffixes.unwrap_or_else(literal::Seq::empty),
bytes,
})
}
@@ -356,7 +408,7 @@ impl ExecBuilder {
}
#[cfg(feature = "perf-literal")]
- fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
+ fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> {
if parsed.exprs.len() != 1 {
return None;
}
@@ -370,10 +422,9 @@ impl ExecBuilder {
return None;
}
Some(
- AhoCorasickBuilder::new()
+ AhoCorasick::builder()
.match_kind(MatchKind::LeftmostFirst)
- .auto_configure(&lits)
- .build_with_size::<u32, _, _>(&lits)
+ .build(&lits)
// This should never happen because we'd long exceed the
// compilation limit for regexes first.
.expect("AC automaton too big"),
@@ -1311,6 +1362,12 @@ impl Exec {
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
&self.ro.nfa.capture_name_idx
}
+
+ /// If the number of capture groups in every match is always the same, then
+ /// return that number. Otherwise return `None`.
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.ro.nfa.static_captures_len
+ }
}
impl Clone for Exec {
@@ -1382,7 +1439,18 @@ impl ExecReadOnly {
// This case shouldn't happen. When the regex isn't
// anchored, then complete prefixes should imply complete
// suffixes.
- Some(MatchType::Literal(MatchLiteralType::Unanchored))
+ //
+ // The above is wrong! This case can happen. While
+ // complete prefixes should imply complete suffixes
+ // here, that doesn't necessarily mean we have a useful
+ // prefix matcher! It could be the case that the literal
+ // searcher decided the prefixes---even though they are
+ // "complete"---weren't good enough and thus created an
+ // empty matcher. If that happens and we return Unanchored
+ // here, then we'll end up using that matcher, which is
+ // very bad because it matches at every position. So...
+ // return None.
+ None
};
}
None
@@ -1557,7 +1625,7 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
// optimization pipeline, because this is a terribly inflexible way to go
// about things.
- if !expr.is_alternation_literal() {
+ if !expr.properties().is_alternation_literal() {
return None;
}
let alts = match *expr.kind() {
@@ -1565,25 +1633,19 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
_ => return None, // one literal isn't worth it
};
- let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
- Literal::Unicode(c) => {
- let mut buf = [0; 4];
- dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
- }
- Literal::Byte(b) => {
- dst.push(b);
- }
- };
-
let mut lits = vec![];
for alt in alts {
let mut lit = vec![];
match *alt.kind() {
- HirKind::Literal(ref x) => extendlit(x, &mut lit),
+ HirKind::Literal(Literal(ref bytes)) => {
+ lit.extend_from_slice(bytes)
+ }
HirKind::Concat(ref exprs) => {
for e in exprs {
match *e.kind() {
- HirKind::Literal(ref x) => extendlit(x, &mut lit),
+ HirKind::Literal(Literal(ref bytes)) => {
+ lit.extend_from_slice(bytes);
+ }
_ => unreachable!("expected literal, got {:?}", e),
}
}
@@ -1595,6 +1657,48 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
Some(lits)
}
+#[cfg(not(feature = "perf-literal"))]
+fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) {
+ (literal::Seq::infinite(), literal::Seq::infinite())
+}
+
+#[cfg(feature = "perf-literal")]
+fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) {
+ const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)];
+
+ let mut prefixes = literal::Extractor::new()
+ .kind(literal::ExtractKind::Prefix)
+ .extract(expr);
+ for (keep, limit) in ATTEMPTS {
+ let len = match prefixes.len() {
+ None => break,
+ Some(len) => len,
+ };
+ if len <= limit {
+ break;
+ }
+ prefixes.keep_first_bytes(keep);
+ prefixes.minimize_by_preference();
+ }
+
+ let mut suffixes = literal::Extractor::new()
+ .kind(literal::ExtractKind::Suffix)
+ .extract(expr);
+ for (keep, limit) in ATTEMPTS {
+ let len = match suffixes.len() {
+ None => break,
+ Some(len) => len,
+ };
+ if len <= limit {
+ break;
+ }
+ suffixes.keep_last_bytes(keep);
+ suffixes.minimize_by_preference();
+ }
+
+ (prefixes, suffixes)
+}
+
#[cfg(test)]
mod test {
#[test]
diff --git a/vendor/regex/src/expand.rs b/vendor/regex/src/expand.rs
index 67b514926..98fafc949 100644
--- a/vendor/regex/src/expand.rs
+++ b/vendor/regex/src/expand.rs
@@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
})
}
-/// Returns true if and only if the given byte is allowed in a capture name.
+/// Returns true if and only if the given byte is allowed in a capture name
+/// written in non-brace form.
fn is_valid_cap_letter(b: u8) -> bool {
match b {
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
@@ -236,4 +237,11 @@ mod tests {
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
find!(find_cap_ref18, "${#}", c!("#", 4));
find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
+ find!(find_cap_ref20, "${¾}", c!("¾", 5));
+ find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
+ find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
+ find!(find_cap_ref23, "${☃}", c!("☃", 6));
+ find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
+ find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
+ find!(find_cap_ref26, "${名字}", c!("名字", 9));
}
diff --git a/vendor/regex/src/lib.rs b/vendor/regex/src/lib.rs
index 6b95739c5..82c1b77ad 100644
--- a/vendor/regex/src/lib.rs
+++ b/vendor/regex/src/lib.rs
@@ -199,6 +199,8 @@ instead.)
This implementation executes regular expressions **only** on valid UTF-8
while exposing match locations as byte indices into the search string. (To
relax this restriction, use the [`bytes`](bytes/index.html) sub-module.)
+Conceptually, the regex engine works by matching a haystack as if it were a
+sequence of Unicode scalar values.
Only simple case folding is supported. Namely, when matching
case-insensitively, the characters are first mapped using the "simple" case
@@ -285,9 +287,9 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax).
. any character except new line (includes new line with s flag)
\d digit (\p{Nd})
\D not digit
-\pN One-letter name Unicode character class
+\pX Unicode character class identified by a one-letter name
\p{Greek} Unicode character class (general category or script)
-\PN Negated one-letter name Unicode character class
+\PX Negated Unicode character class identified by a one-letter name
\P{Greek} negated Unicode character class (general category or script)
</pre>
@@ -325,6 +327,25 @@ xy concatenation (x followed by y)
x|y alternation (x or y, prefer x)
</pre>
+This example shows how an alternation works, and what it means to prefer a
+branch in the alternation over subsequent branches.
+
+```
+use regex::Regex;
+
+let haystack = "samwise";
+// If 'samwise' comes first in our alternation, then it is
+// preferred as a match, even if the regex engine could
+// technically detect that 'sam' led to a match earlier.
+let re = Regex::new(r"samwise|sam").unwrap();
+assert_eq!("samwise", re.find(haystack).unwrap().as_str());
+// But if 'sam' comes first, then it will match instead.
+// In this case, it is impossible for 'samwise' to match
+// because 'sam' is a prefix of it.
+let re = Regex::new(r"sam|samwise").unwrap();
+assert_eq!("sam", re.find(haystack).unwrap().as_str());
+```
+
## Repetitions
<pre class="rust">
@@ -360,12 +381,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
<pre class="rust">
(exp) numbered capture group (indexed by opening parenthesis)
-(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?P&lt;name&gt;exp) named (also numbered) capture group (names must be alpha-numeric)
+(?&lt;name&gt;exp) named (also numbered) capture group (names must be alpha-numeric)
(?:exp) non-capturing group
(?flags) set flags within current group
(?flags:exp) set flags for exp (non-capturing)
</pre>
+Capture group names must be any sequence of alpha-numeric Unicode codepoints,
+in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or
+an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic`
+Unicode property, while numeric codepoints correspond to the union of the
+`Decimal_Number`, `Letter_Number` and `Other_Number` general categories.
+
Flags are each a single character. For example, `(?x)` sets the flag `x`
and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at
the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets
@@ -379,9 +407,13 @@ m multi-line mode: ^ and $ match begin/end of line
s allow . to match \n
U swap the meaning of x* and x*?
u Unicode support (enabled by default)
-x ignore whitespace and allow line comments (starting with `#`)
+x   verbose mode, ignores whitespace and allows line comments (starting with `#`)
</pre>
+Note that in verbose mode, whitespace is ignored everywhere, including within
+character classes. To insert whitespace, use its escaped form or a hex literal.
+For example, `\ ` or `\x20` for an ASCII space.
+
Flags can be toggled within a pattern. Here's an example that matches
case-insensitively for the first part but case-sensitively for the second part:
diff --git a/vendor/regex/src/literal/imp.rs b/vendor/regex/src/literal/imp.rs
index 90b2f1160..75fa6e37b 100644
--- a/vendor/regex/src/literal/imp.rs
+++ b/vendor/regex/src/literal/imp.rs
@@ -1,8 +1,8 @@
use std::mem;
-use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
+use aho_corasick::{self, packed, AhoCorasick};
use memchr::{memchr, memchr2, memchr3, memmem};
-use regex_syntax::hir::literal::{Literal, Literals};
+use regex_syntax::hir::literal::{Literal, Seq};
/// A prefix extracted from a compiled regular expression.
///
@@ -26,7 +26,7 @@ enum Matcher {
/// A single substring, using vector accelerated routines when available.
Memmem(Memmem),
/// An Aho-Corasick automaton.
- AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
+ AC { ac: AhoCorasick, lits: Vec<Literal> },
/// A packed multiple substring searcher, using SIMD.
///
/// Note that Aho-Corasick will actually use this packed searcher
@@ -39,27 +39,26 @@ enum Matcher {
impl LiteralSearcher {
/// Returns a matcher that never matches and never advances the input.
pub fn empty() -> Self {
- Self::new(Literals::empty(), Matcher::Empty)
+ Self::new(Seq::infinite(), Matcher::Empty)
}
/// Returns a matcher for literal prefixes from the given set.
- pub fn prefixes(lits: Literals) -> Self {
+ pub fn prefixes(lits: Seq) -> Self {
let matcher = Matcher::prefixes(&lits);
Self::new(lits, matcher)
}
/// Returns a matcher for literal suffixes from the given set.
- pub fn suffixes(lits: Literals) -> Self {
+ pub fn suffixes(lits: Seq) -> Self {
let matcher = Matcher::suffixes(&lits);
Self::new(lits, matcher)
}
- fn new(lits: Literals, matcher: Matcher) -> Self {
- let complete = lits.all_complete();
+ fn new(lits: Seq, matcher: Matcher) -> Self {
LiteralSearcher {
- complete,
- lcp: Memmem::new(lits.longest_common_prefix()),
- lcs: Memmem::new(lits.longest_common_suffix()),
+ complete: lits.is_exact(),
+ lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")),
+ lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")),
matcher,
}
}
@@ -150,7 +149,7 @@ impl LiteralSearcher {
Empty => 0,
Bytes(ref sset) => sset.dense.len(),
Memmem(_) => 1,
- AC { ref ac, .. } => ac.pattern_count(),
+ AC { ref ac, .. } => ac.patterns_len(),
Packed { ref lits, .. } => lits.len(),
}
}
@@ -162,27 +161,31 @@ impl LiteralSearcher {
Empty => 0,
Bytes(ref sset) => sset.approximate_size(),
Memmem(ref single) => single.approximate_size(),
- AC { ref ac, .. } => ac.heap_bytes(),
- Packed { ref s, .. } => s.heap_bytes(),
+ AC { ref ac, .. } => ac.memory_usage(),
+ Packed { ref s, .. } => s.memory_usage(),
}
}
}
impl Matcher {
- fn prefixes(lits: &Literals) -> Self {
+ fn prefixes(lits: &Seq) -> Self {
let sset = SingleByteSet::prefixes(lits);
Matcher::new(lits, sset)
}
- fn suffixes(lits: &Literals) -> Self {
+ fn suffixes(lits: &Seq) -> Self {
let sset = SingleByteSet::suffixes(lits);
Matcher::new(lits, sset)
}
- fn new(lits: &Literals, sset: SingleByteSet) -> Self {
- if lits.literals().is_empty() {
+ fn new(lits: &Seq, sset: SingleByteSet) -> Self {
+ if lits.is_empty() || lits.min_literal_len() == Some(0) {
return Matcher::Empty;
}
+ let lits = match lits.literals() {
+ None => return Matcher::Empty,
+ Some(members) => members,
+ };
if sset.dense.len() >= 26 {
// Avoid trying to match a large number of single bytes.
// This is *very* sensitive to a frequency analysis comparison
@@ -195,26 +198,26 @@ impl Matcher {
if sset.complete {
return Matcher::Bytes(sset);
}
- if lits.literals().len() == 1 {
- return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
+ if lits.len() == 1 {
+ return Matcher::Memmem(Memmem::new(lits[0].as_bytes()));
}
- let pats = lits.literals().to_owned();
+ let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect();
let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
- if lits.literals().len() <= 100 && !is_aho_corasick_fast {
+ if lits.len() <= 100 && !is_aho_corasick_fast {
let mut builder = packed::Config::new()
.match_kind(packed::MatchKind::LeftmostFirst)
.builder();
if let Some(s) = builder.extend(&pats).build() {
- return Matcher::Packed { s, lits: pats };
+ return Matcher::Packed { s, lits: lits.to_owned() };
}
}
- let ac = AhoCorasickBuilder::new()
+ let ac = AhoCorasick::builder()
.match_kind(aho_corasick::MatchKind::LeftmostFirst)
- .dfa(true)
- .build_with_size::<u32, _, _>(&pats)
+ .kind(Some(aho_corasick::AhoCorasickKind::DFA))
+ .build(&pats)
.unwrap();
- Matcher::AC { ac, lits: pats }
+ Matcher::AC { ac, lits: lits.to_owned() }
}
}
@@ -257,7 +260,7 @@ impl<'a> Iterator for LiteralIter<'a> {
} else {
let next = &lits[0];
*lits = &lits[1..];
- Some(&**next)
+ Some(next.as_bytes())
}
}
LiteralIter::Packed(ref mut lits) => {
@@ -266,7 +269,7 @@ impl<'a> Iterator for LiteralIter<'a> {
} else {
let next = &lits[0];
*lits = &lits[1..];
- Some(&**next)
+ Some(next.as_bytes())
}
}
}
@@ -291,11 +294,15 @@ impl SingleByteSet {
}
}
- fn prefixes(lits: &Literals) -> SingleByteSet {
+ fn prefixes(lits: &Seq) -> SingleByteSet {
let mut sset = SingleByteSet::new();
- for lit in lits.literals() {
+ let lits = match lits.literals() {
+ None => return sset,
+ Some(lits) => lits,
+ };
+ for lit in lits.iter() {
sset.complete = sset.complete && lit.len() == 1;
- if let Some(&b) = lit.get(0) {
+ if let Some(&b) = lit.as_bytes().get(0) {
if !sset.sparse[b as usize] {
if b > 0x7F {
sset.all_ascii = false;
@@ -308,11 +315,15 @@ impl SingleByteSet {
sset
}
- fn suffixes(lits: &Literals) -> SingleByteSet {
+ fn suffixes(lits: &Seq) -> SingleByteSet {
let mut sset = SingleByteSet::new();
- for lit in lits.literals() {
+ let lits = match lits.literals() {
+ None => return sset,
+ Some(lits) => lits,
+ };
+ for lit in lits.iter() {
sset.complete = sset.complete && lit.len() == 1;
- if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
+ if let Some(&b) = lit.as_bytes().last() {
if !sset.sparse[b as usize] {
if b > 0x7F {
sset.all_ascii = false;
diff --git a/vendor/regex/src/literal/mod.rs b/vendor/regex/src/literal/mod.rs
index 980f52330..b9fb77aed 100644
--- a/vendor/regex/src/literal/mod.rs
+++ b/vendor/regex/src/literal/mod.rs
@@ -6,7 +6,7 @@ mod imp;
#[allow(missing_docs)]
#[cfg(not(feature = "perf-literal"))]
mod imp {
- use regex_syntax::hir::literal::Literals;
+ use regex_syntax::hir::literal::Seq;
#[derive(Clone, Debug)]
pub struct LiteralSearcher(());
@@ -16,11 +16,11 @@ mod imp {
LiteralSearcher(())
}
- pub fn prefixes(_: Literals) -> Self {
+ pub fn prefixes(_: Seq) -> Self {
LiteralSearcher(())
}
- pub fn suffixes(_: Literals) -> Self {
+ pub fn suffixes(_: Seq) -> Self {
LiteralSearcher(())
}
diff --git a/vendor/regex/src/prog.rs b/vendor/regex/src/prog.rs
index c211f71d8..100862cf1 100644
--- a/vendor/regex/src/prog.rs
+++ b/vendor/regex/src/prog.rs
@@ -27,6 +27,9 @@ pub struct Program {
pub captures: Vec<Option<String>>,
/// Pointers to all named capture groups into `captures`.
pub capture_name_idx: Arc<HashMap<String, usize>>,
+ /// If the number of capture groups is the same for all possible matches,
+ /// then this is that number.
+ pub static_captures_len: Option<usize>,
/// A pointer to the start instruction. This can vary depending on how
/// the program was compiled. For example, programs for use with the DFA
/// engine have a `.*?` inserted at the beginning of unanchored regular
@@ -83,6 +86,7 @@ impl Program {
matches: vec![],
captures: vec![],
capture_name_idx: Arc::new(HashMap::new()),
+ static_captures_len: None,
start: 0,
byte_classes: vec![0; 256],
only_utf8: true,
diff --git a/vendor/regex/src/re_bytes.rs b/vendor/regex/src/re_bytes.rs
index 07e9f98ac..e3a3b019b 100644
--- a/vendor/regex/src/re_bytes.rs
+++ b/vendor/regex/src/re_bytes.rs
@@ -17,7 +17,7 @@ use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
/// Match represents a single match of a regex in a haystack.
///
/// The lifetime parameter `'t` refers to the lifetime of the matched text.
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy, Clone, Eq, PartialEq)]
pub struct Match<'t> {
text: &'t [u8],
start: usize,
@@ -37,6 +37,18 @@ impl<'t> Match<'t> {
self.end
}
+ /// Returns true if and only if this match has a length of zero.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+
+ /// Returns the length, in bytes, of this match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
/// Returns the range over the starting and ending byte offsets of the
/// match in the haystack.
#[inline]
@@ -57,6 +69,24 @@ impl<'t> Match<'t> {
}
}
+impl<'t> std::fmt::Debug for Match<'t> {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ let mut fmt = f.debug_struct("Match");
+ fmt.field("start", &self.start).field("end", &self.end);
+ if let Ok(s) = std::str::from_utf8(self.as_bytes()) {
+ fmt.field("bytes", &s);
+ } else {
+ // FIXME: It would be nice if this could be printed as a string
+            // with invalid UTF-8 replaced with hex escapes. An alloc would
+            // probably be okay if that makes it easier, but regex-automata does
+ // (at time of writing) have internal routines that do this. So
+ // maybe we should expose them.
+ fmt.field("bytes", &self.as_bytes());
+ }
+ fmt.finish()
+ }
+}
+
impl<'t> From<Match<'t>> for Range<usize> {
fn from(m: Match<'t>) -> Range<usize> {
m.range()
@@ -253,12 +283,7 @@ impl Regex {
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `get(0)` or `[0]`.
pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
- let mut locs = self.capture_locations();
- self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
- text,
- locs: locs.0,
- named_groups: self.0.capture_name_idx().clone(),
- })
+ self.captures_at(text, 0)
}
/// Returns an iterator over all the non-overlapping capture groups matched
@@ -537,7 +562,14 @@ impl Regex {
/// This method may have the same performance characteristics as
/// `is_match`, except it provides an end location for a match. In
/// particular, the location returned *may be shorter* than the proper end
- /// of the leftmost-first match.
+ /// of the leftmost-first match that you would find via `Regex::find`.
+ ///
+ /// Note that it is not guaranteed that this routine finds the shortest or
+ /// "earliest" possible match. Instead, the main idea of this API is that
+ /// it returns the offset at the point at which the internal regex engine
+ /// has determined that a match has occurred. This may vary depending on
+ /// which internal regex engine is used, and thus, the offset itself may
+ /// change.
///
/// # Example
///
@@ -598,6 +630,25 @@ impl Regex {
.map(|(s, e)| Match::new(text, s, e))
}
+ /// Returns the same as [`Regex::captures`], but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn captures_at<'t>(
+ &self,
+ text: &'t [u8],
+ start: usize,
+ ) -> Option<Captures<'t>> {
+ let mut locs = self.capture_locations();
+ self.captures_read_at(&mut locs, text, start).map(move |_| Captures {
+ text,
+ locs: locs.0,
+ named_groups: self.0.capture_name_idx().clone(),
+ })
+ }
+
/// This is like `captures`, but uses
/// [`CaptureLocations`](struct.CaptureLocations.html)
/// instead of
@@ -667,6 +718,46 @@ impl Regex {
self.0.capture_names().len()
}
+ /// Returns the total number of capturing groups that appear in every
+ /// possible match.
+ ///
+ /// If the number of capture groups can vary depending on the match, then
+ /// this returns `None`. That is, a value is only returned when the number
+ /// of matching groups is invariant or "static."
+ ///
+ /// Note that like [`Regex::captures_len`], this **does** include the
+ /// implicit capturing group corresponding to the entire match. Therefore,
+ /// when a non-None value is returned, it is guaranteed to be at least `1`.
+ /// Stated differently, a return value of `Some(0)` is impossible.
+ ///
+ /// # Example
+ ///
+ /// This shows a few cases where a static number of capture groups is
+ /// available and a few cases where it is not.
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let len = |pattern| {
+ /// Regex::new(pattern).map(|re| re.static_captures_len())
+ /// };
+ ///
+ /// assert_eq!(Some(1), len("a")?);
+ /// assert_eq!(Some(2), len("(a)")?);
+ /// assert_eq!(Some(2), len("(a)|(b)")?);
+ /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(None, len("(a)|b")?);
+ /// assert_eq!(None, len("a|(b)")?);
+ /// assert_eq!(None, len("(b)*")?);
+ /// assert_eq!(Some(2), len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.0.static_captures_len().map(|len| len.saturating_add(1))
+ }
+
/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
pub fn capture_locations(&self) -> CaptureLocations {
@@ -856,6 +947,27 @@ impl<'r> FusedIterator for CaptureNames<'r> {}
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
+///
+/// # Example
+///
+/// This example shows how to create and use `CaptureLocations` in a search.
+///
+/// ```
+/// use regex::bytes::Regex;
+///
+/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+/// let mut locs = re.capture_locations();
+/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap();
+/// assert_eq!(0..17, m.range());
+/// assert_eq!(Some((0, 17)), locs.get(0));
+/// assert_eq!(Some((0, 5)), locs.get(1));
+/// assert_eq!(Some((6, 17)), locs.get(2));
+///
+/// // Asking for an invalid capture group always returns None.
+/// assert_eq!(None, locs.get(3));
+/// assert_eq!(None, locs.get(34973498648));
+/// assert_eq!(None, locs.get(9944060567225171988));
+/// ```
#[derive(Clone, Debug)]
pub struct CaptureLocations(re_trait::Locations);
diff --git a/vendor/regex/src/re_set.rs b/vendor/regex/src/re_set.rs
index a6d886d76..7c8253f0c 100644
--- a/vendor/regex/src/re_set.rs
+++ b/vendor/regex/src/re_set.rs
@@ -289,6 +289,12 @@ impl RegexSet {
}
}
+impl Default for RegexSet {
+ fn default() -> Self {
+ RegexSet::empty()
+ }
+}
+
/// A set of matches returned by a regex set.
#[derive(Clone, Debug)]
pub struct SetMatches {
@@ -315,6 +321,11 @@ impl SetMatches {
}
/// The total number of regexes in the set that created these matches.
+ ///
+ /// **WARNING:** This always returns the same value as [`RegexSet::len`].
+ /// In particular, it does *not* return the number of elements yielded by
+ /// [`SetMatches::iter`]. The only way to determine the total number of
+ /// matched regexes is to iterate over them.
pub fn len(&self) -> usize {
self.matches.len()
}
diff --git a/vendor/regex/src/re_trait.rs b/vendor/regex/src/re_trait.rs
index d0c717df5..505810c84 100644
--- a/vendor/regex/src/re_trait.rs
+++ b/vendor/regex/src/re_trait.rs
@@ -20,7 +20,7 @@ impl Locations {
/// not match anything. The positions returned are *always* byte indices
/// with respect to the original string matched.
pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
- let (s, e) = (i * 2, i * 2 + 1);
+ let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?);
match (self.0.get(s), self.0.get(e)) {
(Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
_ => None,
diff --git a/vendor/regex/src/re_unicode.rs b/vendor/regex/src/re_unicode.rs
index 197510ea0..57689086d 100644
--- a/vendor/regex/src/re_unicode.rs
+++ b/vendor/regex/src/re_unicode.rs
@@ -25,7 +25,7 @@ pub fn escape(text: &str) -> String {
/// Match represents a single match of a regex in a haystack.
///
/// The lifetime parameter `'t` refers to the lifetime of the matched text.
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy, Clone, Eq, PartialEq)]
pub struct Match<'t> {
text: &'t str,
start: usize,
@@ -45,6 +45,18 @@ impl<'t> Match<'t> {
self.end
}
+ /// Returns true if and only if this match has a length of zero.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.start == self.end
+ }
+
+ /// Returns the length, in bytes, of this match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
/// Returns the range over the starting and ending byte offsets of the
/// match in the haystack.
#[inline]
@@ -65,6 +77,16 @@ impl<'t> Match<'t> {
}
}
+impl<'t> std::fmt::Debug for Match<'t> {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ f.debug_struct("Match")
+ .field("start", &self.start)
+ .field("end", &self.end)
+ .field("string", &self.as_str())
+ .finish()
+ }
+}
+
impl<'t> From<Match<'t>> for &'t str {
fn from(m: Match<'t>) -> &'t str {
m.as_str()
@@ -309,12 +331,7 @@ impl Regex {
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `get(0)` or `[0]`.
pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
- let mut locs = self.capture_locations();
- self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
- text,
- locs: locs.0,
- named_groups: self.0.capture_name_idx().clone(),
- })
+ self.captures_at(text, 0)
}
/// Returns an iterator over all the non-overlapping capture groups matched
@@ -595,7 +612,14 @@ impl Regex {
/// This method may have the same performance characteristics as
/// `is_match`, except it provides an end location for a match. In
/// particular, the location returned *may be shorter* than the proper end
- /// of the leftmost-first match.
+ /// of the leftmost-first match that you would find via `Regex::find`.
+ ///
+ /// Note that it is not guaranteed that this routine finds the shortest or
+ /// "earliest" possible match. Instead, the main idea of this API is that
+ /// it returns the offset at the point at which the internal regex engine
+ /// has determined that a match has occurred. This may vary depending on
+ /// which internal regex engine is used, and thus, the offset itself may
+ /// change.
///
/// # Example
///
@@ -615,12 +639,12 @@ impl Regex {
self.shortest_match_at(text, 0)
}
- /// Returns the same as shortest_match, but starts the search at the given
- /// offset.
+ /// Returns the same as `shortest_match`, but starts the search at the
+ /// given offset.
///
/// The significance of the starting point is that it takes the surrounding
- /// context into consideration. For example, the `\A` anchor can only
- /// match when `start == 0`.
+ /// context into consideration. For example, the `\A` anchor can only match
+ /// when `start == 0`.
pub fn shortest_match_at(
&self,
text: &str,
@@ -656,6 +680,25 @@ impl Regex {
.map(|(s, e)| Match::new(text, s, e))
}
+ /// Returns the same as [`Regex::captures`], but starts the search at the
+ /// given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ pub fn captures_at<'t>(
+ &self,
+ text: &'t str,
+ start: usize,
+ ) -> Option<Captures<'t>> {
+ let mut locs = self.capture_locations();
+ self.captures_read_at(&mut locs, text, start).map(move |_| Captures {
+ text,
+ locs: locs.0,
+ named_groups: self.0.capture_name_idx().clone(),
+ })
+ }
+
/// This is like `captures`, but uses
/// [`CaptureLocations`](struct.CaptureLocations.html)
/// instead of
@@ -725,6 +768,46 @@ impl Regex {
self.0.capture_names().len()
}
+ /// Returns the total number of capturing groups that appear in every
+ /// possible match.
+ ///
+ /// If the number of capture groups can vary depending on the match, then
+ /// this returns `None`. That is, a value is only returned when the number
+ /// of matching groups is invariant or "static."
+ ///
+ /// Note that like [`Regex::captures_len`], this **does** include the
+ /// implicit capturing group corresponding to the entire match. Therefore,
+ /// when a non-None value is returned, it is guaranteed to be at least `1`.
+ /// Stated differently, a return value of `Some(0)` is impossible.
+ ///
+ /// # Example
+ ///
+ /// This shows a few cases where a static number of capture groups is
+ /// available and a few cases where it is not.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let len = |pattern| {
+ /// Regex::new(pattern).map(|re| re.static_captures_len())
+ /// };
+ ///
+ /// assert_eq!(Some(1), len("a")?);
+ /// assert_eq!(Some(2), len("(a)")?);
+ /// assert_eq!(Some(2), len("(a)|(b)")?);
+ /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
+ /// assert_eq!(None, len("(a)|b")?);
+ /// assert_eq!(None, len("a|(b)")?);
+ /// assert_eq!(None, len("(b)*")?);
+ /// assert_eq!(Some(2), len("(b)+")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn static_captures_len(&self) -> Option<usize> {
+ self.0.static_captures_len().map(|len| len.saturating_add(1))
+ }
+
/// Returns an empty set of capture locations that can be reused in
/// multiple calls to `captures_read` or `captures_read_at`.
pub fn capture_locations(&self) -> CaptureLocations {
@@ -866,6 +949,27 @@ impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
+///
+/// # Example
+///
+/// This example shows how to create and use `CaptureLocations` in a search.
+///
+/// ```
+/// use regex::Regex;
+///
+/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
+/// let mut locs = re.capture_locations();
+/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
+/// assert_eq!(0..17, m.range());
+/// assert_eq!(Some((0, 17)), locs.get(0));
+/// assert_eq!(Some((0, 5)), locs.get(1));
+/// assert_eq!(Some((6, 17)), locs.get(2));
+///
+/// // Asking for an invalid capture group always returns None.
+/// assert_eq!(None, locs.get(3));
+/// assert_eq!(None, locs.get(34973498648));
+/// assert_eq!(None, locs.get(9944060567225171988));
+/// ```
#[derive(Clone, Debug)]
pub struct CaptureLocations(re_trait::Locations);
diff --git a/vendor/regex/tests/regression.rs b/vendor/regex/tests/regression.rs
index e8b252538..291062a77 100644
--- a/vendor/regex/tests/regression.rs
+++ b/vendor/regex/tests/regression.rs
@@ -220,3 +220,44 @@ matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
// See: https://github.com/rust-lang/regex/issues/862
mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));
+
+// See: https://github.com/rust-lang/regex/issues/981
+#[cfg(feature = "unicode")]
+#[test]
+fn regression_bad_word_boundary() {
+ let re = regex_new!(r#"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"#).unwrap();
+ let hay = "ubi-Darwin-x86_64.tar.gz";
+ assert!(!re.is_match(text!(hay)));
+ let hay = "ubi-Windows-x86_64.zip";
+ assert!(re.is_match(text!(hay)));
+}
+
+// See: https://github.com/rust-lang/regex/issues/982
+#[cfg(feature = "unicode-perl")]
+#[test]
+fn regression_unicode_perl_not_enabled() {
+ let pat = r"(\d+\s?(years|year|y))?\s?(\d+\s?(months|month|m))?\s?(\d+\s?(weeks|week|w))?\s?(\d+\s?(days|day|d))?\s?(\d+\s?(hours|hour|h))?";
+ let re = regex_new!(pat);
+ assert!(re.is_ok());
+}
+
+// See: https://github.com/rust-lang/regex/issues/995
+#[test]
+fn regression_big_regex_overflow() {
+ let pat = r" {2147483516}{2147483416}{5}";
+ let re = regex_new!(pat);
+ assert!(re.is_err());
+}
+
+#[test]
+fn regression_complete_literals_suffix_incorrect() {
+ let needles = vec![
+ "aA", "bA", "cA", "dA", "eA", "fA", "gA", "hA", "iA", "jA", "kA",
+ "lA", "mA", "nA", "oA", "pA", "qA", "rA", "sA", "tA", "uA", "vA",
+ "wA", "xA", "yA", "zA",
+ ];
+ let pattern = needles.join("|");
+ let re = regex!(&pattern);
+ let hay = "FUBAR";
+ assert_eq!(0, re.find_iter(text!(hay)).count());
+}
diff --git a/vendor/regex/tests/regression_fuzz.rs b/vendor/regex/tests/regression_fuzz.rs
index 4e76704d2..5f49530a7 100644
--- a/vendor/regex/tests/regression_fuzz.rs
+++ b/vendor/regex/tests/regression_fuzz.rs
@@ -29,3 +29,12 @@ fn big_regex_fails_to_compile() {
let pat = "[\u{0}\u{e}\u{2}\\w~~>[l\t\u{0}]p?<]{971158}";
assert!(regex_new!(pat).is_err());
}
+
+// This was caught while on master but before a release went out(!).
+//
+// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=58173
+#[test]
+fn todo() {
+ let pat = "(?:z|xx)@|xx";
+ assert!(regex_new!(pat).is_ok());
+}
diff --git a/vendor/regex/tests/replace.rs b/vendor/regex/tests/replace.rs
index d65be072f..f23c57551 100644
--- a/vendor/regex/tests/replace.rs
+++ b/vendor/regex/tests/replace.rs
@@ -15,7 +15,7 @@ replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ");
replace!(
groups,
replace,
- r"(?-u)(\S+)\s+(\S+)",
+ r"([^ ]+)[ ]+([^ ]+)",
"w1 w2",
t!("$2 $1"),
"w2 w1"
@@ -23,7 +23,7 @@ replace!(
replace!(
double_dollar,
replace,
- r"(?-u)(\S+)\s+(\S+)",
+ r"([^ ]+)[ ]+([^ ]+)",
"w1 w2",
t!("$2 $$1"),
"w2 $1"
@@ -33,7 +33,7 @@ replace!(
replace!(
named,
replace_all,
- r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
+ r"(?P<first>[^ ]+)[ ]+(?P<last>[^ ]+)(?P<space>[ ]*)",
"w1 w2 w3 w4",
t!("$last $first$space"),
"w2 w1 w4 w3"
@@ -51,7 +51,7 @@ replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b");
replace!(
simple_expand,
replace_all,
- r"(?-u)(\w) (\w)",
+ r"([a-z]) ([a-z])",
"a b",
t!("$2 $1"),
"b a"
@@ -59,7 +59,7 @@ replace!(
replace!(
literal_dollar1,
replace_all,
- r"(?-u)(\w+) (\w+)",
+ r"([a-z]+) ([a-z]+)",
"a b",
t!("$$1"),
"$1"
@@ -67,7 +67,7 @@ replace!(
replace!(
literal_dollar2,
replace_all,
- r"(?-u)(\w+) (\w+)",
+ r"([a-z]+) ([a-z]+)",
"a b",
t!("$2 $$c $1"),
"b $c a"
@@ -75,7 +75,7 @@ replace!(
replace!(
no_expand1,
replace,
- r"(?-u)(\S+)\s+(\S+)",
+ r"([^ ]+)[ ]+([^ ]+)",
"w1 w2",
no_expand!("$2 $1"),
"$2 $1"
@@ -83,7 +83,7 @@ replace!(
replace!(
no_expand2,
replace,
- r"(?-u)(\S+)\s+(\S+)",
+ r"([^ ]+)[ ]+([^ ]+)",
"w1 w2",
no_expand!("$$1"),
"$$1"
diff --git a/vendor/regex/tests/set.rs b/vendor/regex/tests/set.rs
index 37fcf8700..d1144d662 100644
--- a/vendor/regex/tests/set.rs
+++ b/vendor/regex/tests/set.rs
@@ -65,3 +65,10 @@ fn len_and_empty() {
assert_eq!(not_empty.len(), 2);
assert!(!not_empty.is_empty());
}
+
+#[test]
+fn default_set_is_empty() {
+ let set: regex::bytes::RegexSet = Default::default();
+ assert_eq!(set.len(), 0);
+ assert!(set.is_empty());
+}
diff --git a/vendor/regex/tests/unicode.rs b/vendor/regex/tests/unicode.rs
index 9b3228624..d7dbdd31b 100644
--- a/vendor/regex/tests/unicode.rs
+++ b/vendor/regex/tests/unicode.rs
@@ -35,6 +35,8 @@ mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None);
// We should test more, but there's a lot. Write a script to generate more of
// these tests.
mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3)));
+mat!(uni_class_gencat_cased_letter2, r"\p{gc=LC}", "A", Some((0, 3)));
+mat!(uni_class_gencat_cased_letter3, r"\p{LC}", "A", Some((0, 3)));
mat!(
uni_class_gencat_close_punctuation,
r"\p{Close_Punctuation}",
@@ -77,6 +79,7 @@ mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4)));
// See: https://github.com/rust-lang/regex/issues/719
mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4)));
mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4)));
+mat!(uni_class_gencat_format_abbrev3, r"\p{Sc}", "$", Some((0, 1)));
mat!(
uni_class_gencat_initial_punctuation,
r"\p{Initial_Punctuation}",