diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 03:57:31 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 03:57:31 +0000 |
commit | dc0db358abe19481e475e10c32149b53370f1a1c (patch) | |
tree | ab8ce99c4b255ce46f99ef402c27916055b899ee /vendor/regex | |
parent | Releasing progress-linux version 1.71.1+dfsg1-2~progress7.99u1. (diff) | |
download | rustc-dc0db358abe19481e475e10c32149b53370f1a1c.tar.xz rustc-dc0db358abe19481e475e10c32149b53370f1a1c.zip |
Merging upstream version 1.72.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex')
27 files changed, 1032 insertions, 256 deletions
diff --git a/vendor/regex/.cargo-checksum.json b/vendor/regex/.cargo-checksum.json index d6ea1df9a..a85152d35 100644 --- a/vendor/regex/.cargo-checksum.json +++ b/vendor/regex/.cargo-checksum.json @@ -1 +1 @@ -{"files":{"CHANGELOG.md":"c66cc76a297a1068a3aa81d08326175be887b60fb6330c4b7922f4ad286bd144","Cargo.lock":"fa78ce7955999185e017ef812db215124b65ae8aef67b4ea9ca5bd18c50b0d35","Cargo.toml":"566dca60827f0cbafd1d920457d6d7fad9bd5b85630832207de8cc9c35cab274","HACKING.md":"17818f7a17723608f6bdbe6388ad0a913d4f96f76a16649aaf4e274b1fa0ea97","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","PERFORMANCE.md":"0d5ef3866386918dfdefb1aa9a28cfe33cb3c8ceeb79f3f8ba5b88253dd95991","README.md":"f69204a0f446047d8f4d1f3d84b75f235adb5c26477f3a37b671411bc954d14c","UNICODE.md":"a8a8399540eed000d19420135a527f400247a04572e44d124c786b870f518776","examples/regexdna-input.txt":"156a49710bb3e1ed4bc2bbb0af0f383b747b3d0281453cfff39c296124c598f8","examples/regexdna-output.txt":"35e85b19b70a893d752fd43e54e1e9da08bac43559191cea85b33387c24c4cc1","examples/shootout-regex-dna-bytes.rs":"fa2daedb4e0a05f64f33f4af62fbb0176db998e3676f8637ab684b725367a7b4","examples/shootout-regex-dna-cheat.rs":"1f871a6eaaf8372299fa3c762051112fa89a14235b03f734fc50ebd51ecaee72","examples/shootout-regex-dna-replace.rs":"32ffdf13ac6c4ce3fc32116a048e9cc682aa34cdb8e5beaf565a22addbdcd9ab","examples/shootout-regex-dna-single-cheat.rs":"809f75bf1e1917a53623eb6f1a3ce3b7d2ed98a6a1dbc0bd4853bec49a0c6f94","examples/shootout-regex-dna-single.rs":"1ab14f5703cd4be2e75a2e792e0ba1d322b9e4b14535d396805a4316d577f5bb","examples/shootout-regex-dna.rs":"20ea46ab63f91e3ac6a64e997eadd436a9cbc2f1bdade28e4512052f0e25bc34","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/backtrack.rs":"52987d80448f3d7f5d4e3545ddfc09f1f30de7602d9b5489961db4b215a377fd","src/compile.rs":"79a59be2d2db650b5a322e15e9bf1d3227944410bc780fc6089da8f4d2609b77","src/dfa.rs":"10273980d1f08aaff495e11efa240249a2b2c08a4db7c49c8d6759bc65a3b174","src/error.rs":"71c85db839514f26ee024a689061743ea94a34eb7a3291e6c2b69b45a9682d09","src/exec.rs":"4726e2c210c6adb91b25390cbc864ab71deabb17ca87a46e83acef26a194bfc2","src/expand.rs":"71220309a3bac797f55129f49e79c03e96efec894ea338c735b78695367e04ca","src/find_byte.rs":"b387247b77e3269f057c3399aefe5a815032c3af918c876f80eb4b282e4eb95e","src/freqs.rs":"255555f3d95b08a5bb3bc2f38d5a06cc100a39c0f0127fe4f50c33afa1cadc65","src/input.rs":"13f49c1bce2fadd04a45b421d374cd0f8b72bef83f7e8fda958962aaccbe799a","src/lib.rs":"982fadba415c4c5b93f4d7d4a73a23ec88e2d96daaa03b679d14490ea0f63197","src/literal/imp.rs":"b7f63a861c299bea4baaab17353a420ee339c2cf76d3858c95f39342bd4463e7","src/literal/mod.rs":"533f1d68af088e9485170145e27518368e541a0337fdb44f63249ebf97310300","src/pattern.rs":"993d8b6b4bcea5e02bee3c76e17c356a5a47f8fc53c5555edfd1ebb71c0878bf","src/pikevm.rs":"6c0eaa7e878c945ac4c3c545c98f5706ad04846fc432a5086c8ee78eb030dfa7","src/pool.rs":"942e991ae31ef349bd76efd78b2a712c01166dec965bf93742977ed0870d5a10","src/prog.rs":"bebb3e50745bbc05d6c8240d972ba55a1818c51b1161dc1c21f3fe13c11d4884","src/re_builder.rs":"943344bf6e2fc90902ee04b11b741c32418ac6814b21b7982cc0a3a817713f3e","src/re_bytes.rs":"63ee1db1637a3764addb10e27248129acffaf78bb0a69624add4d9d6f1e97040","src/re_set.rs":"7921ac4a919b7a5deffe82d099a9ccaf5487aebd890dfb7a661e602c6ad3f1a9","src/re_trait.rs":"d237121b6f6b606836c72305cbcb3bbdbc54d1f6827d19a19cd0fbb4372e0145","src/re_unicode.rs":"4ca66d6e835df7c0f570c8cde52667ef90ba1687d5285f12fedef2e38ae925b4","src/sparse.rs":"0da3ddb7972109869248a764dbb10254555f4bb51c375e89fb3fab9cafa47320","src/testdata/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","src/testdata/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","src/testdata/basic.dat":"b5b33aa89d48a61cd67cb1fbfd8f70e62c83e30b86256f9f915a5190dd38ff06","src/testdata/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","src/testdata/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","src/utf8.rs":"f85a356ff5d5b19e417b73ce1dd84581b21d283f6dddd195547c30af9c60bd1a","test":"0d62fdca7da12fc19ea5306b5de1d83e68d9365a029c043d524334da138b0304","tests/api.rs":"7b2a0ef75e99b9776094967bd66e9cdeaa8e11359f5f0a12bd08ef0e8d0c11fc","tests/api_str.rs":"2ae38c04e7e8fac008b609a820d0b1561ba75f39b0edc0987d6d3d06132da77f","tests/bytes.rs":"edc50f526c5fee43df89d639ef18b237e4eb91e9d533bfc43f3cbab7417d38ba","tests/consistent.rs":"d69435154c09478076497216e43081a835ac65147181a4fbddad7bff469605b2","tests/crates_regex.rs":"91a59d470e0700b4bcb3ff735d06799f3107b8ef4875a2e9904607b164be0326","tests/crazy.rs":"c0d56380dff19bdd5d7a3eb731d0e2dc564e169a1b73c81e1879b1e87f5f5f77","tests/flags.rs":"05caace2c81a99d2168037f3a38035d4dffe9f85ef3ebd7ef18b1bc6612f1ea8","tests/fowler.rs":"d78cf914de40b1e125cc92b65ccb444d462586bd07b5e05de4e4a1b5de16aa76","tests/macros.rs":"6db70c16fc90df13e6b30d2b606f8b6dd4dc976697967f6ee001b15aab6d0b19","tests/macros_bytes.rs":"a049f528a93173a1bb176cd46932dce1880679f4a1752e099be920f0e4546fd0","tests/macros_str.rs":"e585b1461374c45a2eca44ca045bc3c1fe984b2b4212e432b0c695b420e708b7","tests/misc.rs":"395f52793fa022e4cdda78675b6a6fba1a3106b4b99c834c39f7801574054bd1","tests/multiline.rs":"1b1a3326ed976437c1357f01d81833ece7ea244f38826246eab55cacd5d0862a","tests/noparse.rs":"12b6be0eff3d80779d33c6459396c74c0f6ebf4ddc9f1d33c3e747ea9e3bf268","tests/regression.rs":"1c965fefb8c7a2b1dfdab3e3fdeebaf47846555c50c8005e5537f96a52a3e252","tests/regression_fuzz.rs":"a504ec563e0d23bd2039493b7b1767fe1f831d7d668f6f4b2ecd124fc7899bcd","tests/replace.rs":"66f97532e40697934e2a77605b9002dfd22c46b6033ccb755e7660d855229f41","tests/searcher.rs":"ce35e47b0a276a7e8c9060c6a0b225ffba163aebc61fbc15555a6897fa0e552c","tests/set.rs":"f1e2af6baeeaed3cc99ed347ff516fe7b2eb0027ef64b891502e1486598eaf8a","tests/shortest_match.rs":"a2c94390c0d61bc24796b4c1288c924e90c8c9c6156fdebb858175177a194a42","tests/suffix_reverse.rs":"b95f89397404871227d9efe6df23b9ded147f183db81597e608f693955c668b5","tests/test_backtrack.rs":"b70c5e5f1241efd76dd9f9dd4a4df8a7b38113bd407d1f5f56867f1176177a59","tests/test_backtrack_bytes.rs":"b8a111d4b4109c8bba7e2afb650572c495a14d357fb1f743c1076fb001f704b5","tests/test_backtrack_utf8bytes.rs":"c0c279785d18beac2b4e178e7bf6c14ed235d65f00ca467cfd9c333d79487649","tests/test_crates_regex.rs":"fd9525c2eef0e2f8cb7f787bc2b721bcd0b5d84f3bca49adfe48d657a99c721a","tests/test_default.rs":"b32c11a43da4379a3717dd7a5f152c811257c7d6595c9d3c51f2de102e320c87","tests/test_default_bytes.rs":"831d3e6bfb882feb15f700e30304bd34328f888fb4c15c7169371e25024ce9a7","tests/test_nfa.rs":"f119fc43a018249c39c813d57096b0654ff69f337345f2bbd9b0e61cc9137285","tests/test_nfa_bytes.rs":"89eae3bef6a1d0bcea6b5de5be35ad72f613f2ceb8b58fe82a6c6ef2ccdc07d0","tests/test_nfa_utf8bytes.rs":"7d830b4aa401887d7cf098b62fed4cd8017ef8b61f625c7c9a2159a6b4cfeb71","tests/unicode.rs":"1af9db7f09a6b0113b8a64733e06c8415fef720b2fdef227ae398d94332287cd","tests/word_boundary.rs":"7081317ddcec1e82dd4a2090a571c6abf2ff4bbfa8cd10395e1eb3f386157fae","tests/word_boundary_ascii.rs":"cd0be5b5b485de0ba7994b42e2864585556c3d2d8bf5eab05b58931d9aaf4b87","tests/word_boundary_unicode.rs":"75dbcc35d3abc0f9795c2ea99e216dc227b0a5b58e9ca5eef767815ff0513921"},"package":"8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"}
\ No newline at end of file +{"files":{"CHANGELOG.md":"8839af2eea6a564b11b5a8cf5023f121de9b5edd02829448670b2d880ded3bb4","Cargo.lock":"d8f5dca5987e52303b325d99fd65dea8e75202b2dab766dcf9be38e64fd73f46","Cargo.toml":"1bfd9b78fc3842caa6ec435ddc6a4f81123d26d0cf4c1e66c1f1ef05a3ec3e72","HACKING.md":"17818f7a17723608f6bdbe6388ad0a913d4f96f76a16649aaf4e274b1fa0ea97","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","PERFORMANCE.md":"0d5ef3866386918dfdefb1aa9a28cfe33cb3c8ceeb79f3f8ba5b88253dd95991","README.md":"cdcafba78fda99c94f9ea3290ad521fbdbb12043ca6925b10cef801eb4e5e223","UNICODE.md":"a8a8399540eed000d19420135a527f400247a04572e44d124c786b870f518776","examples/regexdna-input.txt":"156a49710bb3e1ed4bc2bbb0af0f383b747b3d0281453cfff39c296124c598f8","examples/regexdna-output.txt":"35e85b19b70a893d752fd43e54e1e9da08bac43559191cea85b33387c24c4cc1","examples/shootout-regex-dna-bytes.rs":"fa2daedb4e0a05f64f33f4af62fbb0176db998e3676f8637ab684b725367a7b4","examples/shootout-regex-dna-cheat.rs":"1f871a6eaaf8372299fa3c762051112fa89a14235b03f734fc50ebd51ecaee72","examples/shootout-regex-dna-replace.rs":"32ffdf13ac6c4ce3fc32116a048e9cc682aa34cdb8e5beaf565a22addbdcd9ab","examples/shootout-regex-dna-single-cheat.rs":"809f75bf1e1917a53623eb6f1a3ce3b7d2ed98a6a1dbc0bd4853bec49a0c6f94","examples/shootout-regex-dna-single.rs":"1ab14f5703cd4be2e75a2e792e0ba1d322b9e4b14535d396805a4316d577f5bb","examples/shootout-regex-dna.rs":"20ea46ab63f91e3ac6a64e997eadd436a9cbc2f1bdade28e4512052f0e25bc34","record/README.md":"02e6f85f8a43f18540e4a52a75d1001494df7aceac3873e9a13e3ceba190206d","record/compile-test/2023-04-19_1.7.3.csv":"460059ba2f10456175ff92bd75d4a365b14a1843e2b46e7b285d58da59e6d3ca","record/compile-test/2023-04-20_master.csv":"6b94df278e4ed82a3fd0d4bfe92a4614714e00435e983c7649ee9f54925f906e","record/compile-test/README.md":"ba2b606993edd8d705ad1677ec954862614e52b028407e1908bb5dfb07767f2d","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/backtrack.rs":"52987d80448f3d7f5d4e3545ddfc09f1f30de7602d9b5489961db4b215a377fd","src/compile.rs":"65b9f083d28a79bcef9584b8da3d87f21b561912de29c6c6053546e1e752980f","src/dfa.rs":"405f24adbf775b0902fd830cc5a5446de80da1a2a5eb950fca357aff5b19163f","src/error.rs":"476a86da4bb115cb85e9327aee6f423c1dade524517178186c747a3baa9be71d","src/exec.rs":"72693556149e1347283ff2499bf624e01fb917076bf4d103a6723f4ecfa9cf65","src/expand.rs":"59e459a9bbd0ae60478a6cbe48203091245e39bbd064e04b50d848d75f6de920","src/find_byte.rs":"b387247b77e3269f057c3399aefe5a815032c3af918c876f80eb4b282e4eb95e","src/freqs.rs":"255555f3d95b08a5bb3bc2f38d5a06cc100a39c0f0127fe4f50c33afa1cadc65","src/input.rs":"13f49c1bce2fadd04a45b421d374cd0f8b72bef83f7e8fda958962aaccbe799a","src/lib.rs":"7cb5ea7fbb41b71d6a9d0692442b8bdfccd10199dd1340e534202f988cfad493","src/literal/imp.rs":"26239f37d7c79a88f154ffb864be282598486d9df9363c918ac3106537119b3d","src/literal/mod.rs":"59fd8bc37784906d729167b69bd14f91094c4c82749984ee5ffd41ae62c38af2","src/pattern.rs":"993d8b6b4bcea5e02bee3c76e17c356a5a47f8fc53c5555edfd1ebb71c0878bf","src/pikevm.rs":"6c0eaa7e878c945ac4c3c545c98f5706ad04846fc432a5086c8ee78eb030dfa7","src/pool.rs":"942e991ae31ef349bd76efd78b2a712c01166dec965bf93742977ed0870d5a10","src/prog.rs":"8ab44101bb2aaf51f00872798f3d926ef150744898538b4ceb5f3d38fbf861f0","src/re_builder.rs":"943344bf6e2fc90902ee04b11b741c32418ac6814b21b7982cc0a3a817713f3e","src/re_bytes.rs":"15a53cccd7d573f668ac38158b140c0e0e51a31ac16de800f24e72c8d364561e","src/re_set.rs":"8b9b9b78fc1dbd8731298269f410c67689baedf4116fb617afd309fd4bfe116c","src/re_trait.rs":"df29beedc00933e34e7f89b4db645cba18db7f7e4cf3f1d48328bddada5191d5","src/re_unicode.rs":"940be2629a8176065f821b419693135fdfdb016b573e8e00a10d963712bf1fa8","src/sparse.rs":"0da3ddb7972109869248a764dbb10254555f4bb51c375e89fb3fab9cafa47320","src/testdata/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","src/testdata/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","src/testdata/basic.dat":"b5b33aa89d48a61cd67cb1fbfd8f70e62c83e30b86256f9f915a5190dd38ff06","src/testdata/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","src/testdata/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","src/utf8.rs":"f85a356ff5d5b19e417b73ce1dd84581b21d283f6dddd195547c30af9c60bd1a","test":"0d62fdca7da12fc19ea5306b5de1d83e68d9365a029c043d524334da138b0304","tests/api.rs":"7b2a0ef75e99b9776094967bd66e9cdeaa8e11359f5f0a12bd08ef0e8d0c11fc","tests/api_str.rs":"2ae38c04e7e8fac008b609a820d0b1561ba75f39b0edc0987d6d3d06132da77f","tests/bytes.rs":"edc50f526c5fee43df89d639ef18b237e4eb91e9d533bfc43f3cbab7417d38ba","tests/consistent.rs":"d69435154c09478076497216e43081a835ac65147181a4fbddad7bff469605b2","tests/crates_regex.rs":"91a59d470e0700b4bcb3ff735d06799f3107b8ef4875a2e9904607b164be0326","tests/crazy.rs":"c0d56380dff19bdd5d7a3eb731d0e2dc564e169a1b73c81e1879b1e87f5f5f77","tests/flags.rs":"05caace2c81a99d2168037f3a38035d4dffe9f85ef3ebd7ef18b1bc6612f1ea8","tests/fowler.rs":"d78cf914de40b1e125cc92b65ccb444d462586bd07b5e05de4e4a1b5de16aa76","tests/macros.rs":"6db70c16fc90df13e6b30d2b606f8b6dd4dc976697967f6ee001b15aab6d0b19","tests/macros_bytes.rs":"a049f528a93173a1bb176cd46932dce1880679f4a1752e099be920f0e4546fd0","tests/macros_str.rs":"e585b1461374c45a2eca44ca045bc3c1fe984b2b4212e432b0c695b420e708b7","tests/misc.rs":"395f52793fa022e4cdda78675b6a6fba1a3106b4b99c834c39f7801574054bd1","tests/multiline.rs":"1b1a3326ed976437c1357f01d81833ece7ea244f38826246eab55cacd5d0862a","tests/noparse.rs":"12b6be0eff3d80779d33c6459396c74c0f6ebf4ddc9f1d33c3e747ea9e3bf268","tests/regression.rs":"3b15568d79ae7d9845fda737a93cd518db01a7ed388b2ac437389a2a1d50f129","tests/regression_fuzz.rs":"3c99498af578044159336c63c8ac81d65bfc611a0aa80217400544d5caa66827","tests/replace.rs":"5f1bbf3f89de8cd021406a4affd0d07484ba194ac791ac307efd66f2792a2366","tests/searcher.rs":"ce35e47b0a276a7e8c9060c6a0b225ffba163aebc61fbc15555a6897fa0e552c","tests/set.rs":"a69fab05adabdbf27e788d51d7cea06acfd9017182e2f201d592b45c4fec5618","tests/shortest_match.rs":"a2c94390c0d61bc24796b4c1288c924e90c8c9c6156fdebb858175177a194a42","tests/suffix_reverse.rs":"b95f89397404871227d9efe6df23b9ded147f183db81597e608f693955c668b5","tests/test_backtrack.rs":"b70c5e5f1241efd76dd9f9dd4a4df8a7b38113bd407d1f5f56867f1176177a59","tests/test_backtrack_bytes.rs":"b8a111d4b4109c8bba7e2afb650572c495a14d357fb1f743c1076fb001f704b5","tests/test_backtrack_utf8bytes.rs":"c0c279785d18beac2b4e178e7bf6c14ed235d65f00ca467cfd9c333d79487649","tests/test_crates_regex.rs":"fd9525c2eef0e2f8cb7f787bc2b721bcd0b5d84f3bca49adfe48d657a99c721a","tests/test_default.rs":"b32c11a43da4379a3717dd7a5f152c811257c7d6595c9d3c51f2de102e320c87","tests/test_default_bytes.rs":"831d3e6bfb882feb15f700e30304bd34328f888fb4c15c7169371e25024ce9a7","tests/test_nfa.rs":"f119fc43a018249c39c813d57096b0654ff69f337345f2bbd9b0e61cc9137285","tests/test_nfa_bytes.rs":"89eae3bef6a1d0bcea6b5de5be35ad72f613f2ceb8b58fe82a6c6ef2ccdc07d0","tests/test_nfa_utf8bytes.rs":"7d830b4aa401887d7cf098b62fed4cd8017ef8b61f625c7c9a2159a6b4cfeb71","tests/unicode.rs":"d0a2fec28cb28910a5ec1a51849dcf7923673a2c3bc0ffc24025f7c37667add2","tests/word_boundary.rs":"7081317ddcec1e82dd4a2090a571c6abf2ff4bbfa8cd10395e1eb3f386157fae","tests/word_boundary_ascii.rs":"cd0be5b5b485de0ba7994b42e2864585556c3d2d8bf5eab05b58931d9aaf4b87","tests/word_boundary_unicode.rs":"75dbcc35d3abc0f9795c2ea99e216dc227b0a5b58e9ca5eef767815ff0513921"},"package":"d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f"}
\ No newline at end of file diff --git a/vendor/regex/CHANGELOG.md b/vendor/regex/CHANGELOG.md index 44274acac..e019afb2f 100644 --- a/vendor/regex/CHANGELOG.md +++ b/vendor/regex/CHANGELOG.md @@ -1,3 +1,192 @@ +1.8.4 (2023-06-05) +================== +This is a patch release that fixes a bug where `(?-u:\B)` was allowed in +Unicode regexes, despite the fact that the current matching engines can report +match offsets between the code units of a single UTF-8 encoded codepoint. That +in turn means that match offsets that split a codepoint could be reported, +which in turn results in panicking when one uses them to slice a `&str`. + +This bug occurred in the transition to `regex 1.8` because the underlying +syntactical error that prevented this regex from compiling was intentionally +removed. That's because `(?-u:\B)` will be permitted in Unicode regexes in +`regex 1.9`, but the matching engines will guarantee to never report match +offsets that split a codepoint. When the underlying syntactical error was +removed, no code was added to ensure that `(?-u:\B)` didn't compile in the +`regex 1.8` transition release. This release, `regex 1.8.4`, adds that code +such that `Regex::new(r"(?-u:\B)")` returns to the `regex <1.8` behavior of +not compiling. (A `bytes::Regex` can still of course compile it.) + +Bug fixes: + +* [BUG #1006](https://github.com/rust-lang/regex/issues/1006): +Fix a bug where `(?-u:\B)` was allowed in Unicode regexes, and in turn could +lead to match offsets that split a codepoint in `&str`. + + +1.8.3 (2023-05-25) +================== +This is a patch release that fixes a bug where the regex would report a +match at every position even when it shouldn't. This could occur in a very +small subset of regexes, usually an alternation of simple literals that +have particular properties. (See the issue linked below for a more precise +description.) + +Bug fixes: + +* [BUG #999](https://github.com/rust-lang/regex/issues/999): +Fix a bug where a match at every position is erroneously reported. + + +1.8.2 (2023-05-22) +================== +This is a patch release that fixes a bug where regex compilation could panic +in debug mode for regexes with large counted repetitions. For example, +`a{2147483516}{2147483416}{5}` resulted in an integer overflow that wrapped +in release mode but panicking in debug mode. Despite the unintended wrapping +arithmetic in release mode, it didn't cause any other logical bugs since the +errant code was for new analysis that wasn't used yet. + +Bug fixes: + +* [BUG #995](https://github.com/rust-lang/regex/issues/995): +Fix a bug where regex compilation with large counted repetitions could panic. + + +1.8.1 (2023-04-21) +================== +This is a patch release that fixes a bug where a regex match could be reported +where none was found. Specifically, the bug occurs when a pattern contains some +literal prefixes that could be extracted _and_ an optional word boundary in the +prefix. + +Bug fixes: + +* [BUG #981](https://github.com/rust-lang/regex/issues/981): +Fix a bug where a word boundary could interact with prefix literal +optimizations and lead to a false positive match. + + +1.8.0 (2023-04-20) +================== +This is a sizeable release that will be soon followed by another sizeable +release. Both of them will combined close over 40 existing issues and PRs. + +This first release, despite its size, essentially represents preparatory work +for the second release, which will be even bigger. Namely, this release: + +* Increases the MSRV to Rust 1.60.0, which was released about 1 year ago. +* Upgrades its dependency on `aho-corasick` to the recently released 1.0 +version. +* Upgrades its dependency on `regex-syntax` to the simultaneously released +`0.7` version. The changes to `regex-syntax` principally revolve around a +rewrite of its literal extraction code and a number of simplifications and +optimizations to its high-level intermediate representation (HIR). + +The second release, which will follow ~shortly after the release above, will +contain a soup-to-nuts rewrite of every regex engine. This will be done by +bringing [`regex-automata`](https://github.com/BurntSushi/regex-automata) into +this repository, and then changing the `regex` crate to be nothing but an API +shim layer on top of `regex-automata`'s API. + +These tandem releases are the culmination of about 3 +years of on-and-off work that [began in earnest in March +2020](https://github.com/rust-lang/regex/issues/656). + +Because of the scale of changes involved in these releases, I would love to +hear about your experience. Especially if you notice undocumented changes in +behavior or performance changes (positive *or* negative). + +Most changes in the first release are listed below. For more details, please +see the commit log, which reflects a linear and decently documented history +of all changes. + +New features: + +* [FEATURE #501](https://github.com/rust-lang/regex/issues/501): +Permit many more characters to be escaped, even if they have no significance. +More specifically, any ASCII character except for `[0-9A-Za-z<>]` can now be +escaped. Also, a new routine, `is_escapeable_character`, has been added to +`regex-syntax` to query whether a character is escapeable or not. +* [FEATURE #547](https://github.com/rust-lang/regex/issues/547): +Add `Regex::captures_at`. This filles a hole in the API, but doesn't otherwise +introduce any new expressive power. +* [FEATURE #595](https://github.com/rust-lang/regex/issues/595): +Capture group names are now Unicode-aware. They can now begin with either a `_` +or any "alphabetic" codepoint. After the first codepoint, subsequent codepoints +can be any sequence of alpha-numeric codepoints, along with `_`, `.`, `[` and +`]`. Note that replacement syntax has not changed. +* [FEATURE #810](https://github.com/rust-lang/regex/issues/810): +Add `Match::is_empty` and `Match::len` APIs. +* [FEATURE #905](https://github.com/rust-lang/regex/issues/905): +Add an `impl Default for RegexSet`, with the default being the empty set. +* [FEATURE #908](https://github.com/rust-lang/regex/issues/908): +A new method, `Regex::static_captures_len`, has been added which returns the +number of capture groups in the pattern if and only if every possible match +always contains the same number of matching groups. +* [FEATURE #955](https://github.com/rust-lang/regex/issues/955): +Named captures can now be written as `(?<name>re)` in addition to +`(?P<name>re)`. +* FEATURE: `regex-syntax` now supports empty character classes. +* FEATURE: `regex-syntax` now has an optional `std` feature. (This will come +to `regex` in the second release.) +* FEATURE: The `Hir` type in `regex-syntax` has had a number of simplifications +made to it. +* FEATURE: `regex-syntax` has support for a new `R` flag for enabling CRLF +mode. This will be supported in `regex` proper in the second release. +* FEATURE: `regex-syntax` now has proper support for "regex that never +matches" via `Hir::fail()`. +* FEATURE: The `hir::literal` module of `regex-syntax` has been completely +re-worked. It now has more documentation, examples and advice. +* FEATURE: The `allow_invalid_utf8` option in `regex-syntax` has been renamed +to `utf8`, and the meaning of the boolean has been flipped. + +Performance improvements: + +* PERF: The upgrade to `aho-corasick 1.0` may improve performance in some +cases. It's difficult to characterize exactly which patterns this might impact, +but if there are a small number of longish (>= 4 bytes) prefix literals, then +it might be faster than before. + +Bug fixes: + +* [BUG #514](https://github.com/rust-lang/regex/issues/514): +Improve `Debug` impl for `Match` so that it doesn't show the entire haystack. +* BUGS [#516](https://github.com/rust-lang/regex/issues/516), +[#731](https://github.com/rust-lang/regex/issues/731): +Fix a number of issues with printing `Hir` values as regex patterns. +* [BUG #610](https://github.com/rust-lang/regex/issues/610): +Add explicit example of `foo|bar` in the regex syntax docs. +* [BUG #625](https://github.com/rust-lang/regex/issues/625): +Clarify that `SetMatches::len` does not (regretably) refer to the number of +matches in the set. +* [BUG #660](https://github.com/rust-lang/regex/issues/660): +Clarify "verbose mode" in regex syntax documentation. +* BUG [#738](https://github.com/rust-lang/regex/issues/738), +[#950](https://github.com/rust-lang/regex/issues/950): +Fix `CaptureLocations::get` so that it never panics. +* [BUG #747](https://github.com/rust-lang/regex/issues/747): +Clarify documentation for `Regex::shortest_match`. +* [BUG #835](https://github.com/rust-lang/regex/issues/835): +Fix `\p{Sc}` so that it is equivalent to `\p{Currency_Symbol}`. +* [BUG #846](https://github.com/rust-lang/regex/issues/846): +Add more clarifying documentation to the `CompiledTooBig` error variant. +* [BUG #854](https://github.com/rust-lang/regex/issues/854): +Clarify that `regex::Regex` searches as if the haystack is a sequence of +Unicode scalar values. +* [BUG #884](https://github.com/rust-lang/regex/issues/884): +Replace `__Nonexhaustive` variants with `#[non_exhaustive]` attribute. +* [BUG #893](https://github.com/rust-lang/regex/pull/893): +Optimize case folding since it can get quite slow in some pathological cases. +* [BUG #895](https://github.com/rust-lang/regex/issues/895): +Reject `(?-u:\W)` in `regex::Regex` APIs. +* [BUG #942](https://github.com/rust-lang/regex/issues/942): +Add a missing `void` keyword to indicate "no parameters" in C API. +* [BUG #965](https://github.com/rust-lang/regex/issues/965): +Fix `\p{Lc}` so that it is equivalent to `\p{Cased_Letter}`. +* [BUG #975](https://github.com/rust-lang/regex/issues/975): +Clarify documentation for `\pX` syntax. + + 1.7.3 (2023-03-24) ================== This is a small release that fixes a bug in `Regex::shortest_match_at` that @@ -743,7 +932,7 @@ Bug gixes: ================== This release includes a ground-up rewrite of the regex-syntax crate, which has been in development for over a year. - +731 New features: * Error messages for invalid regexes have been greatly improved. You get these diff --git a/vendor/regex/Cargo.lock b/vendor/regex/Cargo.lock index f91c8879b..6cf8da756 100644 --- a/vendor/regex/Cargo.lock +++ b/vendor/regex/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" dependencies = [ "memchr", ] @@ -19,9 +19,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", "libc", @@ -36,9 +36,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.139" +version = "0.2.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" [[package]] name = "memchr" @@ -75,7 +75,7 @@ dependencies = [ [[package]] name = "regex" -version = "1.7.3" +version = "1.8.4" dependencies = [ "aho-corasick", "lazy_static", @@ -87,9 +87,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.29" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" [[package]] name = "wasi" diff --git a/vendor/regex/Cargo.toml b/vendor/regex/Cargo.toml index 37e44fb3b..b4371c4b9 100644 --- a/vendor/regex/Cargo.toml +++ b/vendor/regex/Cargo.toml @@ -10,9 +10,10 @@ # See Cargo.toml.orig for the original contents. [package] -edition = "2018" +edition = "2021" +rust-version = "1.60.0" name = "regex" -version = "1.7.3" +version = "1.8.4" authors = ["The Rust Project Developers"] exclude = [ "/scripts/*", @@ -31,13 +32,13 @@ license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex" [profile.bench] -debug = true +debug = 2 [profile.release] -debug = true +debug = 2 [profile.test] -debug = true +debug = 2 [lib] doctest = false @@ -80,15 +81,15 @@ name = "crates-regex" path = "tests/test_crates_regex.rs" [dependencies.aho-corasick] -version = "0.7.18" +version = "1.0.0" optional = true [dependencies.memchr] -version = "2.4.0" +version = "2.5.0" optional = true [dependencies.regex-syntax] -version = "0.6.29" +version = "0.7.2" default-features = false [dev-dependencies.lazy_static] diff --git a/vendor/regex/README.md b/vendor/regex/README.md index 861417da6..020b35395 100644 --- a/vendor/regex/README.md +++ b/vendor/regex/README.md @@ -9,7 +9,7 @@ by [RE2](https://github.com/google/re2). [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex) -[![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) +[![Rust](https://img.shields.io/badge/rust-1.60.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) ### Documentation @@ -218,7 +218,7 @@ The full set of features one can disable are ### Minimum Rust version policy -This crate's minimum supported `rustc` version is `1.41.1`. +This crate's minimum supported `rustc` version is `1.60.0`. The current **tentative** policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if diff --git a/vendor/regex/record/README.md b/vendor/regex/record/README.md new file mode 100644 index 000000000..432b06ab9 --- /dev/null +++ b/vendor/regex/record/README.md @@ -0,0 +1,4 @@ +This directory contains various recordings of results. These are committed to +the repository so that they can be compared over time. (At the time of writing, +there is no tooling for facilitating this comparison. It has to be done +manually.) diff --git a/vendor/regex/record/compile-test/2023-04-19_1.7.3.csv b/vendor/regex/record/compile-test/2023-04-19_1.7.3.csv new file mode 100644 index 000000000..af62da10a --- /dev/null +++ b/vendor/regex/record/compile-test/2023-04-19_1.7.3.csv @@ -0,0 +1,11 @@ +name,crate,revision,profile,duration,size,relative-size +regex__dev__std_perf_unicode,regex,9582040009,dev,1.824209152s,3434992,3113064 +regex__dev__std,regex,9582040009,dev,1.206314935s,1362392,1040464 +regex__dev__std_perf,regex,9582040009,dev,1.543583435s,2726384,2404456 +regex__dev__std_unicode,regex,9582040009,dev,1.490095643s,2066904,1744976 +regex__dev__std_unicode-case_unicode-perl,regex,9582040009,dev,1.292011694s,1812952,1491024 +regex__release__std_perf_unicode,regex,9582040009,release,2.398133563s,1616216,1294368 +regex__release__std,regex,9582040009,release,1.413680252s,694592,372744 +regex__release__std_perf,regex,9582040009,release,2.341496191s,1124696,802848 +regex__release__std_unicode,regex,9582040009,release,1.671407822s,1190208,868360 +regex__release__std_unicode-case_unicode-perl,regex,9582040009,release,1.441712198s,932160,610312 diff --git a/vendor/regex/record/compile-test/2023-04-20_master.csv b/vendor/regex/record/compile-test/2023-04-20_master.csv new file mode 100644 index 000000000..4c3e91674 --- /dev/null +++ b/vendor/regex/record/compile-test/2023-04-20_master.csv @@ -0,0 +1,11 @@ +name,crate,revision,profile,duration,size,relative-size +regex__dev__std_perf_unicode,regex,f1f99af2bc,dev,1.834267609s,3799536,3477608 +regex__dev__std,regex,f1f99af2bc,dev,1.263958602s,1427928,1106000 +regex__dev__std_perf,regex,f1f99af2bc,dev,1.631302845s,3234288,2912360 +regex__dev__std_unicode,regex,f1f99af2bc,dev,1.550536696s,1997272,1675344 +regex__dev__std_unicode-case_unicode-perl,regex,f1f99af2bc,dev,1.341622852s,1739224,1417296 +regex__release__std_perf_unicode,regex,f1f99af2bc,release,2.475080323s,1755480,1433632 +regex__release__std,regex,f1f99af2bc,release,1.45990031s,731456,409608 +regex__release__std_perf,regex,f1f99af2bc,release,2.421787211s,1259864,938016 +regex__release__std_unicode,regex,f1f99af2bc,release,1.693972619s,1227072,905224 +regex__release__std_unicode-case_unicode-perl,regex,f1f99af2bc,release,1.528003306s,969024,647176 diff --git a/vendor/regex/record/compile-test/README.md b/vendor/regex/record/compile-test/README.md new file mode 100644 index 000000000..7291d5d37 --- /dev/null +++ b/vendor/regex/record/compile-test/README.md @@ -0,0 +1,27 @@ +This directory contains the results of compilation tests. Specifically, +the results are from testing both the from scratch compilation time and +relative binary size increases of various features for both the `regex` and +`regex-automata` crates. + +Here's an example of how to run these tests for just the `regex` crate. You'll +need the `regex-cli` command installed, which can be found in the `regex-cli` +directory in the root of this repository. + +This must be run in the root of a checkout of this repository. + +``` +$ mkdir /tmp/regex-compile-test +$ regex-cli compile-test ./ /tmp/regex-compile-test | tee record/compile-test/2023-04-19_1.7.3.csv +``` + +You can then look at the results using a tool like [`xsv`][xsv]: + +``` +$ xsv table record/compile-test/2023-04-19_1.7.3.csv +``` + +Note that the relative binary size is computed by building a "baseline" hello +world program, and then subtracting that from the size of a binary that uses +the regex crate. + +[xsv]: https://github.com/BurntSushi/xsv diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs index 90ca25015..23e63ec89 100644 --- a/vendor/regex/src/compile.rs +++ b/vendor/regex/src/compile.rs @@ -4,7 +4,7 @@ use std::iter; use std::result; use std::sync::Arc; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::hir::{self, Hir, Look}; use regex_syntax::is_word_byte; use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; @@ -137,13 +137,24 @@ impl Compiler { } fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> { + if self.compiled.only_utf8 + && expr.properties().look_set().contains(Look::WordAsciiNegate) + { + return Err(Error::Syntax( + "ASCII-only \\B is not allowed in Unicode regexes \ + because it may result in invalid UTF-8 matches" + .to_string(), + )); + } // If we're compiling a forward DFA and we aren't anchored, then // add a `.*?` before the first capture group. // Other matching engines handle this by baking the logic into the // matching engine itself. let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; - self.compiled.is_anchored_start = expr.is_anchored_start(); - self.compiled.is_anchored_end = expr.is_anchored_end(); + self.compiled.is_anchored_start = + expr.properties().look_set_prefix().contains(Look::Start); + self.compiled.is_anchored_end = + expr.properties().look_set_suffix().contains(Look::End); if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; self.compiled.start = dotstar_patch.entry; @@ -159,6 +170,8 @@ impl Compiler { self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); + self.compiled.static_captures_len = + expr.properties().static_explicit_captures_len(); self.compile_finish() } @@ -168,10 +181,12 @@ impl Compiler { ) -> result::Result<Program, Error> { debug_assert!(exprs.len() > 1); - self.compiled.is_anchored_start = - exprs.iter().all(|e| e.is_anchored_start()); - self.compiled.is_anchored_end = - exprs.iter().all(|e| e.is_anchored_end()); + self.compiled.is_anchored_start = exprs + .iter() + .all(|e| e.properties().look_set_prefix().contains(Look::Start)); + self.compiled.is_anchored_end = exprs + .iter() + .all(|e| e.properties().look_set_suffix().contains(Look::End)); let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; @@ -272,17 +287,21 @@ impl Compiler { self.check_size()?; match *expr.kind() { Empty => self.c_empty(), - Literal(hir::Literal::Unicode(c)) => self.c_char(c), - Literal(hir::Literal::Byte(b)) => { - assert!(self.compiled.uses_bytes()); - self.c_byte(b) + Literal(hir::Literal(ref bytes)) => { + if self.compiled.is_reverse { + let mut bytes = bytes.to_vec(); + bytes.reverse(); + self.c_literal(&bytes) + } else { + self.c_literal(bytes) + } } Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), Class(hir::Class::Bytes(ref cls)) => { if self.compiled.uses_bytes() { self.c_class_bytes(cls.ranges()) } else { - assert!(cls.is_all_ascii()); + assert!(cls.is_ascii()); let mut char_ranges = vec![]; for r in cls.iter() { let (s, e) = (r.start() as char, r.end() as char); @@ -291,92 +310,94 @@ impl Compiler { self.c_class(&char_ranges) } } - Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::EndText) - } - Anchor(hir::Anchor::StartText) => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) => { - self.c_empty_look(prog::EmptyLook::EndText) - } - WordBoundary(hir::WordBoundary::Unicode) => { - if !cfg!(feature = "unicode-perl") { - return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" - .to_string(), - )); + Look(ref look) => match *look { + hir::Look::Start if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // We also make sure that all ASCII bytes are in a different - // class from non-ASCII bytes. Otherwise, it's possible for - // ASCII bytes to get lumped into the same class as non-ASCII - // bytes. This in turn may cause the lazy DFA to falsely start - // when it sees an ASCII byte that maps to a byte class with - // non-ASCII bytes. This ensures that never happens. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::WordBoundary) - } - WordBoundary(hir::WordBoundary::UnicodeNegate) => { - if !cfg!(feature = "unicode-perl") { + hir::Look::Start => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText), + hir::Look::StartLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartCRLF | hir::Look::EndCRLF => { return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" + "CRLF-aware line anchors are not supported yet" .to_string(), )); } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // See comments above for why we set the ASCII range here. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::NotWordBoundary) - } - WordBoundary(hir::WordBoundary::Ascii) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) - } - WordBoundary(hir::WordBoundary::AsciiNegate) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) - } - Group(ref g) => match g.kind { - hir::GroupKind::NonCapturing => self.c(&g.hir), - hir::GroupKind::CaptureIndex(index) => { - if index as usize >= self.compiled.captures.len() { - self.compiled.captures.push(None); + hir::Look::WordAscii => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + hir::Look::WordAsciiNegate => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } + hir::Look::WordUnicode => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); } - self.c_capture(2 * index as usize, &g.hir) + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. This in turn may cause the lazy DFA to falsely start + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::WordBoundary) } - hir::GroupKind::CaptureName { index, ref name } => { - if index as usize >= self.compiled.captures.len() { - let n = name.to_string(); - self.compiled.captures.push(Some(n.clone())); - self.capture_name_idx.insert(n, index as usize); + hir::Look::WordUnicodeNegate => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); } - self.c_capture(2 * index as usize, &g.hir) + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::NotWordBoundary) } }, + Capture(hir::Capture { index, ref name, ref sub }) => { + if index as usize >= self.compiled.captures.len() { + let name = match *name { + None => None, + Some(ref boxed_str) => Some(boxed_str.to_string()), + }; + self.compiled.captures.push(name.clone()); + if let Some(name) = name { + self.capture_name_idx.insert(name, index as usize); + } + } + self.c_capture(2 * index as usize, sub) + } Concat(ref es) => { if self.compiled.is_reverse { self.c_concat(es.iter().rev()) @@ -420,21 +441,19 @@ impl Compiler { } fn c_dotstar(&mut self) -> Result { - Ok(if !self.compiled.only_utf8() { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: false, - hir: Box::new(Hir::any(true)), - }))? - .unwrap() + let hir = if self.compiled.only_utf8() { + Hir::dot(hir::Dot::AnyChar) } else { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + Hir::dot(hir::Dot::AnyByte) + }; + Ok(self + .c(&Hir::repetition(hir::Repetition { + min: 0, + max: None, greedy: false, - hir: Box::new(Hir::any(false)), + sub: Box::new(hir), }))? - .unwrap() - }) + .unwrap()) } fn c_char(&mut self, c: char) -> ResultOrEmpty { @@ -457,7 +476,11 @@ impl Compiler { fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { use std::mem::size_of; - assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } if self.compiled.uses_bytes() { Ok(Some(CompileClass { c: self, ranges }.compile()?)) } else { @@ -482,7 +505,11 @@ impl Compiler { &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty { - debug_assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } let first_split_entry = self.insts.len(); let mut holes = vec![]; @@ -513,6 +540,52 @@ impl Compiler { Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } + fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty { + match core::str::from_utf8(bytes) { + Ok(string) => { + let mut it = string.chars(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(ch) => { + if let Some(p) = self.c_char(ch)? { + break p; + } + } + } + }; + for ch in it { + if let Some(p) = self.c_char(ch)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + Err(_) => { + assert!(self.compiled.uses_bytes()); + let mut it = bytes.iter().copied(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(byte) => { + if let Some(p) = self.c_byte(byte)? { + break p; + } + } + } + }; + for byte in it { + if let Some(p) = self.c_byte(byte)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + } + } + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator<Item = &'a Hir>, @@ -587,19 +660,15 @@ impl Compiler { } fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { - use regex_syntax::hir::RepetitionKind::*; - match rep.kind { - ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), - ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), - OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), - Range(hir::RepetitionRange::Exactly(min_max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) - } - Range(hir::RepetitionRange::AtLeast(min)) => { - self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) + match (rep.min, rep.max) { + (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy), + (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy), + (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy), + (min, None) => { + self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min) } - Range(hir::RepetitionRange::Bounded(min, max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min, max) + (min, Some(max)) => { + self.c_repeat_range(&rep.sub, rep.greedy, min, max) } } } diff --git a/vendor/regex/src/dfa.rs b/vendor/regex/src/dfa.rs index dc9952120..78ed71021 100644 --- a/vendor/regex/src/dfa.rs +++ b/vendor/regex/src/dfa.rs @@ -1576,7 +1576,7 @@ impl<'a> Fsm<'a> { /// inputs, a new state could be created for every byte of input. (This is /// bad for memory use, so we bound it with a cache.) fn approximate_size(&self) -> usize { - self.cache.size + self.prog.approximate_size() + self.cache.size } } diff --git a/vendor/regex/src/error.rs b/vendor/regex/src/error.rs index 3e0ec7521..6c341f604 100644 --- a/vendor/regex/src/error.rs +++ b/vendor/regex/src/error.rs @@ -6,8 +6,26 @@ use std::iter::repeat; pub enum Error { /// A syntax error. Syntax(String), - /// The compiled program exceeded the set size limit. - /// The argument is the size limit imposed. + /// The compiled program exceeded the set size + /// limit. The argument is the size limit imposed by + /// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit). Even + /// when not configured explicitly, it defaults to a reasonable limit. + /// + /// If you're getting this error, it occurred because your regex has been + /// compiled to an intermediate state that is too big. It is important to + /// note that exceeding this limit does _not_ mean the regex is too big to + /// _work_, but rather, the regex is big enough that it may wind up being + /// surprisingly slow when used in a search. In other words, this error is + /// meant to be a practical heuristic for avoiding a performance footgun, + /// and especially so for the case where the regex pattern is coming from + /// an untrusted source. + /// + /// There are generally two ways to move forward if you hit this error. + /// The first is to find some way to use a smaller regex. The second is to + /// increase the size limit via `RegexBuilder::size_limit`. However, if + /// your regex pattern is not from a trusted source, then neither of these + /// approaches may be appropriate. Instead, you'll have to determine just + /// how big of a regex you want to allow. CompiledTooBig(usize), /// Hints that destructuring should not be exhaustive. /// diff --git a/vendor/regex/src/exec.rs b/vendor/regex/src/exec.rs index b9abcdc04..ee8b589d2 100644 --- a/vendor/regex/src/exec.rs +++ b/vendor/regex/src/exec.rs @@ -4,9 +4,9 @@ use std::panic::AssertUnwindSafe; use std::sync::Arc; #[cfg(feature = "perf-literal")] -use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; -use regex_syntax::hir::literal::Literals; -use regex_syntax::hir::Hir; +use aho_corasick::{AhoCorasick, MatchKind}; +use regex_syntax::hir::literal; +use regex_syntax::hir::{Hir, Look}; use regex_syntax::ParserBuilder; use crate::backtrack; @@ -78,15 +78,18 @@ struct ExecReadOnly { /// not supported.) Note that this program contains an embedded `.*?` /// preceding the first capture group, unless the regex is anchored at the /// beginning. + #[allow(dead_code)] dfa: Program, /// The same as above, except the program is reversed (and there is no /// preceding `.*?`). This is used by the DFA to find the starting location /// of matches. + #[allow(dead_code)] dfa_reverse: Program, /// A set of suffix literals extracted from the regex. /// /// Prefix literals are stored on the `Program`, since they are used inside /// the matching engines. + #[allow(dead_code)] suffixes: LiteralSearcher, /// An Aho-Corasick automaton with leftmost-first match semantics. /// @@ -98,7 +101,7 @@ struct ExecReadOnly { /// if we were to exhaust the ID space, we probably would have long /// surpassed the compilation size limit. #[cfg(feature = "perf-literal")] - ac: Option<AhoCorasick<u32>>, + ac: Option<AhoCorasick>, /// match_type encodes as much upfront knowledge about how we're going to /// execute a search as possible. match_type: MatchType, @@ -121,8 +124,8 @@ pub struct ExecBuilder { /// literals. struct Parsed { exprs: Vec<Hir>, - prefixes: Literals, - suffixes: Literals, + prefixes: literal::Seq, + suffixes: literal::Seq, bytes: bool, } @@ -228,8 +231,8 @@ impl ExecBuilder { /// Parse the current set of patterns into their AST and extract literals. fn parse(&self) -> Result<Parsed, Error> { let mut exprs = Vec::with_capacity(self.options.pats.len()); - let mut prefixes = Some(Literals::empty()); - let mut suffixes = Some(Literals::empty()); + let mut prefixes = Some(literal::Seq::empty()); + let mut suffixes = Some(literal::Seq::empty()); let mut bytes = false; let is_set = self.options.pats.len() > 1; // If we're compiling a regex set and that set has any anchored @@ -243,54 +246,103 @@ impl ExecBuilder { .swap_greed(self.options.swap_greed) .ignore_whitespace(self.options.ignore_whitespace) .unicode(self.options.unicode) - .allow_invalid_utf8(!self.only_utf8) + .utf8(self.only_utf8) .nest_limit(self.options.nest_limit) .build(); let expr = parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; - bytes = bytes || !expr.is_always_utf8(); + let props = expr.properties(); + // This used to just check whether the HIR matched valid UTF-8 + // or not, but in regex-syntax 0.7, we changed our definition of + // "matches valid UTF-8" to exclude zero-width matches. And in + // particular, previously, we considered WordAsciiNegate (that + // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our + // matcher engines were built under this assumption and fixing + // them is not worth it with the imminent plan to switch over to + // regex-automata. So for now, we retain the previous behavior by + // just explicitly treating the presence of a negated ASCII word + // boundary as forcing use to use a byte oriented automaton. + bytes = bytes + || !props.is_utf8() + || props.look_set().contains(Look::WordAsciiNegate); if cfg!(feature = "perf-literal") { - if !expr.is_anchored_start() && expr.is_any_anchored_start() { + if !props.look_set_prefix().contains(Look::Start) + && props.look_set().contains(Look::Start) + { // Partial anchors unfortunately make it hard to use // prefixes, so disable them. prefixes = None; - } else if is_set && expr.is_anchored_start() { + } else if is_set + && props.look_set_prefix_any().contains(Look::Start) + { // Regex sets with anchors do not go well with literal // optimizations. prefixes = None; + } else if props.look_set_prefix_any().contains_word() { + // The new literal extractor ignores look-around while + // the old one refused to extract prefixes from regexes + // that began with a \b. These old creaky regex internals + // can't deal with it, so we drop it. + prefixes = None; + } else if props.look_set_prefix_any().contains(Look::StartLF) { + // Similar to the reasoning for word boundaries, this old + // regex engine can't handle literal prefixes with '(?m:^)' + // at the beginning of a regex. + prefixes = None; } - prefixes = prefixes.and_then(|mut prefixes| { - if !prefixes.union_prefixes(&expr) { - None - } else { - Some(prefixes) - } - }); - if !expr.is_anchored_end() && expr.is_any_anchored_end() { + if !props.look_set_suffix().contains(Look::End) + && props.look_set().contains(Look::End) + { // Partial anchors unfortunately make it hard to use // suffixes, so disable them. suffixes = None; - } else if is_set && expr.is_anchored_end() { + } else if is_set + && props.look_set_suffix_any().contains(Look::End) + { // Regex sets with anchors do not go well with literal // optimizations. suffixes = None; + } else if props.look_set_suffix_any().contains_word() { + // See the prefix case for reasoning here. + suffixes = None; + } else if props.look_set_suffix_any().contains(Look::EndLF) { + // See the prefix case for reasoning here. + suffixes = None; } - suffixes = suffixes.and_then(|mut suffixes| { - if !suffixes.union_suffixes(&expr) { - None + + let (mut pres, mut suffs) = + if prefixes.is_none() && suffixes.is_none() { + (literal::Seq::infinite(), literal::Seq::infinite()) } else { - Some(suffixes) - } + literal_analysis(&expr) + }; + // These old creaky regex internals can't handle cases where + // the literal sequences are exact but there are look-around + // assertions. So we make sure the sequences are inexact if + // there are look-around assertions anywhere. This forces the + // regex engines to run instead of assuming that a literal + // match implies an overall match. + if !props.look_set().is_empty() { + pres.make_inexact(); + suffs.make_inexact(); + } + prefixes = prefixes.and_then(|mut prefixes| { + prefixes.union(&mut pres); + Some(prefixes) + }); + suffixes = suffixes.and_then(|mut suffixes| { + suffixes.union(&mut suffs); + Some(suffixes) }); } exprs.push(expr); } Ok(Parsed { exprs, - prefixes: prefixes.unwrap_or_else(Literals::empty), - suffixes: suffixes.unwrap_or_else(Literals::empty), + prefixes: prefixes.unwrap_or_else(literal::Seq::empty), + suffixes: suffixes.unwrap_or_else(literal::Seq::empty), bytes, }) } @@ -356,7 +408,7 @@ impl ExecBuilder { } #[cfg(feature = "perf-literal")] - fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> { + fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> { if parsed.exprs.len() != 1 { return None; } @@ -370,10 +422,9 @@ impl ExecBuilder { return None; } Some( - AhoCorasickBuilder::new() + AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) - .auto_configure(&lits) - .build_with_size::<u32, _, _>(&lits) + .build(&lits) // This should never happen because we'd long exceed the // compilation limit for regexes first. .expect("AC automaton too big"), @@ -1311,6 +1362,12 @@ impl Exec { pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { &self.ro.nfa.capture_name_idx } + + /// If the number of capture groups in every match is always the same, then + /// return that number. Otherwise return `None`. + pub fn static_captures_len(&self) -> Option<usize> { + self.ro.nfa.static_captures_len + } } impl Clone for Exec { @@ -1382,7 +1439,18 @@ impl ExecReadOnly { // This case shouldn't happen. When the regex isn't // anchored, then complete prefixes should imply complete // suffixes. - Some(MatchType::Literal(MatchLiteralType::Unanchored)) + // + // The above is wrong! This case can happen. While + // complete prefixes should imply complete suffixes + // here, that doesn't necessarily mean we have a useful + // prefix matcher! It could be the case that the literal + // searcher decided the prefixes---even though they are + // "complete"---weren't good enough and thus created an + // empty matcher. If that happens and we return Unanchored + // here, then we'll end up using that matcher, which is + // very bad because it matches at every position. So... + // return None. + None }; } None @@ -1557,7 +1625,7 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { // optimization pipeline, because this is a terribly inflexible way to go // about things. - if !expr.is_alternation_literal() { + if !expr.properties().is_alternation_literal() { return None; } let alts = match *expr.kind() { @@ -1565,25 +1633,19 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { _ => return None, // one literal isn't worth it }; - let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { - Literal::Unicode(c) => { - let mut buf = [0; 4]; - dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); - } - Literal::Byte(b) => { - dst.push(b); - } - }; - let mut lits = vec![]; for alt in alts { let mut lit = vec![]; match *alt.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes) + } HirKind::Concat(ref exprs) => { for e in exprs { match *e.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes); + } _ => unreachable!("expected literal, got {:?}", e), } } @@ -1595,6 +1657,48 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { Some(lits) } +#[cfg(not(feature = "perf-literal"))] +fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) { + (literal::Seq::infinite(), literal::Seq::infinite()) +} + +#[cfg(feature = "perf-literal")] +fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) { + const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)]; + + let mut prefixes = literal::Extractor::new() + .kind(literal::ExtractKind::Prefix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match prefixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + prefixes.keep_first_bytes(keep); + prefixes.minimize_by_preference(); + } + + let mut suffixes = literal::Extractor::new() + .kind(literal::ExtractKind::Suffix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match suffixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + suffixes.keep_last_bytes(keep); + suffixes.minimize_by_preference(); + } + + (prefixes, suffixes) +} + #[cfg(test)] mod test { #[test] diff --git a/vendor/regex/src/expand.rs b/vendor/regex/src/expand.rs index 67b514926..98fafc949 100644 --- a/vendor/regex/src/expand.rs +++ b/vendor/regex/src/expand.rs @@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { }) } -/// Returns true if and only if the given byte is allowed in a capture name. +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. fn is_valid_cap_letter(b: u8) -> bool { match b { b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, @@ -236,4 +237,11 @@ mod tests { find!(find_cap_ref17, "$x_$y", c!("x_", 3)); find!(find_cap_ref18, "${#}", c!("#", 4)); find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); } diff --git a/vendor/regex/src/lib.rs b/vendor/regex/src/lib.rs index 6b95739c5..82c1b77ad 100644 --- a/vendor/regex/src/lib.rs +++ b/vendor/regex/src/lib.rs @@ -199,6 +199,8 @@ instead.) This implementation executes regular expressions **only** on valid UTF-8 while exposing match locations as byte indices into the search string. (To relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) +Conceptually, the regex engine works by matching a haystack as if it were a +sequence of Unicode scalar values. Only simple case folding is supported. Namely, when matching case-insensitively, the characters are first mapped using the "simple" case @@ -285,9 +287,9 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). . any character except new line (includes new line with s flag) \d digit (\p{Nd}) \D not digit -\pN One-letter name Unicode character class +\pX Unicode character class identified by a one-letter name \p{Greek} Unicode character class (general category or script) -\PN Negated one-letter name Unicode character class +\PX Negated Unicode character class identified by a one-letter name \P{Greek} negated Unicode character class (general category or script) </pre> @@ -325,6 +327,25 @@ xy concatenation (x followed by y) x|y alternation (x or y, prefer x) </pre> +This example shows how an alternation works, and what it means to prefer a +branch in the alternation over subsequent branches. + +``` +use regex::Regex; + +let haystack = "samwise"; +// If 'samwise' comes first in our alternation, then it is +// preferred as a match, even if the regex engine could +// technically detect that 'sam' led to a match earlier. +let re = Regex::new(r"samwise|sam").unwrap(); +assert_eq!("samwise", re.find(haystack).unwrap().as_str()); +// But if 'sam' comes first, then it will match instead. +// In this case, it is impossible for 'samwise' to match +// because 'sam' is a prefix of it. +let re = Regex::new(r"sam|samwise").unwrap(); +assert_eq!("sam", re.find(haystack).unwrap().as_str()); +``` + ## Repetitions <pre class="rust"> @@ -360,12 +381,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`. <pre class="rust"> (exp) numbered capture group (indexed by opening parenthesis) -(?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]]) +(?P<name>exp) named (also numbered) capture group (names must be alpha-numeric) +(?<name>exp) named (also numbered) capture group (names must be alpha-numeric) (?:exp) non-capturing group (?flags) set flags within current group (?flags:exp) set flags for exp (non-capturing) </pre> +Capture group names must be any sequence of alpha-numeric Unicode codepoints, +in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or +an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic` +Unicode property, while numeric codepoints correspond to the union of the +`Decimal_Number`, `Letter_Number` and `Other_Number` general categories. + Flags are each a single character. For example, `(?x)` sets the flag `x` and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets @@ -379,9 +407,13 @@ m multi-line mode: ^ and $ match begin/end of line s allow . to match \n U swap the meaning of x* and x*? u Unicode support (enabled by default) -x ignore whitespace and allow line comments (starting with `#`) +x verbose mode, ignores whitespace and allow line comments (starting with `#`) </pre> +Note that in verbose mode, whitespace is ignored everywhere, including within +character classes. To insert whitespace, use its escaped form or a hex literal. +For example, `\ ` or `\x20` for an ASCII space. + Flags can be toggled within a pattern. Here's an example that matches case-insensitively for the first part but case-sensitively for the second part: diff --git a/vendor/regex/src/literal/imp.rs b/vendor/regex/src/literal/imp.rs index 90b2f1160..75fa6e37b 100644 --- a/vendor/regex/src/literal/imp.rs +++ b/vendor/regex/src/literal/imp.rs @@ -1,8 +1,8 @@ use std::mem; -use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; +use aho_corasick::{self, packed, AhoCorasick}; use memchr::{memchr, memchr2, memchr3, memmem}; -use regex_syntax::hir::literal::{Literal, Literals}; +use regex_syntax::hir::literal::{Literal, Seq}; /// A prefix extracted from a compiled regular expression. /// @@ -26,7 +26,7 @@ enum Matcher { /// A single substring, using vector accelerated routines when available. Memmem(Memmem), /// An Aho-Corasick automaton. - AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, + AC { ac: AhoCorasick, lits: Vec<Literal> }, /// A packed multiple substring searcher, using SIMD. /// /// Note that Aho-Corasick will actually use this packed searcher @@ -39,27 +39,26 @@ enum Matcher { impl LiteralSearcher { /// Returns a matcher that never matches and never advances the input. pub fn empty() -> Self { - Self::new(Literals::empty(), Matcher::Empty) + Self::new(Seq::infinite(), Matcher::Empty) } /// Returns a matcher for literal prefixes from the given set. - pub fn prefixes(lits: Literals) -> Self { + pub fn prefixes(lits: Seq) -> Self { let matcher = Matcher::prefixes(&lits); Self::new(lits, matcher) } /// Returns a matcher for literal suffixes from the given set. - pub fn suffixes(lits: Literals) -> Self { + pub fn suffixes(lits: Seq) -> Self { let matcher = Matcher::suffixes(&lits); Self::new(lits, matcher) } - fn new(lits: Literals, matcher: Matcher) -> Self { - let complete = lits.all_complete(); + fn new(lits: Seq, matcher: Matcher) -> Self { LiteralSearcher { - complete, - lcp: Memmem::new(lits.longest_common_prefix()), - lcs: Memmem::new(lits.longest_common_suffix()), + complete: lits.is_exact(), + lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")), + lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")), matcher, } } @@ -150,7 +149,7 @@ impl LiteralSearcher { Empty => 0, Bytes(ref sset) => sset.dense.len(), Memmem(_) => 1, - AC { ref ac, .. } => ac.pattern_count(), + AC { ref ac, .. } => ac.patterns_len(), Packed { ref lits, .. } => lits.len(), } } @@ -162,27 +161,31 @@ impl LiteralSearcher { Empty => 0, Bytes(ref sset) => sset.approximate_size(), Memmem(ref single) => single.approximate_size(), - AC { ref ac, .. } => ac.heap_bytes(), - Packed { ref s, .. } => s.heap_bytes(), + AC { ref ac, .. } => ac.memory_usage(), + Packed { ref s, .. } => s.memory_usage(), } } } impl Matcher { - fn prefixes(lits: &Literals) -> Self { + fn prefixes(lits: &Seq) -> Self { let sset = SingleByteSet::prefixes(lits); Matcher::new(lits, sset) } - fn suffixes(lits: &Literals) -> Self { + fn suffixes(lits: &Seq) -> Self { let sset = SingleByteSet::suffixes(lits); Matcher::new(lits, sset) } - fn new(lits: &Literals, sset: SingleByteSet) -> Self { - if lits.literals().is_empty() { + fn new(lits: &Seq, sset: SingleByteSet) -> Self { + if lits.is_empty() || lits.min_literal_len() == Some(0) { return Matcher::Empty; } + let lits = match lits.literals() { + None => return Matcher::Empty, + Some(members) => members, + }; if sset.dense.len() >= 26 { // Avoid trying to match a large number of single bytes. // This is *very* sensitive to a frequency analysis comparison @@ -195,26 +198,26 @@ impl Matcher { if sset.complete { return Matcher::Bytes(sset); } - if lits.literals().len() == 1 { - return Matcher::Memmem(Memmem::new(&lits.literals()[0])); + if lits.len() == 1 { + return Matcher::Memmem(Memmem::new(lits[0].as_bytes())); } - let pats = lits.literals().to_owned(); + let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect(); let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; - if lits.literals().len() <= 100 && !is_aho_corasick_fast { + if lits.len() <= 100 && !is_aho_corasick_fast { let mut builder = packed::Config::new() .match_kind(packed::MatchKind::LeftmostFirst) .builder(); if let Some(s) = builder.extend(&pats).build() { - return Matcher::Packed { s, lits: pats }; + return Matcher::Packed { s, lits: lits.to_owned() }; } } - let ac = AhoCorasickBuilder::new() + let ac = AhoCorasick::builder() .match_kind(aho_corasick::MatchKind::LeftmostFirst) - .dfa(true) - .build_with_size::<u32, _, _>(&pats) + .kind(Some(aho_corasick::AhoCorasickKind::DFA)) + .build(&pats) .unwrap(); - Matcher::AC { ac, lits: pats } + Matcher::AC { ac, lits: lits.to_owned() } } } @@ -257,7 +260,7 @@ impl<'a> Iterator for LiteralIter<'a> { } else { let next = &lits[0]; *lits = &lits[1..]; - Some(&**next) + Some(next.as_bytes()) } } LiteralIter::Packed(ref mut lits) => { @@ -266,7 +269,7 @@ impl<'a> Iterator for LiteralIter<'a> { } else { let next = &lits[0]; *lits = &lits[1..]; - Some(&**next) + Some(next.as_bytes()) } } } @@ -291,11 +294,15 @@ impl SingleByteSet { } } - fn prefixes(lits: &Literals) -> SingleByteSet { + fn prefixes(lits: &Seq) -> SingleByteSet { let mut sset = SingleByteSet::new(); - for lit in lits.literals() { + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { sset.complete = sset.complete && lit.len() == 1; - if let Some(&b) = lit.get(0) { + if let Some(&b) = lit.as_bytes().get(0) { if !sset.sparse[b as usize] { if b > 0x7F { sset.all_ascii = false; @@ -308,11 +315,15 @@ impl SingleByteSet { sset } - fn suffixes(lits: &Literals) -> SingleByteSet { + fn suffixes(lits: &Seq) -> SingleByteSet { let mut sset = SingleByteSet::new(); - for lit in lits.literals() { + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { sset.complete = sset.complete && lit.len() == 1; - if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { + if let Some(&b) = lit.as_bytes().last() { if !sset.sparse[b as usize] { if b > 0x7F { sset.all_ascii = false; diff --git a/vendor/regex/src/literal/mod.rs b/vendor/regex/src/literal/mod.rs index 980f52330..b9fb77aed 100644 --- a/vendor/regex/src/literal/mod.rs +++ b/vendor/regex/src/literal/mod.rs @@ -6,7 +6,7 @@ mod imp; #[allow(missing_docs)] #[cfg(not(feature = "perf-literal"))] mod imp { - use regex_syntax::hir::literal::Literals; + use regex_syntax::hir::literal::Seq; #[derive(Clone, Debug)] pub struct LiteralSearcher(()); @@ -16,11 +16,11 @@ mod imp { LiteralSearcher(()) } - pub fn prefixes(_: Literals) -> Self { + pub fn prefixes(_: Seq) -> Self { LiteralSearcher(()) } - pub fn suffixes(_: Literals) -> Self { + pub fn suffixes(_: Seq) -> Self { LiteralSearcher(()) } diff --git a/vendor/regex/src/prog.rs b/vendor/regex/src/prog.rs index c211f71d8..100862cf1 100644 --- a/vendor/regex/src/prog.rs +++ b/vendor/regex/src/prog.rs @@ -27,6 +27,9 @@ pub struct Program { pub captures: Vec<Option<String>>, /// Pointers to all named capture groups into `captures`. pub capture_name_idx: Arc<HashMap<String, usize>>, + /// If the number of capture groups is the same for all possible matches, + /// then this is that number. + pub static_captures_len: Option<usize>, /// A pointer to the start instruction. This can vary depending on how /// the program was compiled. For example, programs for use with the DFA /// engine have a `.*?` inserted at the beginning of unanchored regular @@ -83,6 +86,7 @@ impl Program { matches: vec![], captures: vec![], capture_name_idx: Arc::new(HashMap::new()), + static_captures_len: None, start: 0, byte_classes: vec![0; 256], only_utf8: true, diff --git a/vendor/regex/src/re_bytes.rs b/vendor/regex/src/re_bytes.rs index 07e9f98ac..e3a3b019b 100644 --- a/vendor/regex/src/re_bytes.rs +++ b/vendor/regex/src/re_bytes.rs @@ -17,7 +17,7 @@ use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; /// Match represents a single match of a regex in a haystack. /// /// The lifetime parameter `'t` refers to the lifetime of the matched text. -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Eq, PartialEq)] pub struct Match<'t> { text: &'t [u8], start: usize, @@ -37,6 +37,18 @@ impl<'t> Match<'t> { self.end } + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. #[inline] @@ -57,6 +69,24 @@ impl<'t> Match<'t> { } } +impl<'t> std::fmt::Debug for Match<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let mut fmt = f.debug_struct("Match"); + fmt.field("start", &self.start).field("end", &self.end); + if let Ok(s) = std::str::from_utf8(self.as_bytes()) { + fmt.field("bytes", &s); + } else { + // FIXME: It would be nice if this could be printed as a string + // with invalid UTF-8 replaced with hex escapes. A alloc would + // probably okay if that makes it easier, but regex-automata does + // (at time of writing) have internal routines that do this. So + // maybe we should expose them. + fmt.field("bytes", &self.as_bytes()); + } + fmt.finish() + } +} + impl<'t> From<Match<'t>> for Range<usize> { fn from(m: Match<'t>) -> Range<usize> { m.range() @@ -253,12 +283,7 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { - let mut locs = self.capture_locations(); - self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { - text, - locs: locs.0, - named_groups: self.0.capture_name_idx().clone(), - }) + self.captures_at(text, 0) } /// Returns an iterator over all the non-overlapping capture groups matched @@ -537,7 +562,14 @@ impl Regex { /// This method may have the same performance characteristics as /// `is_match`, except it provides an end location for a match. In /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match. + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. /// /// # Example /// @@ -598,6 +630,25 @@ impl Regex { .map(|(s, e)| Match::new(text, s, e)) } + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, start).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + /// This is like `captures`, but uses /// [`CaptureLocations`](struct.CaptureLocations.html) /// instead of @@ -667,6 +718,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { @@ -856,6 +947,27 @@ impl<'r> FusedIterator for CaptureNames<'r> {} /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` #[derive(Clone, Debug)] pub struct CaptureLocations(re_trait::Locations); diff --git a/vendor/regex/src/re_set.rs b/vendor/regex/src/re_set.rs index a6d886d76..7c8253f0c 100644 --- a/vendor/regex/src/re_set.rs +++ b/vendor/regex/src/re_set.rs @@ -289,6 +289,12 @@ impl RegexSet { } } +impl Default for RegexSet { + fn default() -> Self { + RegexSet::empty() + } +} + /// A set of matches returned by a regex set. #[derive(Clone, Debug)] pub struct SetMatches { @@ -315,6 +321,11 @@ impl SetMatches { } /// The total number of regexes in the set that created these matches. + /// + /// **WARNING:** This always returns the same value as [`RegexSet::len`]. + /// In particular, it does *not* return the number of elements yielded by + /// [`SetMatches::iter`]. The only way to determine the total number of + /// matched regexes is to iterate over them. pub fn len(&self) -> usize { self.matches.len() } diff --git a/vendor/regex/src/re_trait.rs b/vendor/regex/src/re_trait.rs index d0c717df5..505810c84 100644 --- a/vendor/regex/src/re_trait.rs +++ b/vendor/regex/src/re_trait.rs @@ -20,7 +20,7 @@ impl Locations { /// not match anything. The positions returned are *always* byte indices /// with respect to the original string matched. pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); + let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?); match (self.0.get(s), self.0.get(e)) { (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), _ => None, diff --git a/vendor/regex/src/re_unicode.rs b/vendor/regex/src/re_unicode.rs index 197510ea0..57689086d 100644 --- a/vendor/regex/src/re_unicode.rs +++ b/vendor/regex/src/re_unicode.rs @@ -25,7 +25,7 @@ pub fn escape(text: &str) -> String { /// Match represents a single match of a regex in a haystack. /// /// The lifetime parameter `'t` refers to the lifetime of the matched text. -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Eq, PartialEq)] pub struct Match<'t> { text: &'t str, start: usize, @@ -45,6 +45,18 @@ impl<'t> Match<'t> { self.end } + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. #[inline] @@ -65,6 +77,16 @@ impl<'t> Match<'t> { } } +impl<'t> std::fmt::Debug for Match<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Match") + .field("start", &self.start) + .field("end", &self.end) + .field("string", &self.as_str()) + .finish() + } +} + impl<'t> From<Match<'t>> for &'t str { fn from(m: Match<'t>) -> &'t str { m.as_str() @@ -309,12 +331,7 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { - let mut locs = self.capture_locations(); - self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { - text, - locs: locs.0, - named_groups: self.0.capture_name_idx().clone(), - }) + self.captures_at(text, 0) } /// Returns an iterator over all the non-overlapping capture groups matched @@ -595,7 +612,14 @@ impl Regex { /// This method may have the same performance characteristics as /// `is_match`, except it provides an end location for a match. In /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match. + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. /// /// # Example /// @@ -615,12 +639,12 @@ impl Regex { self.shortest_match_at(text, 0) } - /// Returns the same as shortest_match, but starts the search at the given - /// offset. + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. /// /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. + /// context into consideration. For example, the `\A` anchor can only match + /// when `start == 0`. pub fn shortest_match_at( &self, text: &str, @@ -656,6 +680,25 @@ impl Regex { .map(|(s, e)| Match::new(text, s, e)) } + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, start).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + /// This is like `captures`, but uses /// [`CaptureLocations`](struct.CaptureLocations.html) /// instead of @@ -725,6 +768,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { @@ -866,6 +949,27 @@ impl<'r, 't> FusedIterator for SplitN<'r, 't> {} /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` #[derive(Clone, Debug)] pub struct CaptureLocations(re_trait::Locations); diff --git a/vendor/regex/tests/regression.rs b/vendor/regex/tests/regression.rs index e8b252538..291062a77 100644 --- a/vendor/regex/tests/regression.rs +++ b/vendor/regex/tests/regression.rs @@ -220,3 +220,44 @@ matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5)); // See: https://github.com/rust-lang/regex/issues/862 mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1))); + +// See: https://github.com/rust-lang/regex/issues/981 +#[cfg(feature = "unicode")] +#[test] +fn regression_bad_word_boundary() { + let re = regex_new!(r#"(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))"#).unwrap(); + let hay = "ubi-Darwin-x86_64.tar.gz"; + assert!(!re.is_match(text!(hay))); + let hay = "ubi-Windows-x86_64.zip"; + assert!(re.is_match(text!(hay))); +} + +// See: https://github.com/rust-lang/regex/issues/982 +#[cfg(feature = "unicode-perl")] +#[test] +fn regression_unicode_perl_not_enabled() { + let pat = r"(\d+\s?(years|year|y))?\s?(\d+\s?(months|month|m))?\s?(\d+\s?(weeks|week|w))?\s?(\d+\s?(days|day|d))?\s?(\d+\s?(hours|hour|h))?"; + let re = regex_new!(pat); + assert!(re.is_ok()); +} + +// See: https://github.com/rust-lang/regex/issues/995 +#[test] +fn regression_big_regex_overflow() { + let pat = r" {2147483516}{2147483416}{5}"; + let re = regex_new!(pat); + assert!(re.is_err()); +} + +#[test] +fn regression_complete_literals_suffix_incorrect() { + let needles = vec![ + "aA", "bA", "cA", "dA", "eA", "fA", "gA", "hA", "iA", "jA", "kA", + "lA", "mA", "nA", "oA", "pA", "qA", "rA", "sA", "tA", "uA", "vA", + "wA", "xA", "yA", "zA", + ]; + let pattern = needles.join("|"); + let re = regex!(&pattern); + let hay = "FUBAR"; + assert_eq!(0, re.find_iter(text!(hay)).count()); +} diff --git a/vendor/regex/tests/regression_fuzz.rs b/vendor/regex/tests/regression_fuzz.rs index 4e76704d2..5f49530a7 100644 --- a/vendor/regex/tests/regression_fuzz.rs +++ b/vendor/regex/tests/regression_fuzz.rs @@ -29,3 +29,12 @@ fn big_regex_fails_to_compile() { let pat = "[\u{0}\u{e}\u{2}\\w~~>[l\t\u{0}]p?<]{971158}"; assert!(regex_new!(pat).is_err()); } + +// This was caught while on master but before a release went out(!). +// +// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=58173 +#[test] +fn todo() { + let pat = "(?:z|xx)@|xx"; + assert!(regex_new!(pat).is_ok()); +} diff --git a/vendor/regex/tests/replace.rs b/vendor/regex/tests/replace.rs index d65be072f..f23c57551 100644 --- a/vendor/regex/tests/replace.rs +++ b/vendor/regex/tests/replace.rs @@ -15,7 +15,7 @@ replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ"); replace!( groups, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", t!("$2 $1"), "w2 w1" @@ -23,7 +23,7 @@ replace!( replace!( double_dollar, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", t!("$2 $$1"), "w2 $1" @@ -33,7 +33,7 @@ replace!( replace!( named, replace_all, - r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)", + r"(?P<first>[^ ]+)[ ]+(?P<last>[^ ]+)(?P<space>[ ]*)", "w1 w2 w3 w4", t!("$last $first$space"), "w2 w1 w4 w3" @@ -51,7 +51,7 @@ replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); replace!( simple_expand, replace_all, - r"(?-u)(\w) (\w)", + r"([a-z]) ([a-z])", "a b", t!("$2 $1"), "b a" @@ -59,7 +59,7 @@ replace!( replace!( literal_dollar1, replace_all, - r"(?-u)(\w+) (\w+)", + r"([a-z]+) ([a-z]+)", "a b", t!("$$1"), "$1" @@ -67,7 +67,7 @@ replace!( replace!( literal_dollar2, replace_all, - r"(?-u)(\w+) (\w+)", + r"([a-z]+) ([a-z]+)", "a b", t!("$2 $$c $1"), "b $c a" @@ -75,7 +75,7 @@ replace!( replace!( no_expand1, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", no_expand!("$2 $1"), "$2 $1" @@ -83,7 +83,7 @@ replace!( replace!( no_expand2, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", no_expand!("$$1"), "$$1" diff --git a/vendor/regex/tests/set.rs b/vendor/regex/tests/set.rs index 37fcf8700..d1144d662 100644 --- a/vendor/regex/tests/set.rs +++ b/vendor/regex/tests/set.rs @@ -65,3 +65,10 @@ fn len_and_empty() { assert_eq!(not_empty.len(), 2); assert!(!not_empty.is_empty()); } + +#[test] +fn default_set_is_empty() { + let set: regex::bytes::RegexSet = Default::default(); + assert_eq!(set.len(), 0); + assert!(set.is_empty()); +} diff --git a/vendor/regex/tests/unicode.rs b/vendor/regex/tests/unicode.rs index 9b3228624..d7dbdd31b 100644 --- a/vendor/regex/tests/unicode.rs +++ b/vendor/regex/tests/unicode.rs @@ -35,6 +35,8 @@ mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); // We should test more, but there's a lot. Write a script to generate more of // these tests. mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3))); +mat!(uni_class_gencat_cased_letter2, r"\p{gc=LC}", "A", Some((0, 3))); +mat!(uni_class_gencat_cased_letter3, r"\p{LC}", "A", Some((0, 3))); mat!( uni_class_gencat_close_punctuation, r"\p{Close_Punctuation}", @@ -77,6 +79,7 @@ mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4))); // See: https://github.com/rust-lang/regex/issues/719 mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4))); mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4))); +mat!(uni_class_gencat_format_abbrev3, r"\p{Sc}", "$", Some((0, 1))); mat!( uni_class_gencat_initial_punctuation, r"\p{Initial_Punctuation}", |