From 018c4950b9406055dec02ef0fb52f132e2bb1e2c Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 19 Jun 2024 11:25:56 +0200 Subject: Merging upstream version 1.76.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/regex-automata/.cargo-checksum.json | 2 +- vendor/regex-automata/Cargo.toml | 5 +- vendor/regex-automata/src/dfa/accel.rs | 13 +- vendor/regex-automata/src/dfa/automaton.rs | 190 ++++- vendor/regex-automata/src/dfa/dense.rs | 138 +-- vendor/regex-automata/src/dfa/mod.rs | 2 +- vendor/regex-automata/src/dfa/onepass.rs | 18 +- vendor/regex-automata/src/dfa/regex.rs | 2 +- vendor/regex-automata/src/dfa/search.rs | 10 - vendor/regex-automata/src/dfa/sparse.rs | 293 +++---- vendor/regex-automata/src/hybrid/dfa.rs | 205 +++-- vendor/regex-automata/src/hybrid/error.rs | 115 ++- vendor/regex-automata/src/hybrid/mod.rs | 2 +- vendor/regex-automata/src/hybrid/regex.rs | 2 +- vendor/regex-automata/src/hybrid/search.rs | 10 +- vendor/regex-automata/src/meta/limited.rs | 12 - vendor/regex-automata/src/meta/regex.rs | 6 +- vendor/regex-automata/src/meta/stopat.rs | 12 - vendor/regex-automata/src/meta/strategy.rs | 8 +- vendor/regex-automata/src/meta/wrappers.rs | 5 +- .../regex-automata/src/nfa/thompson/backtrack.rs | 28 +- vendor/regex-automata/src/nfa/thompson/builder.rs | 6 +- vendor/regex-automata/src/nfa/thompson/compiler.rs | 10 +- vendor/regex-automata/src/nfa/thompson/map.rs | 2 +- vendor/regex-automata/src/nfa/thompson/nfa.rs | 6 +- .../regex-automata/src/nfa/thompson/range_trie.rs | 2 +- vendor/regex-automata/src/util/captures.rs | 3 +- vendor/regex-automata/src/util/determinize/mod.rs | 124 ++- .../regex-automata/src/util/determinize/state.rs | 39 +- vendor/regex-automata/src/util/lazy.rs | 6 +- vendor/regex-automata/src/util/look.rs | 935 +++++++++++++++++++-- vendor/regex-automata/src/util/mod.rs | 2 +- vendor/regex-automata/src/util/pool.rs | 67 +- vendor/regex-automata/src/util/start.rs | 243 +++++- vendor/regex-automata/tests/dfa/suite.rs | 6 +- vendor/regex-automata/tests/hybrid/api.rs | 4 +- vendor/regex-automata/tests/lib.rs | 1 + 37 files changed, 1956 insertions(+), 578 deletions(-) (limited to 'vendor/regex-automata') diff --git a/vendor/regex-automata/.cargo-checksum.json b/vendor/regex-automata/.cargo-checksum.json index a6d7742a2..718faae68 100644 --- a/vendor/regex-automata/.cargo-checksum.json +++ b/vendor/regex-automata/.cargo-checksum.json @@ -1 +1 @@ 
-{"files":{"Cargo.toml":"374956c91c0582e4437674cdf8e67c0ea70ea2d4a7c2fb49d322727dbfca047f","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"61db25dbf26092fc80e8db89165692e55f9fb86b14e8451ebb28303f45932254","src/dfa/accel.rs":"800dada38f7a1d0fa443821dc04a8611c6cf06ef431e517f16867a27cbb4f27d","src/dfa/automaton.rs":"565ebf211769b4710091c4a15b5733296e9fbbc2a137d6eeb1c521b7b69463a0","src/dfa/dense.rs":"73c9c7662c0b4b7901eb17632187aac10bb24e16a89a4dfe78a7bf17bc98f9f1","src/dfa/determinize.rs":"91b9f69d28bdd064aa86716fe0772e4145050fd458bb7869a28660b4f7b64872","src/dfa/minimize.rs":"b5cadb462b9f24cd4aa7a665e75fb813cd06858a92b8986c9c5ae7fd9a60dfab","src/dfa/mod.rs":"e7210af01805f2f390374cd5b40ee502f9dc7633d6a57d988dcb17dfd93864cb","src/dfa/onepass.rs":"d1b29d531545ce30167d58eb24ac15ba10bce158e73483c09f219d5971c8e83c","src/dfa/regex.rs":"f970028c874e2a156db1591bbdc3915027ffa7f47d66d5bd6e97dace5a6a3d5b","src/dfa/remapper.rs":"ca096abc0f8e45c43a2adf3a7743b8857714ae7411a623edea41cc3ce906a169","src/dfa/search.rs":"237cdb8c6239ece5fe8279c4b6209c8094502cdecc9a4e3f977e469e60fd32ec","src/dfa/sparse.rs":"c3a05451a0019850b538dfd640fb12c92ac127b8a69c55f23489bd42c1c1f289","src/dfa/special.rs":"c2e60de5b98e68c9c45aaffbc67a08f049831a764a1ed29d1d1db0fb68efdce5","src/dfa/start.rs":"46b1dbaf8e4518ddddda6bbe596621aae36f8ba694390483a22355d9d799be8e","src/hybrid/dfa.rs":"861b3602bb9ac8b10abe0eae18a2641b2145fcfc7fb22b250ed2e3a345762f4c","src/hybrid/error.rs":"ffc6e65fd9e4694a67902f3516970e3e6cd6e33a7f59a5ab2ac16f740a049d9c","src/hybrid/id.rs":"6168aad5c81c627494ba0575a24d61fd0ae7efabaaceeadb8ff28472275e2813","src/hybrid/mod.rs":"49abcf332f19d2fe87c0a729b1b7715a87794e64f411f4d2bab9d8a4331d6ace","src/hybrid/regex.rs":"9f40aa2cfa89d7a97f9c9e32cb2ae591f4b6f3d51ddec41308d99ce924e130cf","src/hybrid/search.rs":"2aae7ab24c7e6b8d1a1aa81a2f6081f949e9fa42e960fd3fea29f57db8db9f68","src/lib.rs":"4e831d41057760c5f2f1274a206fa5a42f59dbca8f98ad3e782fe0fba0d6c37f","src/macros.rs":"3e4b39252bfa471fad384160a43f113ebfec7bec46a85d16f006622881dd2081","src/meta/error.rs":"710a6813314b1b11ace1b016a827067fff8b2624d47e15c7f52043bff5ab57da","src/meta/limited.rs":"cf629b08d64cb2e1c17d196a1ad6084f733a41e1c947715d9c0ea99ba7f7657d","src/meta/literal.rs":"52da98bb30995dedd22786e4728cb84e84c6093a284168bd91196b999dd0f6ec","src/meta/mod.rs":"f3b10b96fa08efaba3e4c9b81883cf40aac6e4c1f6ae55a497a534cf5805b46d","src/meta/regex.rs":"12ec35a66b889172439c4abebde5f9fb41e85765d6613f4bf622429e83d47b3c","src/meta/reverse_inner.rs":"945d6c2d4c7538e1609dbd430a096784d22abd33db58b1ba65c9c9af45a7d3c0","src/meta/stopat.rs":"b786cd0bd21f66c6f63df2d4bc2e544cd041d548d8001b4a818be1e0f84b6747","src/meta/strategy.rs":"4ee8d21def7323105e5b1101bdb1e152c5befa870a11f2bf0fa85ffbac5a6609","src/meta/wrappers.rs":"6998ff14226905eded36697f885a8ca7508b50ffb05c4b78348ff0e9463857d5","src/nfa/mod.rs":"1a731e217ed4053714500e84e58cc127f402e4e075f7d0e5b9aea715cd52405a","src/nfa/thompson/backtrack.rs":"e9a986d71aa9b0145d9f871c92f466e1b992592d8ac87f7fde36ede2e8016324","src/nfa/thompson/builder.rs":"77bdd42a7fbdedb8d6756f0161d278e677ab1fbe622ca77115c8b506a2a6db21","src/nfa/thompson/compiler.rs":"9cc351398c2d9ce10ac11a1c285f675bc351ecb816d3f33321513dd6bfcdc335","src/nfa/thompson/error.rs":"78488c2fdb85f819f53cc30bb11c7f96169112da5dd14c351e5cc3bcccf0e10e","src/nfa/thompson/literal_trie.rs":"c2d1d09b44da4648db797386c2410cbf63337afef8cb62e6e78cf34786892a11","src/nfa/thompso
n/map.rs":"96cdf3195f7efb374bcb1791ef5cc12a1cde189ab90402bf01d9b46fb7796b60","src/nfa/thompson/mod.rs":"0651520debd6f023ae1a2c422806aab37f8491e5bb092e20dfdc4fe4179d695c","src/nfa/thompson/nfa.rs":"9782d44b05986370b7f948067977fb20120562e2eca0e4366e35d7d18e81a679","src/nfa/thompson/pikevm.rs":"aaf792832d1bf15fad8a8f0b2e6597170361eb3cbcb9343eb5bd242ff346d750","src/nfa/thompson/range_trie.rs":"c9614074628bb56c9d0a137c1db7e13259a6500e4a46cdc7ddc84bee8f7e928f","src/util/alphabet.rs":"94cd73ce2f4e34e0ae0a146d3efdc85478263afdfefd6dc105e0abf0ec79d82b","src/util/captures.rs":"7aee3aae2836a397c1ad6e4535e0e0d177faf2d99e61476e8fb2710f69763668","src/util/determinize/mod.rs":"32fea73cf4a7a04238c3d3b09ea7afc7fd7c85e87dc115c6152f464ab88bddb2","src/util/determinize/state.rs":"2a0082d5cd2bd47ab75c3f04488655a3c47f1f75075b5d6f9b6e4eeb8980823e","src/util/empty.rs":"13ec7d6cbd1520db5b4c1dae294f4419fa88d39d2bfc16f4ef258473d609f91c","src/util/escape.rs":"5b2731b41a55cb50ab688132bb5640dbd51f14f141adaa864b9db7f0aa092c74","src/util/int.rs":"b7eec0a6cab0798ba66707988fce3ecfc841b93418028a7b1408c5d0f6271351","src/util/interpolate.rs":"5e4e6b6fb6e5a7603e393bf05c609735d86a7d1f54c2436e42111b4e1409b6dd","src/util/iter.rs":"58ae97b4156d7160a46b909f4635d88d10354d9d892d2fcb4c5e18e24cf38f14","src/util/lazy.rs":"e489a96fce952e9d196fd3f5564cf8ea3374eb4aef630ff8f12d82f194ed4336","src/util/look.rs":"e7a5a51f8ed70c2f97edaf3dfbe8859de37b570341447634c6028cb89ff412d7","src/util/memchr.rs":"573109ce4983907083ae0b29a084a324b9b53da369b4d96f7f3a21fd5c8eb5c9","src/util/mod.rs":"16c5fd72263d3a4df994111b81aca36da17f591f4853f21a6a906ac725843f97","src/util/pool.rs":"acc5a4922b276bc9801f5fb58539824460cb69b34a575cecbd7eb56b1d3b4de0","src/util/prefilter/aho_corasick.rs":"c54fa95f4d9e7ab53e2c6463a43f8953df6a440997fc9cd528f225db0dd32582","src/util/prefilter/byteset.rs":"1c80fa432acc23223a75a5181e37c40034764dffe42410e4b77af6f24f48bd5c","src/util/prefilter/memchr.rs":"36c6fe6354b2e729db6830166dd4862e439bc48c9e59258d88e4b6c5654e20ef","src/util/prefilter/memmem.rs":"6f6ed9450b14abf3e4a33d395337e51fbaa9743a0a16aac0009f7680aa60c500","src/util/prefilter/mod.rs":"2818e2e92632aee1c46b0dc01b654e544bfbf460236be86d28a2d836e9fc189a","src/util/prefilter/teddy.rs":"ed54d26858b56e1c8c87e44afae5f63d81ab930787d79e671f3a3513f576e9cd","src/util/primitives.rs":"8a9cc19ef2e1ab183943cdc2d2f095b02252476e32b7e9fff4a06a251749b068","src/util/search.rs":"66bf320ebbe403c119a966f3dfbd53178de0ceebd2ca1922f1ddbb79aed36837","src/util/sparse_set.rs":"3d4aa30b6aa9fc875d36506487a5095dbe8ed528b89e4146a65c7e7497520a4d","src/util/start.rs":"8d2fe005698c0bd3680a0dbfc4a34eebfe2f51081ec1584968383ac4c86fd5fe","src/util/syntax.rs":"720ac0d6600fad33f5967b5afe4e3de2096b857e4cda6fa16ba93b10a8230cab","src/util/unicode_data/mod.rs":"54c3e10bbc393e9881bfac3295815b160f59e69e2056bc29ee7cf0addd8e3cf7","src/util/unicode_data/perl_word.rs":"2e1a5d889598bd4e73af17d3a9f7d6b4cf2f6ab24920a5336e496bb255281e56","src/util/utf8.rs":"7a068009fdf07e693e521b1f0264725c0e6118dbe1eab55da9d0eab21785fcc1","src/util/wire.rs":"bfdf52615c516b6c07db3ce9c333ea61fdc535bd0b79560bbd7f6864ab83946e","test":"39d79ce3532c31a51c0be89a2939816fad0e4868d2b03992c202cbe64dce9f6c","tests/dfa/api.rs":"cc28e366b6bcbfcf379265acd492a92c62743c3f20e7a2b273019679aa9e1291","tests/dfa/mod.rs":"924d8fff500b9b7b140082623023e78007058a87323151cd8e361462945e4f16","tests/dfa/onepass/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/dfa/onepass/suite.rs":"6d63ec5469e6876656ae607cdbe07e6a4e17ace7836b67435763c9b1d233438a","t
ests/dfa/regression.rs":"ebcf2645290286aa7531eb2b7951385e5ed8167532437aeca2ad2049768fd796","tests/dfa/suite.rs":"cf08499bc8838d2ff16ea9b20b07ad03c9b89d6efe093f081e2982a21ea6d666","tests/fuzz/dense.rs":"3e1099a0cce61e85abc0ad81bc592e85f497f159ef0e5d1d32bac1936aa6f20c","tests/fuzz/mod.rs":"043773510e02f51def43ee0c2b8b867c53ecc8638c8a9233b2ac098de9c3ac1e","tests/fuzz/sparse.rs":"ba61db4927ab28953037a4b20317399c86d01b4d774e46c020ade19029215e25","tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9":"8961279a8237c3e318452024dd971b1d5a26b058260c297382a74daca1b7f0d1","tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9":"c2d52e3dea78d3f159b5b521d433358a7fee45ce20ed1545067d461f45ef66b8","tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000":"5b2d273023de3fb04037eaf2e6b4f51cced4c5a08d2e6b44e4be540774f939b9","tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9":"e2e22e2f46a9a75b5c876476442276cf675fe244c5cf918789e4f6b14078fbd9","tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98":"24a12712e1f2ba0a40b5782707908a74dd19941dc372ef525d65a7134f91988c","tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838":"a97f39b2febf9c73535681f7a86201e4b06d5a1ffcf135299c96c1cabfa9f6c4","tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570":"44fe3ef878d35e2d51c2c17ff89bbbe3a4650e09d0cbbd48625c0f5e4dd0848b","tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b":"d5534be36653b4af6cb94a7c63be58869bb8c204c5c63d67a4d6c986b44bb2e1","tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9":"77b844898610560afa09f2b8de73a85a0ba9a3b8cee4ff1bbf26b8c97ad4e8a2","tests/gen/README.md":"c3bfdf2f9ced501dd5bd75d01509a34e503efb2dff2f5f7b260580dde5519ed4","tests/gen/dense/mod.rs":"5ae1cfb46212a674118ada2f66f37b25188e84643d406b95eb4665d722344262","tests/gen/dense/multi_pattern_v2.rs":"29b1e9a799adecbdbe7cd05e9748f664c2b915b10b1d2f5d36cfb6453826d1d2","tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa":"8421d5a1bfc0b6c3bdc8fc90dff591a046b0aaf8e06ef7de7cc293004a35d061","tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa":"dcf2fd5fd49f5f53cf1ec66f61623402f39401cb3aea30d6677b98bb1e9541bf","tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa":"73c4f20d984e544dfa4cf05f3009d0a9b52fa84bc97b501ea0ccd179e2def4bc","tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa":"74471209f05754e8e20c8a0222a5877b1b15b8b8f33cd8cac89ea65f708b4aff","tests/gen/mod.rs":"043773510e02f51def43ee0c2b8b867c53ecc8638c8a9233b2ac098de9c3ac1e","tests/gen/sparse/mod.rs":"5ae1cfb46212a674118ada2f66f37b25188e84643d406b95eb4665d722344262","tests/gen/sparse/multi_pattern_v2.rs":"e00fb2a510a215460aab84573196b1f51bb65884ff494c2382534c04f6fdbfe9","tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa":"3287956bd2003cd69653b125f82aade95d99adbb20229bfdbb4958b8877c0a0b","tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa":"bdf285901eaaac4596380115c5bbb20ab2f42f593d8d9e9238a00ed69863f9c9","tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa":"e466dc085dd68b2d2220932a0e4d28759edd161c1fdad652240aa3825fd85268","tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa":"80358d0c26c1cc7284065b0075f5b8804d83e673a8a8c8327f93a1c1ff455399","tests/hybrid/api.rs":"4b8592c412e6ad0ce4a27ed1c1496acc92366ccb1c7ec23c6fd0596fc6ebbdfb","tests/hybrid/mod.rs":"4856a49a4d9b5e9e079c2719a5e75c32408b37e9b76cbdea0
57b388a3537af6d","tests/hybrid/suite.rs":"688972275c5ef38cdc5112a1e6e54ccd2bf8290008ae2b17344c6c81e17e3a5a","tests/lib.rs":"5e8a014d53097dba1f865e5e35c35a69cd12f54fad74b5c49a387f8768c30847","tests/meta/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/meta/suite.rs":"4c441f9df82508a5e60dd08f266183f772fc9b2b236fbf69cab87650ecf3b424","tests/nfa/mod.rs":"49055c358e38d97e42acb1602c671f97dddf24cafe089490f0e79ed208d74d9b","tests/nfa/thompson/backtrack/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/nfa/thompson/backtrack/suite.rs":"4e7baff70fc98b98b8297c6fd6d5818beb20343379e16cdb95bee46207ac4bd6","tests/nfa/thompson/mod.rs":"de9f5bcea1a8d1f03c85c55ad8c0747877d69e344fcd6c6886b0a402f0661291","tests/nfa/thompson/pikevm/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/nfa/thompson/pikevm/suite.rs":"263837ebf5b2e1906a06237982ea875386d83567e399b4ec1f669f10b1422599"},"package":"c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"} \ No newline at end of file +{"files":{"Cargo.toml":"68de160610af9bb545a752c5da0d82b581e98e0f2631e2253c8a992fc51e322b","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"61db25dbf26092fc80e8db89165692e55f9fb86b14e8451ebb28303f45932254","src/dfa/accel.rs":"2a045b0f6715e913d18d2212a7804fabaadfc3bcffad9382e35574d32eb0c492","src/dfa/automaton.rs":"c14707007bbb915fd5607424b0a4c8e53fa7daf6c7c1f4e3045d51ef15f9b202","src/dfa/dense.rs":"1de3a93f33525b419e3f47ab98d364234ccd68fc30273fd362fe8161a9a37ea3","src/dfa/determinize.rs":"91b9f69d28bdd064aa86716fe0772e4145050fd458bb7869a28660b4f7b64872","src/dfa/minimize.rs":"b5cadb462b9f24cd4aa7a665e75fb813cd06858a92b8986c9c5ae7fd9a60dfab","src/dfa/mod.rs":"ab1ac378d81bb5ea40a23cf903928adae4758e30f54646afde71869234965723","src/dfa/onepass.rs":"013f09b795955aefd07936994f08df4bc5b39698797f586b85171f778162aeab","src/dfa/regex.rs":"d16f0434a0b0f1341d6d5e0a162e6afa29411a786fb37b0e98bbcc0c6ba3cfec","src/dfa/remapper.rs":"ca096abc0f8e45c43a2adf3a7743b8857714ae7411a623edea41cc3ce906a169","src/dfa/search.rs":"79b9ab2b0636177bc26d1ad6f0059ca033decf74824cb5a36f1ac19f020d2713","src/dfa/sparse.rs":"c863d92a4d919fa880dfca3d59a8b5b672c6ffa8423578b34fc0af2ae62e1d7a","src/dfa/special.rs":"c2e60de5b98e68c9c45aaffbc67a08f049831a764a1ed29d1d1db0fb68efdce5","src/dfa/start.rs":"46b1dbaf8e4518ddddda6bbe596621aae36f8ba694390483a22355d9d799be8e","src/hybrid/dfa.rs":"a6ed6d3268e4008f88c1469029a84391edfee7851df2912640763e4ba2188635","src/hybrid/error.rs":"37db2a9759721de4ca2c49e21ab74dd3d998b67c5ab0e65a62085b57ec1d7ba3","src/hybrid/id.rs":"6168aad5c81c627494ba0575a24d61fd0ae7efabaaceeadb8ff28472275e2813","src/hybrid/mod.rs":"ca21e89062bdb5a0998d5cd1bc78609af1f6b795533e5982be969c383ac0463a","src/hybrid/regex.rs":"47815d025526330291f4cd749b4dd79b1122ef208fe6f0a49715c70fc1ea47c8","src/hybrid/search.rs":"76067f3f8675013dcdf7e9c9cc4d9d33d1107fb2cbcd7adcc05cfd42177d90cc","src/lib.rs":"4e831d41057760c5f2f1274a206fa5a42f59dbca8f98ad3e782fe0fba0d6c37f","src/macros.rs":"3e4b39252bfa471fad384160a43f113ebfec7bec46a85d16f006622881dd2081","src/meta/error.rs":"710a6813314b1b11ace1b016a827067fff8b2624d47e15c7f52043bff5ab57da","src/meta/limited.rs":"98b6b2d19f67d4ce3ddb110e06045f22a040590262fde33614ab900bdd06b25b","src/meta/literal.rs":"52da98bb30995dedd22786e4728cb84e84c6093a284168bd91196b999dd0f6ec","src/meta/mod.rs":"f3b10b96fa08efaba3e4c9b81883cf40aac
6e4c1f6ae55a497a534cf5805b46d","src/meta/regex.rs":"b0fab107d3f972db89568e14fec0199ba4cd8076cc5fd61c2582db42885f196e","src/meta/reverse_inner.rs":"945d6c2d4c7538e1609dbd430a096784d22abd33db58b1ba65c9c9af45a7d3c0","src/meta/stopat.rs":"acb6122e17d10a9b1b5e72d6030e6d95748227975bad0ff5cbbcc2587edfa6df","src/meta/strategy.rs":"c882c5c261de5fe58bc65251d2d407e4cb483b9b80c2bec5eba958ef90e0072d","src/meta/wrappers.rs":"3cb0717f87b7082cc75cb02148b8cde30cffbee689bdb6275abcf1416747ceb4","src/nfa/mod.rs":"1a731e217ed4053714500e84e58cc127f402e4e075f7d0e5b9aea715cd52405a","src/nfa/thompson/backtrack.rs":"041015ea153c1e485e9cf39ec60d1e51c7ab9e400ecd77cad2078af45775339b","src/nfa/thompson/builder.rs":"7adf6aba69171f6acd47fea0fec85ba589154fead83f2042a1c6fe9486aa4dbd","src/nfa/thompson/compiler.rs":"a8bb24f7f125a294cb75af9d8332821142738278d8eff354647ae08f66a597af","src/nfa/thompson/error.rs":"78488c2fdb85f819f53cc30bb11c7f96169112da5dd14c351e5cc3bcccf0e10e","src/nfa/thompson/literal_trie.rs":"c2d1d09b44da4648db797386c2410cbf63337afef8cb62e6e78cf34786892a11","src/nfa/thompson/map.rs":"4dcfc0a43bbd91ff69cc8a47486b5f7cac048f7d5cf269b63933d7f2e177a533","src/nfa/thompson/mod.rs":"0651520debd6f023ae1a2c422806aab37f8491e5bb092e20dfdc4fe4179d695c","src/nfa/thompson/nfa.rs":"410c3745c159eb17bea18256ec03ee92e1fccca630f01a24618a75fffcf86866","src/nfa/thompson/pikevm.rs":"aaf792832d1bf15fad8a8f0b2e6597170361eb3cbcb9343eb5bd242ff346d750","src/nfa/thompson/range_trie.rs":"f081e74e8c08e2268f1ce410608fefa4b14d7d1c0487dbc27d27d51ea6265e30","src/util/alphabet.rs":"94cd73ce2f4e34e0ae0a146d3efdc85478263afdfefd6dc105e0abf0ec79d82b","src/util/captures.rs":"d2a118ba509b70e9922a10ea9f78771b14a521abb0ed4029be3ef6aeea44d032","src/util/determinize/mod.rs":"5e9e1f7dd060d69521b743afc2b900b21ad7942e17397084ac6563ea5dcf2fd9","src/util/determinize/state.rs":"c30eac89137df0f0128143eeb2e0c8d7ea4bd659825fa6721b5315141a326e3a","src/util/empty.rs":"13ec7d6cbd1520db5b4c1dae294f4419fa88d39d2bfc16f4ef258473d609f91c","src/util/escape.rs":"5b2731b41a55cb50ab688132bb5640dbd51f14f141adaa864b9db7f0aa092c74","src/util/int.rs":"b7eec0a6cab0798ba66707988fce3ecfc841b93418028a7b1408c5d0f6271351","src/util/interpolate.rs":"5e4e6b6fb6e5a7603e393bf05c609735d86a7d1f54c2436e42111b4e1409b6dd","src/util/iter.rs":"58ae97b4156d7160a46b909f4635d88d10354d9d892d2fcb4c5e18e24cf38f14","src/util/lazy.rs":"e16b3ed139210ca546fc302c463ce52a5dcfa77382f07c9097400ed8cddf78c8","src/util/look.rs":"fbfcaace79d0c6ad3698c9d6c025cb952f2e00cf88a48cf690d087fa73466689","src/util/memchr.rs":"573109ce4983907083ae0b29a084a324b9b53da369b4d96f7f3a21fd5c8eb5c9","src/util/mod.rs":"6c828a493f0f88c8b515aee4f8faf91ba653eb07e8fc3c23c0524553410803f9","src/util/pool.rs":"da1fad31f2fdf15cf3a6a605ece8d6162d8f6c42770c160af4c0fbf4ef148aa5","src/util/prefilter/aho_corasick.rs":"c54fa95f4d9e7ab53e2c6463a43f8953df6a440997fc9cd528f225db0dd32582","src/util/prefilter/byteset.rs":"1c80fa432acc23223a75a5181e37c40034764dffe42410e4b77af6f24f48bd5c","src/util/prefilter/memchr.rs":"36c6fe6354b2e729db6830166dd4862e439bc48c9e59258d88e4b6c5654e20ef","src/util/prefilter/memmem.rs":"6f6ed9450b14abf3e4a33d395337e51fbaa9743a0a16aac0009f7680aa60c500","src/util/prefilter/mod.rs":"2818e2e92632aee1c46b0dc01b654e544bfbf460236be86d28a2d836e9fc189a","src/util/prefilter/teddy.rs":"ed54d26858b56e1c8c87e44afae5f63d81ab930787d79e671f3a3513f576e9cd","src/util/primitives.rs":"8a9cc19ef2e1ab183943cdc2d2f095b02252476e32b7e9fff4a06a251749b068","src/util/search.rs":"66bf320ebbe403c119a966f3dfbd53178de0ceebd2ca1922f1ddbb79aed36837","src/ut
il/sparse_set.rs":"3d4aa30b6aa9fc875d36506487a5095dbe8ed528b89e4146a65c7e7497520a4d","src/util/start.rs":"73ebcf2550cea56f67b9048fa3dc91f3a8db9897fbd2400dd9941efb0cb4827e","src/util/syntax.rs":"720ac0d6600fad33f5967b5afe4e3de2096b857e4cda6fa16ba93b10a8230cab","src/util/unicode_data/mod.rs":"54c3e10bbc393e9881bfac3295815b160f59e69e2056bc29ee7cf0addd8e3cf7","src/util/unicode_data/perl_word.rs":"2e1a5d889598bd4e73af17d3a9f7d6b4cf2f6ab24920a5336e496bb255281e56","src/util/utf8.rs":"7a068009fdf07e693e521b1f0264725c0e6118dbe1eab55da9d0eab21785fcc1","src/util/wire.rs":"bfdf52615c516b6c07db3ce9c333ea61fdc535bd0b79560bbd7f6864ab83946e","test":"39d79ce3532c31a51c0be89a2939816fad0e4868d2b03992c202cbe64dce9f6c","tests/dfa/api.rs":"cc28e366b6bcbfcf379265acd492a92c62743c3f20e7a2b273019679aa9e1291","tests/dfa/mod.rs":"924d8fff500b9b7b140082623023e78007058a87323151cd8e361462945e4f16","tests/dfa/onepass/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/dfa/onepass/suite.rs":"6d63ec5469e6876656ae607cdbe07e6a4e17ace7836b67435763c9b1d233438a","tests/dfa/regression.rs":"ebcf2645290286aa7531eb2b7951385e5ed8167532437aeca2ad2049768fd796","tests/dfa/suite.rs":"2812aa0167ee5e93eff3f7d45096a78c5f3a2440197a513b3cf0310286640f51","tests/fuzz/dense.rs":"3e1099a0cce61e85abc0ad81bc592e85f497f159ef0e5d1d32bac1936aa6f20c","tests/fuzz/mod.rs":"043773510e02f51def43ee0c2b8b867c53ecc8638c8a9233b2ac098de9c3ac1e","tests/fuzz/sparse.rs":"ba61db4927ab28953037a4b20317399c86d01b4d774e46c020ade19029215e25","tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9":"8961279a8237c3e318452024dd971b1d5a26b058260c297382a74daca1b7f0d1","tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9":"c2d52e3dea78d3f159b5b521d433358a7fee45ce20ed1545067d461f45ef66b8","tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000":"5b2d273023de3fb04037eaf2e6b4f51cced4c5a08d2e6b44e4be540774f939b9","tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9":"e2e22e2f46a9a75b5c876476442276cf675fe244c5cf918789e4f6b14078fbd9","tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98":"24a12712e1f2ba0a40b5782707908a74dd19941dc372ef525d65a7134f91988c","tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838":"a97f39b2febf9c73535681f7a86201e4b06d5a1ffcf135299c96c1cabfa9f6c4","tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570":"44fe3ef878d35e2d51c2c17ff89bbbe3a4650e09d0cbbd48625c0f5e4dd0848b","tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b":"d5534be36653b4af6cb94a7c63be58869bb8c204c5c63d67a4d6c986b44bb2e1","tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9":"77b844898610560afa09f2b8de73a85a0ba9a3b8cee4ff1bbf26b8c97ad4e8a2","tests/gen/README.md":"c3bfdf2f9ced501dd5bd75d01509a34e503efb2dff2f5f7b260580dde5519ed4","tests/gen/dense/mod.rs":"5ae1cfb46212a674118ada2f66f37b25188e84643d406b95eb4665d722344262","tests/gen/dense/multi_pattern_v2.rs":"29b1e9a799adecbdbe7cd05e9748f664c2b915b10b1d2f5d36cfb6453826d1d2","tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa":"8421d5a1bfc0b6c3bdc8fc90dff591a046b0aaf8e06ef7de7cc293004a35d061","tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa":"dcf2fd5fd49f5f53cf1ec66f61623402f39401cb3aea30d6677b98bb1e9541bf","tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa":"73c4f20d984e544dfa4cf05f3009d0a9b52fa84bc97b501ea0ccd
179e2def4bc","tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa":"74471209f05754e8e20c8a0222a5877b1b15b8b8f33cd8cac89ea65f708b4aff","tests/gen/mod.rs":"043773510e02f51def43ee0c2b8b867c53ecc8638c8a9233b2ac098de9c3ac1e","tests/gen/sparse/mod.rs":"5ae1cfb46212a674118ada2f66f37b25188e84643d406b95eb4665d722344262","tests/gen/sparse/multi_pattern_v2.rs":"e00fb2a510a215460aab84573196b1f51bb65884ff494c2382534c04f6fdbfe9","tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa":"3287956bd2003cd69653b125f82aade95d99adbb20229bfdbb4958b8877c0a0b","tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa":"bdf285901eaaac4596380115c5bbb20ab2f42f593d8d9e9238a00ed69863f9c9","tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa":"e466dc085dd68b2d2220932a0e4d28759edd161c1fdad652240aa3825fd85268","tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa":"80358d0c26c1cc7284065b0075f5b8804d83e673a8a8c8327f93a1c1ff455399","tests/hybrid/api.rs":"bd4862275c52f94c6f6737bf174c97e3de30f8075ca23f43c129c72a0d0afed7","tests/hybrid/mod.rs":"4856a49a4d9b5e9e079c2719a5e75c32408b37e9b76cbdea057b388a3537af6d","tests/hybrid/suite.rs":"688972275c5ef38cdc5112a1e6e54ccd2bf8290008ae2b17344c6c81e17e3a5a","tests/lib.rs":"9775b3c62fb338ea5c1bd3513a6589eff4b5c8d35c599439d9363dbf98c6f8d4","tests/meta/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/meta/suite.rs":"4c441f9df82508a5e60dd08f266183f772fc9b2b236fbf69cab87650ecf3b424","tests/nfa/mod.rs":"49055c358e38d97e42acb1602c671f97dddf24cafe089490f0e79ed208d74d9b","tests/nfa/thompson/backtrack/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/nfa/thompson/backtrack/suite.rs":"4e7baff70fc98b98b8297c6fd6d5818beb20343379e16cdb95bee46207ac4bd6","tests/nfa/thompson/mod.rs":"de9f5bcea1a8d1f03c85c55ad8c0747877d69e344fcd6c6886b0a402f0661291","tests/nfa/thompson/pikevm/mod.rs":"d08f4ecb8ec243be584944c9602af1ed3a48a8732dd11cd573b0d1d182171303","tests/nfa/thompson/pikevm/suite.rs":"263837ebf5b2e1906a06237982ea875386d83567e399b4ec1f669f10b1422599"},"package":"5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"} \ No newline at end of file diff --git a/vendor/regex-automata/Cargo.toml b/vendor/regex-automata/Cargo.toml index fe4949f0d..8b7cf0aa6 100644 --- a/vendor/regex-automata/Cargo.toml +++ b/vendor/regex-automata/Cargo.toml @@ -11,8 +11,9 @@ [package] edition = "2021" +rust-version = "1.65" name = "regex-automata" -version = "0.3.8" +version = "0.4.3" authors = [ "The Rust Project Developers", "Andrew Gallant ", @@ -54,7 +55,7 @@ optional = true default-features = false [dependencies.regex-syntax] -version = "0.7.4" +version = "0.8.2" optional = true default-features = false diff --git a/vendor/regex-automata/src/dfa/accel.rs b/vendor/regex-automata/src/dfa/accel.rs index 5ea2423dd..c0ba18ea8 100644 --- a/vendor/regex-automata/src/dfa/accel.rs +++ b/vendor/regex-automata/src/dfa/accel.rs @@ -6,15 +6,16 @@ // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. 
We can look at its
// DFA with regex-cli:
//
-// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
-// dense::DFA(
+// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
 // D 000000:
 // Q 000001:
 // *000002:
-// A 000003: \x00-` => 3, a => 5, b-\xFF => 3
-// >000004: \x00-` => 3, a => 4, b-\xFF => 3
-// 000005: \x00-\xFF => 2, EOI => 2
-// )
+// A 000003: \x00-` => 3, a => 8, b-\xFF => 3
+// A 000004: \x00-` => 4, a => 7, b-\xFF => 4
+// 000005: \x00-` => 4, b-\xFF => 4
+// 000006: \x00-` => 3, a => 6, b-\xFF => 3
+// 000007: \x00-\xFF => 2, EOI => 2
+// 000008: \x00-\xFF => 2, EOI => 2
 //
 // In particular, state 3 is accelerated (shown via the 'A' indicator) since
 // the only way to leave that state once entered is to see an 'a' byte. If
diff --git a/vendor/regex-automata/src/dfa/automaton.rs b/vendor/regex-automata/src/dfa/automaton.rs
index 7e2be9a15..fcfcf2997 100644
--- a/vendor/regex-automata/src/dfa/automaton.rs
+++ b/vendor/regex-automata/src/dfa/automaton.rs
@@ -7,6 +7,7 @@ use crate::{
 prefilter::Prefilter,
 primitives::{PatternID, StateID},
 search::{Anchored, HalfMatch, Input, MatchError},
+ start,
 },
 };
@@ -226,8 +227,8 @@ pub unsafe trait Automaton {
 /// ```
 fn next_eoi_state(&self, current: StateID) -> StateID;

- /// Return the ID of the start state for this lazy DFA when executing a
- /// forward search.
+ /// Return the ID of the start state for this DFA for the given starting
+ /// configuration.
 ///
 /// Unlike typical DFA implementations, the start state for DFAs in this
 /// crate is dependent on a few different factors:
@@ -235,12 +236,41 @@ pub unsafe trait Automaton {
 /// * The [`Anchored`] mode of the search. Unanchored, anchored and
 /// anchored searches for a specific [`PatternID`] all use different start
 /// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for forward searches.
+ /// * Whether a "look-behind" byte exists. For example, the `^` anchor
+ /// matches if and only if there is no look-behind byte.
+ /// * The specific value of that look-behind byte. For example, a `(?m:^)`
+ /// assertion only matches when there is either no look-behind byte, or
+ /// when the look-behind byte is a line terminator.
+ ///
+ /// The [starting configuration](start::Config) provides the above
+ /// information.
+ ///
+ /// This routine can be used for either forward or reverse searches.
+ /// Although, as a convenience, if you have an [`Input`], then it may
+ /// be more succinct to use [`Automaton::start_state_forward`] or
+ /// [`Automaton::start_state_reverse`]. Note, for example, that the
+ /// convenience routines return a [`MatchError`] on failure whereas this
+ /// routine returns a [`StartError`].
+ ///
+ /// # Errors
+ ///
+ /// This may return a [`StartError`] if the search needs to give up when
+ /// determining the start state (for example, if it sees a "quit" byte).
+ /// This can also return an error if the given configuration contains an
+ /// unsupported [`Anchored`] configuration.
+ fn start_state(
+ &self,
+ config: &start::Config,
+ ) -> Result<StateID, StartError>;
+
+ /// Return the ID of the start state for this DFA when executing a forward
+ /// search.
+ ///
+ /// This is a convenience routine for calling [`Automaton::start_state`]
+ /// that converts the given [`Input`] to a [start
+ /// configuration](start::Config). Additionally, if an error occurs, it is
+ /// converted from a [`StartError`] to a [`MatchError`] using the offset
+ /// information in the given [`Input`].
 ///
 /// # Errors
 ///
@@ -251,23 +281,30 @@ fn start_state_forward(
 &self,
 input: &Input<'_>,
- ) -> Result<StateID, MatchError>;
+ ) -> Result<StateID, MatchError> {
+ let config = start::Config::from_input_forward(input);
+ self.start_state(&config).map_err(|err| match err {
+ StartError::Quit { byte } => {
+ let offset = input
+ .start()
+ .checked_sub(1)
+ .expect("no quit in start without look-behind");
+ MatchError::quit(byte, offset)
+ }
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
+ }

- /// Return the ID of the start state for this lazy DFA when executing a
- /// reverse search.
+ /// Return the ID of the start state for this DFA when executing a reverse
+ /// search.
 ///
- /// Unlike typical DFA implementations, the start state for DFAs in this
- /// crate is dependent on a few different factors:
- ///
- /// * The [`Anchored`] mode of the search. Unanchored, anchored and
- /// anchored searches for a specific [`PatternID`] all use different start
- /// states.
- /// * The position at which the search begins, via [`Input::start`]. This
- /// and the byte immediately preceding the start of the search (if one
- /// exists) influence which look-behind assertions are true at the start
- /// of the search. This in turn influences which start state is selected.
- /// * Whether the search is a forward or reverse search. This routine can
- /// only be used for reverse searches.
+ /// This is a convenience routine for calling [`Automaton::start_state`]
+ /// that converts the given [`Input`] to a [start
+ /// configuration](start::Config). Additionally, if an error occurs, it is
+ /// converted from a [`StartError`] to a [`MatchError`] using the offset
+ /// information in the given [`Input`].
 ///
 /// # Errors
 ///
@@ -278,7 +315,18 @@ fn start_state_reverse(
 &self,
 input: &Input<'_>,
- ) -> Result<StateID, MatchError>;
+ ) -> Result<StateID, MatchError> {
+ let config = start::Config::from_input_reverse(input);
+ self.start_state(&config).map_err(|err| match err {
+ StartError::Quit { byte } => {
+ let offset = input.end();
+ MatchError::quit(byte, offset)
+ }
+ StartError::UnsupportedAnchored { mode } => {
+ MatchError::unsupported_anchored(mode)
+ }
+ })
+ }

 /// If this DFA has a universal starting state for the given anchor mode
 /// and the DFA supports universal starting states, then this returns that
@@ -1084,7 +1132,7 @@ pub unsafe trait Automaton {
 /// // implementation defined.
 /// //
 /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
- /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`.
+ /// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`.
 /// let id = StateID::new(3 * dfa.stride()).unwrap();
 /// let accelerator = dfa.accelerator(id);
 /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
@@ -1798,6 +1846,14 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { (**self).next_eoi_state(current) } + #[inline] + fn start_state( + &self, + config: &start::Config, + ) -> Result { + (**self).start_state(config) + } + #[inline] fn start_state_forward( &self, @@ -2015,6 +2071,90 @@ impl OverlappingState { } } +/// An error that can occur when computing the start state for a search. +/// +/// Computing a start state can fail for a few reasons, either based on +/// incorrect configuration or even based on whether the look-behind byte +/// triggers a quit state. Typically one does not need to handle this error +/// if you're using [`Automaton::start_state_forward`] (or its reverse +/// counterpart), as that routine automatically converts `StartError` to a +/// [`MatchError`] for you. +/// +/// This error may be returned by the [`Automaton::start_state`] routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError {} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// Runs the given overlapping `search` function (forwards or backwards) until /// a match is found whose offset does not split a codepoint. 
/// diff --git a/vendor/regex-automata/src/dfa/dense.rs b/vendor/regex-automata/src/dfa/dense.rs index 6da865f97..fd96bc878 100644 --- a/vendor/regex-automata/src/dfa/dense.rs +++ b/vendor/regex-automata/src/dfa/dense.rs @@ -30,7 +30,7 @@ use crate::{ use crate::{ dfa::{ accel::Accels, - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, special::Special, start::StartKind, DEAD, @@ -40,8 +40,8 @@ use crate::{ int::{Pointer, Usize}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -66,8 +66,9 @@ const VERSION: u32 = 2; /// /// The default configuration guarantees that a search will never return /// a "quit" error, although it is possible for a search to fail if -/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by -/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. +/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is +/// not by default) and an [`Anchored::Pattern`] mode is requested via +/// [`Input`](crate::Input). #[cfg(feature = "dfa-build")] #[derive(Clone, Debug, Default)] pub struct Config { @@ -113,8 +114,7 @@ impl Config { /// make searching slower than it otherwise would be if the transitions /// that leave accelerated states are traversed frequently. /// - /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for - /// an example. + /// See [`Automaton::accelerator`] for an example. /// /// This is enabled by default. pub fn accelerate(mut self, yes: bool) -> Config { @@ -882,20 +882,20 @@ impl Config { /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// - /// // 600KB isn't enough! + /// // 700KB isn't enough! /// dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(600_000)) + /// .determinize_size_limit(Some(700_000)) /// ) /// .build(r"\w{20}") /// .unwrap_err(); /// - /// // ... but 700KB probably is! + /// // ... but 800KB probably is! /// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(700_000)) + /// .determinize_size_limit(Some(800_000)) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); @@ -1228,13 +1228,14 @@ impl Builder { } else { let mut set = nfa.byte_class_set().clone(); // It is important to distinguish any "quit" bytes from all other - // bytes. Otherwise, a non-quit byte may end up in the same class - // as a quit byte, and thus cause the DFA stop when it shouldn't. + // bytes. Otherwise, a non-quit byte may end up in the same + // class as a quit byte, and thus cause the DFA to stop when it + // shouldn't. // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match dense --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quitset.is_empty() { set.add_set(&quitset); } @@ -2345,6 +2346,24 @@ impl<'a> DFA<&'a [u32]> { dfa.accels.validate()?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. 
+ for state in dfa.states() { + // If the state is an accel state, then it must have a non-empty + // accelerator. + if dfa.is_accel_state(state.id()) { + let index = dfa.accelerator_index(state.id()); + if index >= dfa.accels.len() { + return Err(DeserializeError::generic( + "found DFA state with invalid accelerator index", + )); + } + let needles = dfa.accels.needles(index); + if !(1 <= needles.len() && needles.len() <= 3) { + return Err(DeserializeError::generic( + "accelerator needles has invalid length", + )); + } + } + } Ok((dfa, nread)) } @@ -2885,31 +2904,33 @@ impl OwnedDFA { fn set_universal_starts(&mut self) { assert_eq!(6, Start::len(), "expected 6 start configurations"); - let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { + let start_id = |dfa: &mut OwnedDFA, + anchored: Anchored, + start: Start| { // This OK because we only call 'start' under conditions // in which we know it will succeed. - dfa.st.start(inp, start).expect("valid Input configuration") + dfa.st.start(anchored, start).expect("valid Input configuration") }; if self.start_kind().has_unanchored() { - let inp = Input::new("").anchored(Anchored::No); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::No; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_unanchored = Some(sid); } } if self.start_kind().has_anchored() { - let inp = Input::new("").anchored(Anchored::Yes); - let sid = start_id(self, &inp, Start::NonWordByte); - if sid == start_id(self, &inp, Start::WordByte) - && sid == start_id(self, &inp, Start::Text) - && sid == start_id(self, &inp, Start::LineLF) - && sid == start_id(self, &inp, Start::LineCR) - && sid == start_id(self, &inp, Start::CustomLineTerminator) + let anchor = Anchored::Yes; + let sid = start_id(self, anchor, Start::NonWordByte); + if sid == start_id(self, anchor, Start::WordByte) + && sid == start_id(self, anchor, Start::Text) + && sid == start_id(self, anchor, Start::LineLF) + && sid == start_id(self, anchor, Start::LineCR) + && sid == start_id(self, anchor, Start::CustomLineTerminator) { self.st.universal_start_anchored = Some(sid); } @@ -3216,35 +3237,21 @@ unsafe impl> Automaton for DFA { } #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_forward( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[cfg_attr(feature = "perf-inline", inline(always))] - fn start_state_reverse( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { 
+ let anchored = config.get_anchored();
+ let start = match config.get_look_behind() {
+ None => Start::Text,
+ Some(byte) => {
+ if !self.quitset.is_empty() && self.quitset.contains(byte) {
+ return Err(StartError::quit(byte));
+ }
+ self.st.start_map.get(byte)
 }
- }
- let start = self.st.start_map.rev(&input);
- self.st.start(input, start)
+ };
+ self.st.start(anchored, start)
 }

 #[cfg_attr(feature = "perf-inline", inline(always))]
@@ -4180,28 +4187,27 @@ impl<T: AsRef<[u32]>> StartTable<T> {
 #[cfg_attr(feature = "perf-inline", inline(always))]
 fn start(
 &self,
- input: &Input<'_>,
+ anchored: Anchored,
 start: Start,
- ) -> Result<StateID, MatchError> {
+ ) -> Result<StateID, StartError> {
 let start_index = start.as_usize();
- let mode = input.get_anchored();
- let index = match mode {
+ let index = match anchored {
 Anchored::No => {
 if !self.kind.has_unanchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
 }
 start_index
 }
 Anchored::Yes => {
 if !self.kind.has_anchored() {
- return Err(MatchError::unsupported_anchored(mode));
+ return Err(StartError::unsupported_anchored(anchored));
 }
 self.stride + start_index
 }
 Anchored::Pattern(pid) => {
 let len = match self.pattern_len {
 None => {
- return Err(MatchError::unsupported_anchored(mode))
+ return Err(StartError::unsupported_anchored(anchored))
 }
 Some(len) => len,
 };
@@ -5086,6 +5092,8 @@ impl core::fmt::Display for BuildError {

 #[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
 mod tests {
+ use crate::{Input, MatchError};
+
 use super::*;

 #[test]
diff --git a/vendor/regex-automata/src/dfa/mod.rs b/vendor/regex-automata/src/dfa/mod.rs
index 4bb870435..fd58cac23 100644
--- a/vendor/regex-automata/src/dfa/mod.rs
+++ b/vendor/regex-automata/src/dfa/mod.rs
@@ -320,7 +320,7 @@ dramatically.

 #[cfg(feature = "dfa-search")]
 pub use crate::dfa::{
- automaton::{Automaton, OverlappingState},
+ automaton::{Automaton, OverlappingState, StartError},
 start::StartKind,
 };

diff --git a/vendor/regex-automata/src/dfa/onepass.rs b/vendor/regex-automata/src/dfa/onepass.rs
index 44691d0c8..e62bbd383 100644
--- a/vendor/regex-automata/src/dfa/onepass.rs
+++ b/vendor/regex-automata/src/dfa/onepass.rs
@@ -2581,10 +2581,11 @@ impl Cache {
 /// Represents a single transition in a one-pass DFA.
 ///
-/// The high 24 bits corresponds to the state ID. The low 48 bits corresponds
-/// to the transition epsilons, which contains the slots that should be saved
-/// when this transition is followed and the conditional epsilon transitions
-/// that must be satisfied in order to follow this transition.
+/// The high 21 bits correspond to the state ID. The bit following corresponds
+/// to the special "match wins" flag. The remaining low 42 bits correspond to
+/// the transition epsilons, which contain the slots that should be saved when
+/// this transition is followed and the conditional epsilon transitions that
+/// must be satisfied in order to follow this transition.
 #[derive(Clone, Copy, Eq, PartialEq)]
 struct Transition(u64);

@@ -2741,7 +2742,7 @@ impl PatternEpsilons {
 fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons {
 PatternEpsilons(
 (self.0 & PatternEpsilons::PATTERN_ID_MASK)
- | u64::from(epsilons.0),
+ | (u64::from(epsilons.0) & PatternEpsilons::EPSILONS_MASK),
 )
 }
 }

@@ -2814,12 +2815,15 @@ impl Epsilons {
 /// Return the set of look-around assertions in these epsilon transitions.
fn looks(self) -> LookSet { - LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() } + LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() } } /// Set the look-around assertions on these epsilon transitions. fn set_looks(self, look_set: LookSet) -> Epsilons { - Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits)) + Epsilons( + (self.0 & Epsilons::SLOT_MASK) + | (u64::from(look_set.bits) & Epsilons::LOOK_MASK), + ) } } diff --git a/vendor/regex-automata/src/dfa/regex.rs b/vendor/regex-automata/src/dfa/regex.rs index f39c1c055..5e7e6e38a 100644 --- a/vendor/regex-automata/src/dfa/regex.rs +++ b/vendor/regex-automata/src/dfa/regex.rs @@ -853,7 +853,7 @@ impl Builder { } /// Set the dense DFA compilation configuration for this builder using - /// [`dense::Config`](dense::Config). + /// [`dense::Config`]. /// /// This permits setting things like whether the underlying DFAs should /// be minimized. diff --git a/vendor/regex-automata/src/dfa/search.rs b/vendor/regex-automata/src/dfa/search.rs index 8c012a594..5a82261f9 100644 --- a/vendor/regex-automata/src/dfa/search.rs +++ b/vendor/regex-automata/src/dfa/search.rs @@ -176,7 +176,6 @@ fn find_fwd_imp( // It's important that this is a debug_assert, since this can // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA. - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -297,7 +296,6 @@ fn find_rev_imp( } else if dfa.is_dead_state(sid) { return Ok(mat); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.haystack()[at], at)); } } @@ -422,7 +420,6 @@ fn find_overlapping_fwd_imp( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -526,7 +523,6 @@ pub(crate) fn find_overlapping_rev( } else if dfa.is_dead_state(sid) { return Ok(()); } else { - debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit( input.haystack()[state.at], state.at, @@ -600,9 +596,6 @@ fn eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } } Ok(()) @@ -631,9 +624,6 @@ fn eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } - // N.B. We don't have to check 'is_quit' here because the EOI - // transition can never lead to a quit state. - debug_assert!(!dfa.is_quit_state(*sid)); } Ok(()) } diff --git a/vendor/regex-automata/src/dfa/sparse.rs b/vendor/regex-automata/src/dfa/sparse.rs index 5d8ec2340..d461e0a0f 100644 --- a/vendor/regex-automata/src/dfa/sparse.rs +++ b/vendor/regex-automata/src/dfa/sparse.rs @@ -3,13 +3,12 @@ Types and routines specific to sparse DFAs. This module is the home of [`sparse::DFA`](DFA). -Unlike the [`dense`](super::dense) module, this module does not contain a -builder or configuration specific for sparse DFAs. Instead, the intended -way to build a sparse DFA is either by using a default configuration with -its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the -construction of a dense DFA with [`dense::Builder`](super::dense::Builder) -and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). 
For -example, this configures a sparse DFA to do an overlapping search: +Unlike the [`dense`] module, this module does not contain a builder or +configuration specific for sparse DFAs. Instead, the intended way to build a +sparse DFA is either by using a default configuration with its constructor +[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a +dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. +For example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ @@ -52,7 +51,7 @@ use alloc::{vec, vec::Vec}; use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ - automaton::{fmt_state_indicator, Automaton}, + automaton::{fmt_state_indicator, Automaton, StartError}, dense::Flags, special::Special, StartKind, DEAD, @@ -63,8 +62,8 @@ use crate::{ int::{Pointer, Usize, U16, U32}, prefilter::Prefilter, primitives::{PatternID, StateID}, - search::{Anchored, Input, MatchError}, - start::{Start, StartByteMap}, + search::Anchored, + start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -74,18 +73,17 @@ const VERSION: u32 = 2; /// A sparse deterministic finite automaton (DFA) with variable sized states. /// -/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses -/// a more space efficient representation for its transitions. Consequently, -/// sparse DFAs may use much less memory than dense DFAs, but this comes at a -/// price. In particular, reading the more space efficient transitions takes -/// more work, and consequently, searching using a sparse DFA is typically -/// slower than a dense DFA. +/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient +/// representation for its transitions. Consequently, sparse DFAs may use much +/// less memory than dense DFAs, but this comes at a price. In particular, +/// reading the more space efficient transitions takes more work, and +/// consequently, searching using a sparse DFA is typically slower than a dense +/// DFA. /// /// A sparse DFA can be built using the default configuration via the -/// [`DFA::new`] constructor. Otherwise, one can configure various aspects -/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), -/// and then convert a dense DFA to a sparse DFA using -/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). +/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a +/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse +/// DFA using [`dense::DFA::to_sparse`]. /// /// In general, a sparse DFA supports all the same search operations as a dense /// DFA. @@ -140,11 +138,9 @@ impl DFA> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. /// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -167,11 +163,9 @@ impl DFA> { /// Parse the given regular expressions using a default configuration and /// return the corresponding multi-DFA. 
/// - /// If you want a non-default configuration, then use - /// the [`dense::Builder`](crate::dfa::dense::Builder) - /// to set your own configuration, and then call - /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create - /// a sparse DFA. + /// If you want a non-default configuration, then use the + /// [`dense::Builder`] to set your own configuration, and then call + /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// @@ -511,10 +505,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -553,10 +546,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// @@ -595,10 +587,9 @@ impl> DFA { /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// - /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s - /// serialization methods, this does not add any initial padding to the - /// returned bytes. Padding isn't required for sparse DFAs since they have - /// no alignment requirements. + /// Note that unlike a [`dense::DFA`]'s serialization methods, this does + /// not add any initial padding to the returned bytes. Padding isn't + /// required for sparse DFAs since they have no alignment requirements. /// /// Generally speaking, native endian format should only be used when /// you know that the target you're compiling the DFA for matches the @@ -903,9 +894,9 @@ impl<'a> DFA<&'a [u8]> { /// /// If any of the above are not true, then an error will be returned. /// - /// Note that unlike deserializing a - /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has - /// no alignment requirements. That is, an alignment of `1` is valid. + /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse + /// DFA has no alignment requirements. That is, an alignment of `1` is + /// valid. /// /// # Panics /// @@ -1001,8 +992,8 @@ impl<'a> DFA<&'a [u8]> { // (by trying to decode every state) and start state ID list below. If // either validation fails, then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate(&dfa.special)?; - dfa.st.validate(&dfa.special, &dfa.tt)?; + let seen = dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &seen)?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. 
Ok((dfa, nread)) @@ -1207,35 +1198,21 @@ unsafe impl> Automaton for DFA { } #[inline] - fn start_state_forward( + fn start_state( &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); - } - } - let start = self.st.start_map.fwd(&input); - self.st.start(input, start) - } - - #[inline] - fn start_state_reverse( - &self, - input: &Input<'_>, - ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + config: &start::Config, + ) -> Result { + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.st.start_map.get(byte) } - } - let start = self.st.start_map.rev(&input); - self.st.start(input, start) + }; + self.st.start(anchored, start) } #[inline] @@ -1411,63 +1388,8 @@ impl> Transitions { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { - // In order to validate everything, we not only need to make sure we - // can decode every state, but that every transition in every state - // points to a valid state. There are many duplicative transitions, so - // we record state IDs that we've verified so that we don't redo the - // decoding work. - // - // Except, when in no_std mode, we don't have dynamic memory allocation - // available to us, so we skip this optimization. It's not clear - // whether doing something more clever is worth it just yet. If you're - // profiling this code and need it to run faster, please file an issue. - // - // OK, so we also use this to record the set of valid state IDs. Since - // it is possible for a transition to point to an invalid state ID that - // still (somehow) deserializes to a valid state. So we need to make - // sure our transitions are limited to actually correct state IDs. - // The problem is, I'm not sure how to do this verification step in - // no-std no-alloc mode. I think we'd *have* to store the set of valid - // state IDs in the DFA itself. For now, we don't do this verification - // in no-std no-alloc mode. The worst thing that can happen is an - // incorrect result. But no panics or memory safety problems should - // result. Because we still do validate that the state itself is - // "valid" in the sense that everything it points to actually exists. 
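The rewritten `start_state` above keys entirely off two inputs: the anchored mode and an optional look-behind byte. A hedged sketch of driving it through `util::start::Config`, assuming the `anchored` and `look_behind` builder methods that accompany the `get_anchored`/`get_look_behind` getters used in the diff:

```
use regex_automata::{
    dfa::{sparse::DFA, Automaton},
    util::start,
    Anchored,
};

let dfa = DFA::new(r"(?m)^\w+")?;
// As if searching from the very start of a haystack: no look-behind
// byte, so assertions like `^` are satisfied.
let config = start::Config::new().anchored(Anchored::No);
let _sid = dfa.start_state(&config)?;
// As if the byte just before the search position were '\n': now
// `(?m:^)` is satisfied, but `^` is not.
let config = start::Config::new().look_behind(Some(b'\n'));
let _sid = dfa.start_state(&config)?;

Ok::<(), Box<dyn std::error::Error>>(())
```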
- // - // ---AG - struct Seen { - #[cfg(feature = "alloc")] - set: alloc::collections::BTreeSet, - #[cfg(not(feature = "alloc"))] - set: core::marker::PhantomData, - } - - #[cfg(feature = "alloc")] - impl Seen { - fn new() -> Seen { - Seen { set: alloc::collections::BTreeSet::new() } - } - fn insert(&mut self, id: StateID) { - self.set.insert(id); - } - fn contains(&self, id: &StateID) -> bool { - self.set.contains(id) - } - } - - #[cfg(not(feature = "alloc"))] - impl Seen { - fn new() -> Seen { - Seen { set: core::marker::PhantomData } - } - fn insert(&mut self, _id: StateID) {} - fn contains(&self, _id: &StateID) -> bool { - false - } - } - - let mut verified: Seen = Seen::new(); + fn validate(&self, sp: &Special) -> Result { + let mut verified = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the // recorded state length is non-empty. @@ -1544,7 +1466,7 @@ impl> Transitions { "mismatching sparse state length", )); } - Ok(()) + Ok(verified) } /// Converts these transitions to a borrowed value. @@ -1682,7 +1604,7 @@ impl> Transitions { let state = &state[nr..]; if npats == 0 { return Err(DeserializeError::generic( - "state marked as a match, but has no pattern IDs", + "state marked as a match, but pattern length is zero", )); } @@ -1704,6 +1626,21 @@ impl> Transitions { } else { (&[][..], state) }; + if is_match && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) && pattern_ids.is_empty() { + return Err(DeserializeError::generic( + "state marked special as a match, but has no pattern IDs", + )); + } + if sp.is_match_state(id) != is_match { + return Err(DeserializeError::generic( + "whether state is a match or not is inconsistent", + )); + } // Now read this state's accelerator info. The first byte is the length // of the accelerator, which is typically 0 (for no acceleration) but @@ -2084,28 +2021,19 @@ impl> StartTable { fn validate( &self, sp: &Special, - trans: &Transitions, + seen: &Seen, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { + if !seen.contains(&id) { + return Err(DeserializeError::generic( + "found invalid start state ID", + )); + } if sp.is_match_state(id) { return Err(DeserializeError::generic( "start states cannot be match states", )); } - // Confirm that the start state points to a valid state. - let state = trans.try_state(sp, id)?; - // And like for the transition table, confirm that the transitions - // on all start states themselves point to a valid state. - // - // It'd probably be better to integrate this validation with the - // transition table, or otherwise store a sorted sequence of all - // valid state IDs in the sparse DFA itself. That way, we could - // check that every pointer to a state corresponds precisely to a - // correct and valid state. - for i in 0..state.ntrans { - let to = state.next_at(i); - let _ = trans.try_state(sp, to)?; - } } Ok(()) } @@ -2145,28 +2073,27 @@ impl> StartTable { /// panics. 
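Seen from the caller's side, the stricter validation means corruption of a serialized sparse DFA should surface as a `DeserializeError`, not as a panic or memory unsafety. Which specific check fires depends on what is clobbered; flipping the first byte (illustrative) trips the header checks, while a bad start state ID would be caught by the new seen-set check above:

```
use regex_automata::dfa::sparse::DFA;

let mut bytes = DFA::new("[a-z]+")?.to_bytes_native_endian();
// Corrupt the serialized DFA. Validation decodes every state and checks
// start states against the IDs actually seen, so this must be rejected.
bytes[0] = !bytes[0];
assert!(DFA::<&[u8]>::from_bytes(&bytes).is_err());

Ok::<(), Box<dyn std::error::Error>>(())
```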
fn start( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { - return Err(MatchError::unsupported_anchored(mode)) + return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; @@ -2561,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> { } } +// In order to validate everything, we not only need to make sure we +// can decode every state, but that every transition in every state +// points to a valid state. There are many duplicative transitions, so +// we record state IDs that we've verified so that we don't redo the +// decoding work. +// +// Except, when in no_std mode, we don't have dynamic memory allocation +// available to us, so we skip this optimization. It's not clear +// whether doing something more clever is worth it just yet. If you're +// profiling this code and need it to run faster, please file an issue. +// +// OK, so we also use this to record the set of valid state IDs. Since +// it is possible for a transition to point to an invalid state ID that +// still (somehow) deserializes to a valid state. So we need to make +// sure our transitions are limited to actually correct state IDs. +// The problem is, I'm not sure how to do this verification step in +// no-std no-alloc mode. I think we'd *have* to store the set of valid +// state IDs in the DFA itself. For now, we don't do this verification +// in no-std no-alloc mode. The worst thing that can happen is an +// incorrect result. But no panics or memory safety problems should +// result. Because we still do validate that the state itself is +// "valid" in the sense that everything it points to actually exists. +// +// ---AG +#[derive(Debug)] +struct Seen { + #[cfg(feature = "alloc")] + set: alloc::collections::BTreeSet, + #[cfg(not(feature = "alloc"))] + set: core::marker::PhantomData, +} + +#[cfg(feature = "alloc")] +impl Seen { + fn new() -> Seen { + Seen { set: alloc::collections::BTreeSet::new() } + } + fn insert(&mut self, id: StateID) { + self.set.insert(id); + } + fn contains(&self, id: &StateID) -> bool { + self.set.contains(id) + } +} + +#[cfg(not(feature = "alloc"))] +impl Seen { + fn new() -> Seen { + Seen { set: core::marker::PhantomData } + } + fn insert(&mut self, _id: StateID) {} + fn contains(&self, _id: &StateID) -> bool { + true + } +} + /* /// A binary search routine specialized specifically to a sparse DFA state's /// transitions. 
Specifically, the transitions are defined as a set of pairs diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs index 67261c1a3..bd9179b19 100644 --- a/vendor/regex-automata/src/hybrid/dfa.rs +++ b/vendor/regex-automata/src/hybrid/dfa.rs @@ -13,7 +13,7 @@ use alloc::vec::Vec; use crate::{ hybrid::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::{LazyStateID, LazyStateIDError}, search, }, @@ -28,7 +28,7 @@ use crate::{ Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, }, sparse_set::SparseSets, - start::{Start, StartByteMap}, + start::{self, Start, StartByteMap}, }, }; @@ -1518,8 +1518,8 @@ impl DFA { Lazy::new(self, cache).cache_next_state(current, unit) } - /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. + /// Return the ID of the start state for this lazy DFA for the given + /// starting configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -1527,85 +1527,122 @@ impl DFA { /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for forward searches. + /// * Whether a "look-behind" byte exists. For example, the `^` anchor + /// matches if and only if there is no look-behind byte. + /// * The specific value of that look-behind byte. For example, a `(?m:^)` + /// assertion only matches when there is either no look-behind byte, or + /// when the look-behind byte is a line terminator. + /// + /// The [starting configuration](start::Config) provides the above + /// information. + /// + /// This routine can be used for either forward or reverse searches. + /// Although, as a convenience, if you have an [`Input`], then it + /// may be more succinct to use [`DFA::start_state_forward`] or + /// [`DFA::start_state_reverse`]. Note, for example, that the convenience + /// routines return a [`MatchError`] on failure where as this routine + /// returns a [`StartError`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`StartError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte + /// or if the cache has become inefficient). This can also return an + /// error if the given configuration contains an unsupported [`Anchored`] + /// configuration. 
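A usage sketch for the new `start_state` entry point defined just below, assuming the lazy DFA API as written in this patch (configuration names per `util::start::Config`):

```
use regex_automata::{hybrid::dfa::DFA, util::start, Anchored};

let dfa = DFA::new(r"(?m)^\w+")?;
let mut cache = dfa.create_cache();
// Compute a start state directly from a starting configuration: an
// unanchored search whose look-behind byte is a line terminator.
let config = start::Config::new()
    .anchored(Anchored::No)
    .look_behind(Some(b'\n'));
let sid = dfa.start_state(&mut cache, &config)?;
// The returned lazy state ID is real, never the "unknown" sentinel.
assert!(!sid.is_unknown());

Ok::<(), Box<dyn std::error::Error>>(())
```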
+ #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn start_state( + &self, + cache: &mut Cache, + config: &start::Config, + ) -> Result { + let lazy = LazyRef::new(self, cache); + let anchored = config.get_anchored(); + let start = match config.get_look_behind() { + None => Start::Text, + Some(byte) => { + if !self.quitset.is_empty() && self.quitset.contains(byte) { + return Err(StartError::quit(byte)); + } + self.start_map.get(byte) + } + }; + let start_id = lazy.get_cached_start_id(anchored, start)?; + if !start_id.is_unknown() { + return Ok(start_id); + } + Lazy::new(self, cache).cache_start_group(anchored, start) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). + /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. + /// + /// # Errors + /// + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.start() > 0 { - let offset = input.start() - 1; - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_forward(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.start()), + StartError::Quit { byte } => { + let offset = input + .start() + .checked_sub(1) + .expect("no quit in start without look-behind"); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.fwd(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Return the ID of the start state for this lazy DFA when executing a /// reverse search. /// - /// Unlike typical DFA implementations, the start state for DFAs in this - /// crate is dependent on a few different factors: - /// - /// * The [`Anchored`] mode of the search. Unanchored, anchored and - /// anchored searches for a specific [`PatternID`] all use different start - /// states. - /// * The position at which the search begins, via [`Input::start`]. This - /// and the byte immediately preceding the start of the search (if one - /// exists) influence which look-behind assertions are true at the start - /// of the search. This in turn influences which start state is selected. - /// * Whether the search is a forward or reverse search. This routine can - /// only be used for reverse searches. + /// This is a convenience routine for calling [`DFA::start_state`] that + /// converts the given [`Input`] to a [start configuration](start::Config). 
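For the convenience routine, the look-behind byte is derived from the `Input` itself: it is the byte immediately before the start of the search span, if any. A small sketch (haystack and span are illustrative):

```
use regex_automata::{hybrid::dfa::DFA, Input};

let dfa = DFA::new(r"[a-z]+")?;
let mut cache = dfa.create_cache();
// Searching 4..7 means the look-behind byte is the space at offset 3.
let input = Input::new("123 abc").range(4..7);
let _sid = dfa.start_state_forward(&mut cache, &input)?;

Ok::<(), Box<dyn std::error::Error>>(())
```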
+ /// Additionally, if an error occurs, it is converted from a [`StartError`] + /// to a [`MatchError`] using the offset information in the given + /// [`Input`]. /// /// # Errors /// - /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if - /// it sees a "quit" byte or if the cache has been cleared too many - /// times). This can also return an error if the given `Input` contains an - /// unsupported [`Anchored`] configuration. + /// This may return a [`MatchError`] if the search needs to give up when + /// determining the start state (for example, if it sees a "quit" byte or + /// if the cache has become inefficient). This can also return an error if + /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { - if !self.quitset.is_empty() && input.end() < input.haystack().len() { - let offset = input.end(); - let byte = input.haystack()[offset]; - if self.quitset.contains(byte) { - return Err(MatchError::quit(byte, offset)); + let config = start::Config::from_input_reverse(input); + self.start_state(cache, &config).map_err(|err| match err { + StartError::Cache { .. } => MatchError::gave_up(input.end()), + StartError::Quit { byte } => { + let offset = input.end(); + MatchError::quit(byte, offset) } - } - let start_type = self.start_map.rev(input); - let start = LazyRef::new(self, cache) - .get_cached_start_id(input, start_type)?; - if !start.is_unknown() { - return Ok(start); - } - Lazy::new(self, cache).cache_start_group(input, start_type) + StartError::UnsupportedAnchored { mode } => { + MatchError::unsupported_anchored(mode) + } + }) } /// Returns the total number of patterns that match in this state. @@ -2066,8 +2103,10 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Here's an example that justifies 'inline(never)' /// /// ```ignore - /// regex-cli find hybrid dfa \ - /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// regex-cli find match hybrid \ + /// --cache-capacity 100000000 \ + /// -p '\pL{100}' + /// all-codepoints-utf8-100x /// ``` /// /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every @@ -2122,16 +2161,15 @@ impl<'i, 'c> Lazy<'i, 'c> { #[inline(never)] fn cache_start_group( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { - let mode = input.get_anchored(); - let nfa_start_id = match mode { + ) -> Result { + let nfa_start_id = match anchored { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } match self.dfa.get_nfa().start_pattern(pid) { None => return Ok(self.as_ref().dead_id()), @@ -2142,8 +2180,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let id = self .cache_start_one(nfa_start_id, start) - .map_err(|_| MatchError::gave_up(input.start()))?; - self.set_start_state(input, start, id); + .map_err(StartError::cache)?; + self.set_start_state(anchored, start, id); Ok(id) } @@ -2574,13 +2612,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. 
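In practice, the `StartError` to `MatchError` conversions above are most visible through quit bytes. A sketch of the heuristic Unicode word boundary case this supports, mirroring the crate's documented behavior (haystacks illustrative):

```
use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};

let dfa = DFA::builder()
    .configure(DFA::config().unicode_word_boundary(true))
    .build(r"\b\w+\b")?;
let mut cache = dfa.create_cache();
// ASCII haystacks search fine...
let got = dfa.try_search_fwd(&mut cache, &Input::new("hello"))?;
assert_eq!(Some(HalfMatch::must(0, 5)), got);
// ...but any non-ASCII byte is in the quit set, so the search reports
// a `MatchError::quit` to the caller instead of guessing.
assert!(dfa.try_search_fwd(&mut cache, &Input::new("héllo")).is_err());

Ok::<(), Box<dyn std::error::Error>>(())
```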
fn set_start_state( &mut self, - input: &Input<'_>, + anchored: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match input.get_anchored() { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { @@ -2642,17 +2680,16 @@ impl<'i, 'c> LazyRef<'i, 'c> { #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - input: &Input<'_>, + anchored: Anchored, start: Start, - ) -> Result { + ) -> Result { let start_index = start.as_usize(); - let mode = input.get_anchored(); - let index = match mode { + let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { - return Err(MatchError::unsupported_anchored(mode)); + return Err(StartError::unsupported_anchored(anchored)); } if pid.as_usize() >= self.dfa.pattern_len() { return Ok(self.dead_id()); @@ -3178,12 +3215,12 @@ impl Config { /// be quit bytes _only_ when a Unicode word boundary is present in the /// pattern. /// - /// When enabling this option, callers _must_ be prepared to handle - /// a [`MatchError`](crate::MatchError) error during search. - /// When using a [`Regex`](crate::hybrid::regex::Regex), this - /// corresponds to using the `try_` suite of methods. Alternatively, - /// if callers can guarantee that their input is ASCII only, then a - /// [`MatchError::quit`] error will never be returned while searching. + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a + /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the + /// `try_` suite of methods. Alternatively, if callers can guarantee that + /// their input is ASCII only, then a [`MatchError::quit`] error will never + /// be returned while searching. /// /// This is disabled by default. /// @@ -3269,8 +3306,8 @@ impl Config { /// (The advantage being that non-ASCII quit bytes will only be added if a /// Unicode word boundary is in the pattern.) /// - /// When enabling this option, callers _must_ be prepared to handle a - /// [`MatchError`](crate::MatchError) error during search. When using a + /// When enabling this option, callers _must_ be prepared to + /// handle a [`MatchError`] error during search. When using a /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the /// `try_` suite of methods. /// @@ -3795,8 +3832,8 @@ impl Config { // // Test case: // - // regex-cli find hybrid regex -w @conn.json.1000x.log \ - // '^#' '\b10\.55\.182\.100\b' + // regex-cli find match hybrid --unicode-word-boundary \ + // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quit.is_empty() { set.add_set(&quit); } diff --git a/vendor/regex-automata/src/hybrid/error.rs b/vendor/regex-automata/src/hybrid/error.rs index 604daf3c3..d134e7ec9 100644 --- a/vendor/regex-automata/src/hybrid/error.rs +++ b/vendor/regex-automata/src/hybrid/error.rs @@ -1,4 +1,4 @@ -use crate::{hybrid::id::LazyStateIDError, nfa}; +use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored}; /// An error that occurs when initial construction of a lazy DFA fails. /// @@ -95,6 +95,113 @@ impl core::fmt::Display for BuildError { } } +/// An error that can occur when computing the start state for a search. 
+/// +/// Computing a start state can fail for a few reasons, either +/// based on incorrect configuration or even based on whether +/// the look-behind byte triggers a quit state. Typically +/// one does not need to handle this error if you're using +/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward) +/// (or its reverse counterpart), as that routine automatically converts +/// `StartError` to a [`MatchError`](crate::MatchError) for you. +/// +/// This error may be returned by the +/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine. +/// +/// This error implements the `std::error::Error` trait when the `std` feature +/// is enabled. +/// +/// This error is marked as non-exhaustive. New variants may be added in a +/// semver compatible release. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum StartError { + /// An error that occurs when cache inefficiency has dropped below the + /// configured heuristic thresholds. + Cache { + /// The underlying cache error that occurred. + err: CacheError, + }, + /// An error that occurs when a starting configuration's look-behind byte + /// is in this DFA's quit set. + Quit { + /// The quit byte that was found. + byte: u8, + }, + /// An error that occurs when the caller requests an anchored mode that + /// isn't supported by the DFA. + UnsupportedAnchored { + /// The anchored mode given that is unsupported. + mode: Anchored, + }, +} + +impl StartError { + pub(crate) fn cache(err: CacheError) -> StartError { + StartError::Cache { err } + } + + pub(crate) fn quit(byte: u8) -> StartError { + StartError::Quit { byte } + } + + pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { + StartError::UnsupportedAnchored { mode } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for StartError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match *self { + StartError::Cache { ref err } => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for StartError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match *self { + StartError::Cache { .. } => write!( + f, + "error computing start state because of cache inefficiency" + ), + StartError::Quit { byte } => write!( + f, + "error computing start state because the look-behind byte \ + {:?} triggered a quit state", + crate::util::escape::DebugByte(byte), + ), + StartError::UnsupportedAnchored { mode: Anchored::Yes } => { + write!( + f, + "error computing start state because \ + anchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { mode: Anchored::No } => { + write!( + f, + "error computing start state because \ + unanchored searches are not supported or enabled" + ) + } + StartError::UnsupportedAnchored { + mode: Anchored::Pattern(pid), + } => { + write!( + f, + "error computing start state because \ + anchored searches for a specific pattern ({}) \ + are not supported or enabled", + pid.as_usize(), + ) + } + } + } +} + /// An error that occurs when cache usage has become inefficient. 
/// /// One of the weaknesses of a lazy DFA is that it may need to clear its @@ -126,11 +233,7 @@ impl CacheError { } #[cfg(feature = "std")] -impl std::error::Error for CacheError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - None - } -} +impl std::error::Error for CacheError {} impl core::fmt::Display for CacheError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { diff --git a/vendor/regex-automata/src/hybrid/mod.rs b/vendor/regex-automata/src/hybrid/mod.rs index 44e67e129..2feb839d1 100644 --- a/vendor/regex-automata/src/hybrid/mod.rs +++ b/vendor/regex-automata/src/hybrid/mod.rs @@ -133,7 +133,7 @@ compiled DFAs. */ pub use self::{ - error::{BuildError, CacheError}, + error::{BuildError, CacheError, StartError}, id::LazyStateID, }; diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs index 75667daf9..b3b1fe317 100644 --- a/vendor/regex-automata/src/hybrid/regex.rs +++ b/vendor/regex-automata/src/hybrid/regex.rs @@ -878,7 +878,7 @@ impl Builder { } /// Set the lazy DFA compilation configuration for this builder using - /// [`dfa::Config`](dfa::Config). + /// [`dfa::Config`]. /// /// This permits setting things like whether Unicode word boundaries should /// be heuristically supported or settings how the behavior of the cache. diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs index f23283685..1f4a505db 100644 --- a/vendor/regex-automata/src/hybrid/search.rs +++ b/vendor/regex-automata/src/hybrid/search.rs @@ -105,14 +105,14 @@ fn find_fwd_imp( // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. This was used for a benchmark: // - // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // // PERF: For justification for the loop unrolling, we use a few // different tests: // - // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb - // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb - // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb + // regex-cli find half hybrid -p '\w{50}' -UBb bigfile + // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile + // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile // // And there are three different configurations: // @@ -353,7 +353,7 @@ fn find_rev_imp( // anchored and on shorter haystacks. However, this still makes a // difference. Take this command for example: // - // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb + // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. 
The 'regex' diff --git a/vendor/regex-automata/src/meta/limited.rs b/vendor/regex-automata/src/meta/limited.rs index 192a2625e..5653adc9a 100644 --- a/vendor/regex-automata/src/meta/limited.rs +++ b/vendor/regex-automata/src/meta/limited.rs @@ -69,9 +69,6 @@ pub(crate) fn dfa_try_search_half_rev( } else if dfa.is_dead_state(sid) { return Ok(mat); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -155,9 +152,6 @@ pub(crate) fn hybrid_try_search_half_rev( } else if sid.is_dead() { return Ok(mat); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } } @@ -209,9 +203,6 @@ fn dfa_eoi_rev( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { @@ -246,9 +237,6 @@ fn hybrid_eoi_rev( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(byte, sp.start - 1)); } } else { diff --git a/vendor/regex-automata/src/meta/regex.rs b/vendor/regex-automata/src/meta/regex.rs index 3a04b14d8..a06d2bb48 100644 --- a/vendor/regex-automata/src/meta/regex.rs +++ b/vendor/regex-automata/src/meta/regex.rs @@ -2706,7 +2706,7 @@ impl Config { /// you're compiling untrusted patterns. /// /// Note that this limit is applied to _each_ NFA built, and if any of - /// them excceed the limit, then construction will fail. This limit does + /// them exceed the limit, then construction will fail. This limit does /// _not_ correspond to the total memory used by all NFAs in the meta regex /// engine. /// @@ -3640,8 +3640,8 @@ mod tests { // I found this in the course of building out the benchmark suite for // rebar. 
#[test] - fn regression() { - env_logger::init(); + fn regression_suffix_literal_count() { + let _ = env_logger::try_init(); let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); assert_eq!(1, re.find_iter("tingling").count()); diff --git a/vendor/regex-automata/src/meta/stopat.rs b/vendor/regex-automata/src/meta/stopat.rs index e8d716689..c4dcd797a 100644 --- a/vendor/regex-automata/src/meta/stopat.rs +++ b/vendor/regex-automata/src/meta/stopat.rs @@ -81,9 +81,6 @@ pub(crate) fn dfa_try_search_half_fwd( } else if dfa.is_dead_state(sid) { return Ok(mat.ok_or(at)); } else if dfa.is_quit_state(sid) { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // Ideally we wouldn't use a DFA that specialized start states @@ -122,9 +119,6 @@ pub(crate) fn hybrid_try_search_half_fwd( } else if sid.is_dead() { return Ok(mat.ok_or(at)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(mat.ok_or(at)); - } return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // We should NEVER get an unknown state ID back from @@ -162,9 +156,6 @@ fn dfa_eoi_fwd( let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } @@ -201,9 +192,6 @@ fn hybrid_eoi_fwd( let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { - if mat.is_some() { - return Ok(()); - } return Err(MatchError::quit(b, sp.end)); } } diff --git a/vendor/regex-automata/src/meta/strategy.rs b/vendor/regex-automata/src/meta/strategy.rs index ea6c6ab57..04f2ba3c3 100644 --- a/vendor/regex-automata/src/meta/strategy.rs +++ b/vendor/regex-automata/src/meta/strategy.rs @@ -353,6 +353,7 @@ impl Pre<()> { // strategy when len(patterns)==1 if the number of literals is large. In that // case, literal extraction gives up and will return an infinite set.) impl Strategy for Pre
<P>
{ + #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { &self.group_info } @@ -378,6 +379,7 @@ impl Strategy for Pre
<P>
 {
         self.pre.memory_usage()
     }
 
+    #[cfg_attr(feature = "perf-inline", inline(always))]
     fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option<Match> {
         if input.is_done() {
             return None;
@@ -393,6 +395,7 @@ impl Strategy for Pre
<P>
{ .map(|sp| Match::new(PatternID::ZERO, sp)) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, @@ -401,10 +404,12 @@ impl Strategy for Pre
<P>
{ self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { self.search(cache, input).is_some() } + #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, @@ -421,6 +426,7 @@ impl Strategy for Pre
<P>
{ Some(m.pattern()) } + #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, @@ -1275,7 +1281,7 @@ impl ReverseSuffix { e.try_search_half_rev_limited(&input, min_start) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( - "using lazy DFA for reverse inner search at {:?}, \ + "using lazy DFA for reverse suffix search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, diff --git a/vendor/regex-automata/src/meta/wrappers.rs b/vendor/regex-automata/src/meta/wrappers.rs index 08110d9bb..6cb19ba0d 100644 --- a/vendor/regex-automata/src/meta/wrappers.rs +++ b/vendor/regex-automata/src/meta/wrappers.rs @@ -212,7 +212,10 @@ impl BoundedBacktrackerEngine { .configure(backtrack_config) .build_from_nfa(nfa.clone()) .map_err(BuildError::nfa)?; - debug!("BoundedBacktracker built"); + debug!( + "BoundedBacktracker built (max haystack length: {:?})", + engine.max_haystack_len() + ); Ok(Some(BoundedBacktrackerEngine(engine))) } #[cfg(not(feature = "nfa-backtrack"))] diff --git a/vendor/regex-automata/src/nfa/thompson/backtrack.rs b/vendor/regex-automata/src/nfa/thompson/backtrack.rs index eba037c1d..df99e456d 100644 --- a/vendor/regex-automata/src/nfa/thompson/backtrack.rs +++ b/vendor/regex-automata/src/nfa/thompson/backtrack.rs @@ -820,8 +820,11 @@ impl BoundedBacktracker { // bytes to the capacity in bits. let capacity = 8 * self.get_config().get_visited_capacity(); let blocks = div_ceil(capacity, Visited::BLOCK_SIZE); - let real_capacity = blocks * Visited::BLOCK_SIZE; - (real_capacity / self.nfa.states().len()) - 1 + let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE); + // It's possible for `real_capacity` to be smaller than the number of + // NFA states for particularly large regexes, so we saturate towards + // zero. + (real_capacity / self.nfa.states().len()).saturating_sub(1) } } @@ -1882,3 +1885,24 @@ fn div_ceil(lhs: usize, rhs: usize) -> usize { (lhs / rhs) + 1 } } + +#[cfg(test)] +mod tests { + use super::*; + + // This is a regression test for the maximum haystack length computation. + // Previously, it assumed that the total capacity of the backtracker's + // bitset would always be greater than the number of NFA states. But there + // is of course no guarantee that this is true. This regression test + // ensures that not only does `max_haystack_len` not panic, but that it + // should return `0`. + #[cfg(feature = "syntax")] + #[test] + fn max_haystack_len_overflow() { + let re = BoundedBacktracker::builder() + .configure(BoundedBacktracker::config().visited_capacity(10)) + .build(r"[0-9A-Za-z]{100}") + .unwrap(); + assert_eq!(0, re.max_haystack_len()); + } +} diff --git a/vendor/regex-automata/src/nfa/thompson/builder.rs b/vendor/regex-automata/src/nfa/thompson/builder.rs index b57e5bc0f..6b69e8784 100644 --- a/vendor/regex-automata/src/nfa/thompson/builder.rs +++ b/vendor/regex-automata/src/nfa/thompson/builder.rs @@ -61,7 +61,7 @@ enum State { Look { look: Look, next: StateID }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureStart { /// The ID of the pattern that this capture was defined. @@ -77,7 +77,7 @@ enum State { }, /// An empty state that records the end of a capture location. 
This is an /// unconditional epsilon transition like `Empty`, except it can be used to - /// record position information for a captue group when using the NFA for + /// record position information for a capture group when using the NFA for /// search. CaptureEnd { /// The ID of the pattern that this capture was defined. @@ -128,7 +128,7 @@ enum State { } impl State { - /// If this state is an unconditional espilon transition, then this returns + /// If this state is an unconditional epsilon transition, then this returns /// the target of the transition. fn goto(&self) -> Option { match *self { diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs index 065e9ef27..2d2172957 100644 --- a/vendor/regex-automata/src/nfa/thompson/compiler.rs +++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs @@ -1466,7 +1466,7 @@ impl Compiler { // compare and contrast performance of the Pike VM when the code below // is active vs the code above. Here's an example to try: // - // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru' + // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes @@ -1557,6 +1557,14 @@ impl Compiler { hir::Look::WordAsciiNegate => Look::WordAsciiNegate, hir::Look::WordUnicode => Look::WordUnicode, hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate, + hir::Look::WordStartAscii => Look::WordStartAscii, + hir::Look::WordEndAscii => Look::WordEndAscii, + hir::Look::WordStartUnicode => Look::WordStartUnicode, + hir::Look::WordEndUnicode => Look::WordEndUnicode, + hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii, + hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii, + hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode, + hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode, }; let id = self.add_look(look)?; Ok(ThompsonRef { start: id, end: id }) diff --git a/vendor/regex-automata/src/nfa/thompson/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs index c36ce5386..c92d4c0b8 100644 --- a/vendor/regex-automata/src/nfa/thompson/map.rs +++ b/vendor/regex-automata/src/nfa/thompson/map.rs @@ -65,7 +65,7 @@ const INIT: u64 = 14695981039346656037; /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// -/// hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" +/// hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. diff --git a/vendor/regex-automata/src/nfa/thompson/nfa.rs b/vendor/regex-automata/src/nfa/thompson/nfa.rs index 2108fa338..1f57f8ebd 100644 --- a/vendor/regex-automata/src/nfa/thompson/nfa.rs +++ b/vendor/regex-automata/src/nfa/thompson/nfa.rs @@ -1841,14 +1841,12 @@ impl SparseTransitions { // This is an alternative implementation that uses binary search. In // some ad hoc experiments, like // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file // // I could not observe any improvement, and in fact, things seemed to // be a bit slower. 
I can see an improvement in at least one benchmark: // - // allcpssmall=all-codepoints-utf8-10x - // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}' + // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 // // Where total search time goes from 3.2s to 2.4s when using binary // search. diff --git a/vendor/regex-automata/src/nfa/thompson/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs index 2d43a5b6f..75c9b796b 100644 --- a/vendor/regex-automata/src/nfa/thompson/range_trie.rs +++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs @@ -594,7 +594,7 @@ impl State { // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // - // hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'" + // hyperfine "regex-cli debug thompson -qr --no-captures '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } diff --git a/vendor/regex-automata/src/util/captures.rs b/vendor/regex-automata/src/util/captures.rs index cd3a5f8f7..05db6a993 100644 --- a/vendor/regex-automata/src/util/captures.rs +++ b/vendor/regex-automata/src/util/captures.rs @@ -433,7 +433,6 @@ impl Captures { /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long - /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; /// /// let re = PikeVM::new(r"^(?P\pL+)\s+(?P\pL+)$")?; @@ -445,6 +444,8 @@ impl Captures { /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2)); /// // Looking for a non-existent capturing group will return None: /// assert_eq!(None, caps.get_group(3)); + /// # // literals are too big for 32-bit usize: #1039 + /// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, caps.get_group(9944060567225171988)); /// /// # Ok::<(), Box>(()) diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs index 30a82afb8..ba32991d0 100644 --- a/vendor/regex-automata/src/util/determinize/mod.rs +++ b/vendor/regex-automata/src/util/determinize/mod.rs @@ -145,9 +145,10 @@ pub(crate) fn next( } Some(_) => {} None => { - look_have = look_have.insert(Look::End); - look_have = look_have.insert(Look::EndLF); - look_have = look_have.insert(Look::EndCRLF); + look_have = look_have + .insert(Look::End) + .insert(Look::EndLF) + .insert(Look::EndCRLF); } } if unit.is_byte(lookm.get_line_terminator()) { @@ -160,11 +161,26 @@ pub(crate) fn next( look_have = look_have.insert(Look::StartCRLF); } if state.is_from_word() == unit.is_word_byte() { - look_have = look_have.insert(Look::WordUnicodeNegate); - look_have = look_have.insert(Look::WordAsciiNegate); + look_have = look_have + .insert(Look::WordAsciiNegate) + .insert(Look::WordUnicodeNegate); } else { - look_have = look_have.insert(Look::WordUnicode); - look_have = look_have.insert(Look::WordAscii); + look_have = + look_have.insert(Look::WordAscii).insert(Look::WordUnicode); + } + if !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndHalfAscii) + .insert(Look::WordEndHalfUnicode); + } + if state.is_from_word() && !unit.is_word_byte() { + look_have = look_have + .insert(Look::WordEndAscii) + .insert(Look::WordEndUnicode); + } else if !state.is_from_word() && unit.is_word_byte() { + look_have = look_have + .insert(Look::WordStartAscii) + .insert(Look::WordStartUnicode); } // If we have new assertions satisfied that are among the set of // 
assertions that exist in this state (that is, just because we added @@ -220,6 +236,14 @@ pub(crate) fn next( { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } + // And also for the start-half word boundary assertions. As long as the + // look-behind byte is not a word char, then the assertions are satisfied. + if nfa.look_set_any().contains_word() && !unit.is_word_byte() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. } @@ -563,47 +587,95 @@ pub(crate) fn set_lookbehind_from_start( ) { let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); + let lookset = nfa.look_set_any(); match *start { - Start::NonWordByte => {} + Start::NonWordByte => { + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } + } Start::WordByte => { - builder.set_is_from_word(); + if lookset.contains_word() { + builder.set_is_from_word(); + } } Start::Text => { - builder.set_look_have(|have| { - have.insert(Look::Start) - .insert(Look::StartLF) - .insert(Look::StartCRLF) - }); + if lookset.contains_anchor_haystack() { + builder.set_look_have(|have| have.insert(Look::Start)); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| { + have.insert(Look::StartLF).insert(Look::StartCRLF) + }); + } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineLF => { if rev { - builder.set_is_half_crlf(); - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_crlf() { + builder.set_is_half_crlf(); + } + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } } else { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } } - if lineterm == b'\n' { + if lookset.contains_anchor_line() && lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::LineCR => { - if rev { - builder.set_look_have(|have| have.insert(Look::StartCRLF)); - } else { - builder.set_is_half_crlf(); + if lookset.contains_anchor_crlf() { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } } - if lineterm == b'\r' { + if lookset.contains_anchor_line() && lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } + if lookset.contains_word() { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } Start::CustomLineTerminator => { - builder.set_look_have(|have| have.insert(Look::StartLF)); + if lookset.contains_anchor_line() { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } // This is a bit of a tricky case, but if the line terminator was // set to a word byte, then we also need to behave as if the start // configuration is Start::WordByte. That is, we need to mark our // state as having come from a word byte. 
- if utf8::is_word_byte(lineterm) { - builder.set_is_from_word(); + if lookset.contains_word() { + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } else { + builder.set_look_have(|have| { + have.insert(Look::WordStartHalfAscii) + .insert(Look::WordStartHalfUnicode) + }); + } } } } diff --git a/vendor/regex-automata/src/util/determinize/state.rs b/vendor/regex-automata/src/util/determinize/state.rs index e64123587..effa6f44d 100644 --- a/vendor/regex-automata/src/util/determinize/state.rs +++ b/vendor/regex-automata/src/util/determinize/state.rs @@ -197,7 +197,7 @@ impl StateBuilderEmpty { } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { - self.0.extend_from_slice(&[0, 0, 0, 0, 0]); + self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } @@ -348,16 +348,17 @@ impl StateBuilderNFA { /// generated by a transition over a "word" byte. (Callers may not always set /// this. For example, if the NFA has no word boundary assertion, then needing /// to track whether a state came from a word byte or not is superfluous and -/// wasteful.) +/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition +/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is +/// enabled. /// -/// Byte 1 corresponds to the look-behind assertions that were satisfied by -/// the transition that created this state. This generally only includes the -/// StartLF and Start assertions. (Look-ahead assertions are not tracked as -/// part of states. Instead, these are applied by re-computing the epsilon -/// closure of a state when computing the transition function. See `next` in -/// the parent module.) +/// Bytes 1..5 correspond to the look-behind assertions that were satisfied +/// by the transition that created this state. (Look-ahead assertions are not +/// tracked as part of states. Instead, these are applied by re-computing the +/// epsilon closure of a state when computing the transition function. See +/// `next` in the parent module.) /// -/// Byte 2 corresponds to the set of look-around assertions (including both +/// Bytes 5..9 correspond to the set of look-around assertions (including both /// look-behind and look-ahead) that appear somewhere in this state's set of /// NFA state IDs. This is used to determine whether this state's epsilon /// closure should be re-computed when computing the transition function. @@ -366,7 +367,7 @@ impl StateBuilderNFA { /// function, we should only re-compute the epsilon closure if those new /// assertions are relevant to this particular state. /// -/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer +/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer /// corresponding to the number of patterns encoded in this state. If the state /// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is /// PatternID::ZERO, then no integer is encoded at this position. Instead, byte @@ -452,7 +453,7 @@ impl<'a> Repr<'a> { /// state has no conditional epsilon transitions, then there is no need /// to re-compute the epsilon closure. fn look_need(&self) -> LookSet { - LookSet::read_repr(&self.0[3..]) + LookSet::read_repr(&self.0[5..]) } /// Returns the total number of match pattern IDs in this state. @@ -476,7 +477,7 @@ impl<'a> Repr<'a> { if !self.has_pattern_ids() { PatternID::ZERO } else { - let offset = 9 + index * PatternID::SIZE; + let offset = 13 + index * PatternID::SIZE; // This is OK since we only ever serialize valid PatternIDs to // states. 
wire::read_pattern_id_unchecked(&self.0[offset..]).0 @@ -507,7 +508,7 @@ impl<'a> Repr<'a> { f(PatternID::ZERO); return; } - let mut pids = &self.0[9..self.pattern_offset_end()]; + let mut pids = &self.0[13..self.pattern_offset_end()]; while !pids.is_empty() { let pid = wire::read_u32(pids); pids = &pids[PatternID::SIZE..]; @@ -539,11 +540,11 @@ impl<'a> Repr<'a> { fn pattern_offset_end(&self) -> usize { let encoded = self.encoded_pattern_len(); if encoded == 0 { - return 5; + return 9; } // This arithmetic is OK since we were able to address this many bytes // when writing to the state, thus, it must fit into a usize. - encoded.checked_mul(4).unwrap().checked_add(9).unwrap() + encoded.checked_mul(4).unwrap().checked_add(13).unwrap() } /// Returns the total number of *encoded* pattern IDs in this state. @@ -557,7 +558,7 @@ impl<'a> Repr<'a> { } // This unwrap is OK since the total number of patterns is always // guaranteed to fit into a usize. - usize::try_from(wire::read_u32(&self.0[5..9])).unwrap() + usize::try_from(wire::read_u32(&self.0[9..13])).unwrap() } } @@ -643,7 +644,7 @@ impl<'a> ReprVec<'a> { /// Mutate the set of look-around (both behind and ahead) assertions that /// appear at least once in this state's set of NFA states. fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { - set(self.look_need()).write_repr(&mut self.0[3..]); + set(self.look_need()).write_repr(&mut self.0[5..]); } /// Add a pattern ID to this state. All match states must have at least @@ -703,14 +704,14 @@ impl<'a> ReprVec<'a> { return; } let patsize = PatternID::SIZE; - let pattern_bytes = self.0.len() - 9; + let pattern_bytes = self.0.len() - 13; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); - wire::NE::write_u32(count32, &mut self.0[5..9]); + wire::NE::write_u32(count32, &mut self.0[9..13]); } /// Add an NFA state ID to this state. The order in which NFA states are diff --git a/vendor/regex-automata/src/util/lazy.rs b/vendor/regex-automata/src/util/lazy.rs index de27a2a6e..0d0b4fb2a 100644 --- a/vendor/regex-automata/src/util/lazy.rs +++ b/vendor/regex-automata/src/util/lazy.rs @@ -384,11 +384,7 @@ mod lazy { // SAFETY: state is DONE if and only if data has been fully // initialized. At which point, it is safe to drop. unsafe { - // MSRV(1.60): Use assume_init_drop. The below is how - // assume_init_drop is implemented. - core::ptr::drop_in_place( - (*self.data.as_ptr()).as_mut_ptr(), - ) + self.data.get_mut().assume_init_drop(); } } } diff --git a/vendor/regex-automata/src/util/look.rs b/vendor/regex-automata/src/util/look.rs index aee31b34e..73e51c0f6 100644 --- a/vendor/regex-automata/src/util/look.rs +++ b/vendor/regex-automata/src/util/look.rs @@ -96,6 +96,42 @@ pub enum Look { WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, + /// Match the start of an ASCII-only word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartAscii = 1 << 10, + /// Match the end of an ASCII-only word boundary. 
That is, this matches + /// a position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndAscii = 1 << 11, + /// Match the start of a Unicode word boundary. That is, this matches a + /// position at either the beginning of the haystack or where the previous + /// character is not a word character and the following character is a word + /// character. + WordStartUnicode = 1 << 12, + /// Match the end of a Unicode word boundary. That is, this matches a + /// position at either the end of the haystack or where the previous + /// character is a word character and the following character is not a word + /// character. + WordEndUnicode = 1 << 13, + /// Match the start half of an ASCII-only word boundary. That is, this + /// matches a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfAscii = 1 << 14, + /// Match the end half of an ASCII-only word boundary. That is, this + /// matches a position at either the end of the haystack or where the + /// following character is not a word character. + WordEndHalfAscii = 1 << 15, + /// Match the start half of a Unicode word boundary. That is, this matches + /// a position at either the beginning of the haystack or where the + /// previous character is not a word character. + WordStartHalfUnicode = 1 << 16, + /// Match the end half of a Unicode word boundary. That is, this matches + /// a position at either the end of the haystack or where the following + /// character is not a word character. + WordEndHalfUnicode = 1 << 17, } impl Look { @@ -117,6 +153,14 @@ impl Look { Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, + Look::WordStartAscii => Look::WordEndAscii, + Look::WordEndAscii => Look::WordStartAscii, + Look::WordStartUnicode => Look::WordEndUnicode, + Look::WordEndUnicode => Look::WordStartUnicode, + Look::WordStartHalfAscii => Look::WordEndHalfAscii, + Look::WordEndHalfAscii => Look::WordStartHalfAscii, + Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, + Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } @@ -125,28 +169,36 @@ impl Look { /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] - pub const fn as_repr(self) -> u16 { + pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. - self as u16 + self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. 
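Since the representation widens from `u16` to `u32` in this change, a tiny round-trip sketch for one of the new variants (values per the bit assignments above):

```
use regex_automata::util::look::Look;

let look = Look::WordStartHalfUnicode;
// `as_repr` now returns a `u32`: 1 << 16 does not fit in a `u16`.
let repr = look.as_repr();
assert_eq!(Some(look), Look::from_repr(repr));
// Unassigned bits are rejected.
assert_eq!(None, Look::from_repr(1 << 20));
```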
     #[inline]
-    pub const fn from_repr(repr: u16) -> Option<Look> {
+    pub const fn from_repr(repr: u32) -> Option<Look> {
         match repr {
-            0b00_0000_0001 => Some(Look::Start),
-            0b00_0000_0010 => Some(Look::End),
-            0b00_0000_0100 => Some(Look::StartLF),
-            0b00_0000_1000 => Some(Look::EndLF),
-            0b00_0001_0000 => Some(Look::StartCRLF),
-            0b00_0010_0000 => Some(Look::EndCRLF),
-            0b00_0100_0000 => Some(Look::WordAscii),
-            0b00_1000_0000 => Some(Look::WordAsciiNegate),
-            0b01_0000_0000 => Some(Look::WordUnicode),
-            0b10_0000_0000 => Some(Look::WordUnicodeNegate),
+            0b00_0000_0000_0000_0001 => Some(Look::Start),
+            0b00_0000_0000_0000_0010 => Some(Look::End),
+            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
+            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
+            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
+            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
+            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
+            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
+            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
+            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
+            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
+            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
+            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
+            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
+            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
+            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
+            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
+            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
             _ => None,
         }
     }
@@ -171,6 +223,14 @@ impl Look {
             Look::WordAsciiNegate => 'B',
             Look::WordUnicode => '𝛃',
             Look::WordUnicodeNegate => '𝚩',
+            Look::WordStartAscii => '<',
+            Look::WordEndAscii => '>',
+            Look::WordStartUnicode => '〈',
+            Look::WordEndUnicode => '〉',
+            Look::WordStartHalfAscii => '◁',
+            Look::WordEndHalfAscii => '▷',
+            Look::WordStartHalfUnicode => '◀',
+            Look::WordEndHalfUnicode => '▶',
         }
     }
 }
@@ -184,14 +244,14 @@
 pub struct LookSet {
     /// The underlying representation of this set is exposed to make it
     /// possible to store it somewhere efficiently. The representation is that
-    /// of a bitset, where each assertion occupies bit `i` where `i =
-    /// Look::as_repr()`.
+    /// of a bitset, where each assertion occupies bit `i` where
+    /// `i = Look::as_repr()`.
     ///
     /// Note that users of this internal representation must permit the full
     /// range of `u32` values to be represented. For example, even if the
     /// current implementation only makes use of the 18 least significant
     /// bits, it may use more bits in a future semver compatible release.
-    pub bits: u16,
+    pub bits: u32,
 }
 
 impl LookSet {
@@ -294,13 +354,22 @@ impl LookSet {
     pub fn contains_word_unicode(self) -> bool {
         self.contains(Look::WordUnicode)
             || self.contains(Look::WordUnicodeNegate)
+            || self.contains(Look::WordStartUnicode)
+            || self.contains(Look::WordEndUnicode)
+            || self.contains(Look::WordStartHalfUnicode)
+            || self.contains(Look::WordEndHalfUnicode)
     }
 
     /// Returns true if and only if this set contains any ASCII word boundary
     /// or negated ASCII word boundary assertions.
#[inline] pub fn contains_word_ascii(self) -> bool { - self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordStartAscii) + || self.contains(Look::WordEndAscii) + || self.contains(Look::WordStartHalfAscii) + || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. @@ -379,29 +448,31 @@ impl LookSet { *self = self.intersect(other); } - /// Return a `LookSet` from the slice given as a native endian 16-bit + /// Return a `LookSet` from the slice given as a native endian 32-bit /// integer. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn read_repr(slice: &[u8]) -> LookSet { - let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); LookSet { bits } } - /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// Write a `LookSet` as a native endian 32-bit integer to the beginning /// of the slice given. /// /// # Panics /// - /// This panics if `slice.len() < 2`. + /// This panics if `slice.len() < 4`. #[inline] pub fn write_repr(self, slice: &mut [u8]) { let raw = self.bits.to_ne_bytes(); slice[0] = raw[0]; slice[1] = raw[1]; + slice[2] = raw[2]; + slice[3] = raw[3]; } /// Checks that all assertions in this set can be matched. @@ -456,9 +527,9 @@ impl Iterator for LookSetIter { return None; } // We'll never have more than u8::MAX distinct look-around assertions, - // so 'repr' will always fit into a u16. - let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); - let look = Look::from_repr(1 << repr)?; + // so 'bit' will always fit into a u16. + let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << bit)?; self.set = self.set.remove(look); Some(look) } @@ -566,6 +637,23 @@ impl LookMatcher { } /// Like `matches`, but forcefully inlined. + /// + /// # Panics + /// + /// This panics when testing any Unicode word boundary assertion in this + /// set and when the Unicode word data is not available. Specifically, this + /// only occurs when the `unicode-word-boundary` feature is not enabled. + /// + /// Since it's generally expected that this routine is called inside of + /// a matching engine, callers should check the error condition when + /// building the matching engine. If there is a Unicode word boundary + /// in the matcher and the data isn't available, then the matcher should + /// fail to build. + /// + /// Callers can check the error condition with [`LookSet::available`]. + /// + /// This also may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. 
#[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn matches_inline( &self, @@ -586,6 +674,26 @@ impl LookMatcher { Look::WordUnicodeNegate => { self.is_word_unicode_negate(haystack, at).unwrap() } + Look::WordStartAscii => self.is_word_start_ascii(haystack, at), + Look::WordEndAscii => self.is_word_end_ascii(haystack, at), + Look::WordStartUnicode => { + self.is_word_start_unicode(haystack, at).unwrap() + } + Look::WordEndUnicode => { + self.is_word_end_unicode(haystack, at).unwrap() + } + Look::WordStartHalfAscii => { + self.is_word_start_half_ascii(haystack, at) + } + Look::WordEndHalfAscii => { + self.is_word_end_half_ascii(haystack, at) + } + Look::WordStartHalfUnicode => { + self.is_word_start_half_unicode(haystack, at).unwrap() + } + Look::WordEndHalfUnicode => { + self.is_word_end_half_unicode(haystack, at).unwrap() + } } } @@ -680,6 +788,46 @@ impl LookMatcher { return false; } } + if set.contains(Look::WordStartAscii) { + if !self.is_word_start_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndAscii) { + if !self.is_word_end_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartUnicode) { + if !self.is_word_start_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndUnicode) { + if !self.is_word_end_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordStartHalfAscii) { + if !self.is_word_start_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordEndHalfAscii) { + if !self.is_word_end_half_ascii(haystack, at) { + return false; + } + } + if set.contains(Look::WordStartHalfUnicode) { + if !self.is_word_start_half_unicode(haystack, at).unwrap() { + return false; + } + } + if set.contains(Look::WordEndHalfUnicode) { + if !self.is_word_end_half_unicode(haystack, at).unwrap() { + return false; + } + } true } @@ -703,7 +851,15 @@ impl LookMatcher { Look::WordAscii | Look::WordAsciiNegate | Look::WordUnicode - | Look::WordUnicodeNegate => { + | Look::WordUnicodeNegate + | Look::WordStartAscii + | Look::WordEndAscii + | Look::WordStartUnicode + | Look::WordEndUnicode + | Look::WordStartHalfAscii + | Look::WordEndHalfAscii + | Look::WordStartHalfUnicode + | Look::WordEndHalfUnicode => { // We need to mark all ranges of bytes whose pairs result in // evaluating \b differently. This isn't technically correct // for Unicode word boundaries, but DFAs can't handle those @@ -931,6 +1087,177 @@ impl LookMatcher { }; Ok(word_before == word_after) } + + /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. + #[inline] + pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool { + let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); + let word_after = + at < haystack.len() && utf8::is_word_byte(haystack[at]); + !word_before && word_after + } + + /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given + /// position in `haystack`. + /// + /// # Panics + /// + /// This may panic when `at > haystack.len()`. Note that `at == + /// haystack.len()` is legal and guaranteed not to panic. 
+    #[inline]
+    pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        word_before && !word_after
+    }
+
+    /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_start_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        let word_before = is_word_char::rev(haystack, at)?;
+        let word_after = is_word_char::fwd(haystack, at)?;
+        Ok(!word_before && word_after)
+    }
+
+    /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_end_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        let word_before = is_word_char::rev(haystack, at)?;
+        let word_after = is_word_char::fwd(haystack, at)?;
+        Ok(word_before && !word_after)
+    }
+
+    /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_start_half_ascii(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> bool {
+        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
+        !word_before
+    }
+
+    /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    #[inline]
+    pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
+        let word_after =
+            at < haystack.len() && utf8::is_word_byte(haystack[at]);
+        !word_after
+    }
+
+    /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_start_half_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        // See `is_word_unicode_negate` for why we need to do this. We don't
+        // need to do it for `is_word_start_unicode` because that guarantees
+        // that the position matched falls on a valid UTF-8 boundary given
+        // that the right side must be in \w.
+        let word_before = at > 0
+            && match utf8::decode_last(&haystack[..at]) {
+                None | Some(Err(_)) => return Ok(false),
+                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
+            };
+        Ok(!word_before)
+    }
+
+    /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
+    /// given position in `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// This may panic when `at > haystack.len()`. Note that `at ==
+    /// haystack.len()` is legal and guaranteed not to panic.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error when Unicode word boundary tables
+    /// are not available. Specifically, this only occurs when the
+    /// `unicode-word-boundary` feature is not enabled.
+    #[inline]
+    pub fn is_word_end_half_unicode(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, UnicodeWordBoundaryError> {
+        // See `is_word_unicode_negate` for why we need to do this. We don't
+        // need to do it for `is_word_end_unicode` because that guarantees
+        // that the position matched falls on a valid UTF-8 boundary given
+        // that the left side must be in \w.
+        let word_after = at < haystack.len()
+            && match utf8::decode(&haystack[at..]) {
+                None | Some(Err(_)) => return Ok(false),
+                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
+            };
+        Ok(!word_after)
+    }
 }
 
 impl Default for LookMatcher {
@@ -1024,7 +1351,9 @@ impl core::fmt::Display for UnicodeWordBoundaryError {
 // There are perhaps other choices as well. Why did I stop at these 4? Because
 // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
 // approach eventually, as the benefits of the DFA approach are somewhat
-// compelling. The 'boundary-words-holmes' benchmark tests this:
+// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
+// the commands below no longer work. If necessary, we should recreate the
+// benchmark from scratch in rebar.)
 //
 // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
 //
@@ -1322,8 +1651,7 @@ mod is_word_char {
     fn is_word_character(c: char) -> bool {
         use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
 
-        // MSRV(1.59): Use 'u8::try_from(c)' instead.
-        if u8::try_from(u32::from(c)).map_or(false, utf8::is_word_byte) {
+        if u8::try_from(c).map_or(false, utf8::is_word_byte) {
             return true;
         }
         PERL_WORD
@@ -1656,50 +1984,478 @@ mod tests {
     }
 
     #[test]
-    fn look_set() {
-        let mut f = LookSet::default();
-        assert!(!f.contains(Look::Start));
-        assert!(!f.contains(Look::End));
-        assert!(!f.contains(Look::StartLF));
-        assert!(!f.contains(Look::EndLF));
-        assert!(!f.contains(Look::WordUnicode));
-        assert!(!f.contains(Look::WordUnicodeNegate));
-        assert!(!f.contains(Look::WordAscii));
-        assert!(!f.contains(Look::WordAsciiNegate));
+    fn look_matches_word_start_ascii() {
+        let look = Look::WordStartAscii;
 
-        f = f.insert(Look::Start);
-        assert!(f.contains(Look::Start));
-        f = f.remove(Look::Start);
-        assert!(!f.contains(Look::Start));
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
 
-        f = f.insert(Look::End);
-        assert!(f.contains(Look::End));
-        f = f.remove(Look::End);
-        assert!(!f.contains(Look::End));
+        // Simple ASCII word boundaries.
+        assert!(testlook!(look, "a", 0));
+        assert!(!testlook!(look, "a", 1));
+        assert!(!testlook!(look, "a ", 1));
+        assert!(testlook!(look, " a ", 1));
+        assert!(!testlook!(look, " a ", 2));
 
-        f = f.insert(Look::StartLF);
-        assert!(f.contains(Look::StartLF));
-        f = f.remove(Look::StartLF);
-        assert!(!f.contains(Look::StartLF));
+        // Unicode word boundaries with a non-ASCII codepoint.
Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); - f = f.insert(Look::EndLF); - assert!(f.contains(Look::EndLF)); - f = f.remove(Look::EndLF); - assert!(!f.contains(Look::EndLF)); + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); - f = f.insert(Look::StartCRLF); - assert!(f.contains(Look::StartCRLF)); - f = f.remove(Look::StartCRLF); - assert!(!f.contains(Look::StartCRLF)); + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); - f = f.insert(Look::EndCRLF); - assert!(f.contains(Look::EndCRLF)); - f = f.remove(Look::EndCRLF); - assert!(!f.contains(Look::EndCRLF)); + // Non word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); - f = f.insert(Look::WordUnicode); - assert!(f.contains(Look::WordUnicode)); - f = f.remove(Look::WordUnicode); + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_matches_word_end_ascii() { + let look = Look::WordEndAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. 
+ assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_unicode() { + let look = Look::WordStartUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(!testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_unicode() { + let look = Look::WordEndUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(!testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(!testlook!(look, "a ", 2)); + assert!(!testlook!(look, " a ", 0)); + assert!(!testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(!testlook!(look, "𝛃 ", 5)); + assert!(!testlook!(look, " 𝛃 ", 0)); + assert!(!testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. 
+        assert!(!testlook!(look, "𝛃𐆀", 1));
+        assert!(!testlook!(look, "𝛃𐆀", 2));
+        assert!(!testlook!(look, "𝛃𐆀", 3));
+        assert!(!testlook!(look, "𝛃𐆀", 5));
+        assert!(!testlook!(look, "𝛃𐆀", 6));
+        assert!(!testlook!(look, "𝛃𐆀", 7));
+        assert!(!testlook!(look, "𝛃𐆀", 8));
+    }
+
+    #[test]
+    fn look_matches_word_start_half_ascii() {
+        let look = Look::WordStartHalfAscii;
+
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+        // Simple ASCII word boundaries.
+        assert!(testlook!(look, "a", 0));
+        assert!(!testlook!(look, "a", 1));
+        assert!(!testlook!(look, "a ", 1));
+        assert!(testlook!(look, " a ", 1));
+        assert!(!testlook!(look, " a ", 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint. Unlike the
+        // full ASCII word boundary, the start half only requires that the
+        // previous character is not an ASCII word character, so all of
+        // these match.
+        assert!(testlook!(look, "𝛃", 0));
+        assert!(testlook!(look, "𝛃", 4));
+        assert!(testlook!(look, "𝛃 ", 4));
+        assert!(testlook!(look, " 𝛃 ", 1));
+        assert!(testlook!(look, " 𝛃 ", 5));
+
+        // Unicode word boundaries between non-ASCII codepoints. Again, all
+        // of these match since a non-ASCII byte is never an ASCII word byte.
+        assert!(testlook!(look, "𝛃𐆀", 0));
+        assert!(testlook!(look, "𝛃𐆀", 4));
+
+        // Non word boundaries for ASCII.
+        assert!(testlook!(look, "", 0));
+        assert!(!testlook!(look, "ab", 1));
+        assert!(testlook!(look, "a ", 2));
+        assert!(testlook!(look, " a ", 0));
+        assert!(testlook!(look, " a ", 3));
+
+        // Non word boundaries with a non-ASCII codepoint.
+        assert!(testlook!(look, "𝛃b", 4));
+        assert!(!testlook!(look, "b𝛃", 1));
+        assert!(testlook!(look, "𝛃 ", 5));
+        assert!(testlook!(look, " 𝛃 ", 0));
+        assert!(testlook!(look, " 𝛃 ", 6));
+        assert!(testlook!(look, "𝛃", 1));
+        assert!(testlook!(look, "𝛃", 2));
+        assert!(testlook!(look, "𝛃", 3));
+
+        // Non word boundaries with non-ASCII codepoints.
+        assert!(testlook!(look, "𝛃𐆀", 1));
+        assert!(testlook!(look, "𝛃𐆀", 2));
+        assert!(testlook!(look, "𝛃𐆀", 3));
+        assert!(testlook!(look, "𝛃𐆀", 5));
+        assert!(testlook!(look, "𝛃𐆀", 6));
+        assert!(testlook!(look, "𝛃𐆀", 7));
+        assert!(testlook!(look, "𝛃𐆀", 8));
+    }
+
+    #[test]
+    fn look_matches_word_end_half_ascii() {
+        let look = Look::WordEndHalfAscii;
+
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+        // Simple ASCII word boundaries.
+        assert!(!testlook!(look, "a", 0));
+        assert!(testlook!(look, "a", 1));
+        assert!(testlook!(look, "a ", 1));
+        assert!(!testlook!(look, " a ", 1));
+        assert!(testlook!(look, " a ", 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint. Unlike the
+        // full ASCII word boundary, the end half only requires that the
+        // following character is not an ASCII word character, so all of
+        // these match.
+        assert!(testlook!(look, "𝛃", 0));
+        assert!(testlook!(look, "𝛃", 4));
+        assert!(testlook!(look, "𝛃 ", 4));
+        assert!(testlook!(look, " 𝛃 ", 1));
+        assert!(testlook!(look, " 𝛃 ", 5));
+
+        // Unicode word boundaries between non-ASCII codepoints. Again, all
+        // of these match since a non-ASCII byte is never an ASCII word byte.
+        assert!(testlook!(look, "𝛃𐆀", 0));
+        assert!(testlook!(look, "𝛃𐆀", 4));
+
+        // Non word boundaries for ASCII.
+        assert!(testlook!(look, "", 0));
+        assert!(!testlook!(look, "ab", 1));
+        assert!(testlook!(look, "a ", 2));
+        assert!(testlook!(look, " a ", 0));
+        assert!(testlook!(look, " a ", 3));
+
+        // Non word boundaries with a non-ASCII codepoint.
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(testlook!(look, "𝛃", 1)); + assert!(testlook!(look, "𝛃", 2)); + assert!(testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 1)); + assert!(testlook!(look, "𝛃𐆀", 2)); + assert!(testlook!(look, "𝛃𐆀", 3)); + assert!(testlook!(look, "𝛃𐆀", 5)); + assert!(testlook!(look, "𝛃𐆀", 6)); + assert!(testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_start_half_unicode() { + let look = Look::WordStartHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(testlook!(look, "a", 0)); + assert!(!testlook!(look, "a", 1)); + assert!(!testlook!(look, "a ", 1)); + assert!(testlook!(look, " a ", 1)); + assert!(!testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(testlook!(look, "𝛃", 0)); + assert!(!testlook!(look, "𝛃", 4)); + assert!(!testlook!(look, "𝛃 ", 4)); + assert!(testlook!(look, " 𝛃 ", 1)); + assert!(!testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(testlook!(look, "𝛃𐆀", 0)); + assert!(!testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + #[cfg(all(not(miri), feature = "unicode-word-boundary"))] + fn look_matches_word_end_half_unicode() { + let look = Look::WordEndHalfUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!testlook!(look, "a", 0)); + assert!(testlook!(look, "a", 1)); + assert!(testlook!(look, "a ", 1)); + assert!(!testlook!(look, " a ", 1)); + assert!(testlook!(look, " a ", 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!testlook!(look, "𝛃", 0)); + assert!(testlook!(look, "𝛃", 4)); + assert!(testlook!(look, "𝛃 ", 4)); + assert!(!testlook!(look, " 𝛃 ", 1)); + assert!(testlook!(look, " 𝛃 ", 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 0)); + assert!(testlook!(look, "𝛃𐆀", 4)); + + // Non word boundaries for ASCII. + assert!(testlook!(look, "", 0)); + assert!(!testlook!(look, "ab", 1)); + assert!(testlook!(look, "a ", 2)); + assert!(testlook!(look, " a ", 0)); + assert!(testlook!(look, " a ", 3)); + + // Non word boundaries with a non-ASCII codepoint. 
+ assert!(!testlook!(look, "𝛃b", 4)); + assert!(!testlook!(look, "b𝛃", 1)); + assert!(testlook!(look, "𝛃 ", 5)); + assert!(testlook!(look, " 𝛃 ", 0)); + assert!(testlook!(look, " 𝛃 ", 6)); + assert!(!testlook!(look, "𝛃", 1)); + assert!(!testlook!(look, "𝛃", 2)); + assert!(!testlook!(look, "𝛃", 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!testlook!(look, "𝛃𐆀", 1)); + assert!(!testlook!(look, "𝛃𐆀", 2)); + assert!(!testlook!(look, "𝛃𐆀", 3)); + assert!(!testlook!(look, "𝛃𐆀", 5)); + assert!(!testlook!(look, "𝛃𐆀", 6)); + assert!(!testlook!(look, "𝛃𐆀", 7)); + assert!(testlook!(look, "𝛃𐆀", 8)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::Start)); + assert!(!f.contains(Look::End)); + assert!(!f.contains(Look::StartLF)); + assert!(!f.contains(Look::EndLF)); + assert!(!f.contains(Look::WordUnicode)); + assert!(!f.contains(Look::WordUnicodeNegate)); + assert!(!f.contains(Look::WordAscii)); + assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::Start); + assert!(f.contains(Look::Start)); + f = f.remove(Look::Start); + assert!(!f.contains(Look::Start)); + + f = f.insert(Look::End); + assert!(f.contains(Look::End)); + f = f.remove(Look::End); + assert!(!f.contains(Look::End)); + + f = f.insert(Look::StartLF); + assert!(f.contains(Look::StartLF)); + f = f.remove(Look::StartLF); + assert!(!f.contains(Look::StartLF)); + + f = f.insert(Look::EndLF); + assert!(f.contains(Look::EndLF)); + f = f.remove(Look::EndLF); + assert!(!f.contains(Look::EndLF)); + + f = f.insert(Look::StartCRLF); + assert!(f.contains(Look::StartCRLF)); + f = f.remove(Look::StartCRLF); + assert!(!f.contains(Look::StartCRLF)); + + f = f.insert(Look::EndCRLF); + assert!(f.contains(Look::EndCRLF)); + f = f.remove(Look::EndCRLF); + assert!(!f.contains(Look::EndCRLF)); + + f = f.insert(Look::WordUnicode); + assert!(f.contains(Look::WordUnicode)); + f = f.remove(Look::WordUnicode); assert!(!f.contains(Look::WordUnicode)); f = f.insert(Look::WordUnicodeNegate); @@ -1716,6 +2472,46 @@ mod tests { assert!(f.contains(Look::WordAsciiNegate)); f = f.remove(Look::WordAsciiNegate); assert!(!f.contains(Look::WordAsciiNegate)); + + f = f.insert(Look::WordStartAscii); + assert!(f.contains(Look::WordStartAscii)); + f = f.remove(Look::WordStartAscii); + assert!(!f.contains(Look::WordStartAscii)); + + f = f.insert(Look::WordEndAscii); + assert!(f.contains(Look::WordEndAscii)); + f = f.remove(Look::WordEndAscii); + assert!(!f.contains(Look::WordEndAscii)); + + f = f.insert(Look::WordStartUnicode); + assert!(f.contains(Look::WordStartUnicode)); + f = f.remove(Look::WordStartUnicode); + assert!(!f.contains(Look::WordStartUnicode)); + + f = f.insert(Look::WordEndUnicode); + assert!(f.contains(Look::WordEndUnicode)); + f = f.remove(Look::WordEndUnicode); + assert!(!f.contains(Look::WordEndUnicode)); + + f = f.insert(Look::WordStartHalfAscii); + assert!(f.contains(Look::WordStartHalfAscii)); + f = f.remove(Look::WordStartHalfAscii); + assert!(!f.contains(Look::WordStartHalfAscii)); + + f = f.insert(Look::WordEndHalfAscii); + assert!(f.contains(Look::WordEndHalfAscii)); + f = f.remove(Look::WordEndHalfAscii); + assert!(!f.contains(Look::WordEndHalfAscii)); + + f = f.insert(Look::WordStartHalfUnicode); + assert!(f.contains(Look::WordStartHalfUnicode)); + f = f.remove(Look::WordStartHalfUnicode); + assert!(!f.contains(Look::WordStartHalfUnicode)); + + f = f.insert(Look::WordEndHalfUnicode); + assert!(f.contains(Look::WordEndHalfUnicode)); + f = 
f.remove(Look::WordEndHalfUnicode);
+        assert!(!f.contains(Look::WordEndHalfUnicode));
     }
 
     #[test]
@@ -1724,7 +2520,7 @@ mod tests {
         assert_eq!(0, set.iter().count());
 
         let set = LookSet::full();
-        assert_eq!(10, set.iter().count());
+        assert_eq!(18, set.iter().count());
 
         let set =
             LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
@@ -1735,6 +2531,9 @@ mod tests {
 
         let set = LookSet::empty().insert(Look::WordAsciiNegate);
         assert_eq!(1, set.iter().count());
+
+        let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
+        assert_eq!(1, set.iter().count());
     }
 
     #[test]
@@ -1743,6 +2542,6 @@ mod tests {
         let res = alloc::format!("{:?}", LookSet::empty());
         assert_eq!("∅", res);
         let res = alloc::format!("{:?}", LookSet::full());
-        assert_eq!("Az^$rRbB𝛃𝚩", res);
+        assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
     }
 }
diff --git a/vendor/regex-automata/src/util/mod.rs b/vendor/regex-automata/src/util/mod.rs
index bb739df1d..b3eef64e6 100644
--- a/vendor/regex-automata/src/util/mod.rs
+++ b/vendor/regex-automata/src/util/mod.rs
@@ -40,6 +40,7 @@ pub mod look;
 pub mod pool;
 pub mod prefilter;
 pub mod primitives;
+pub mod start;
 #[cfg(feature = "syntax")]
 pub mod syntax;
 pub mod wire;
@@ -52,6 +53,5 @@ pub(crate) mod memchr;
 pub(crate) mod search;
 #[cfg(feature = "alloc")]
 pub(crate) mod sparse_set;
-pub(crate) mod start;
 pub(crate) mod unicode_data;
 pub(crate) mod utf8;
diff --git a/vendor/regex-automata/src/util/pool.rs b/vendor/regex-automata/src/util/pool.rs
index c03d7b013..d90d4ecff 100644
--- a/vendor/regex-automata/src/util/pool.rs
+++ b/vendor/regex-automata/src/util/pool.rs
@@ -177,6 +177,7 @@ impl<T: Send, F: Fn() -> T> Pool<T, F> {
     /// the value to go back into the pool) and then calling get again is
     /// *not* guaranteed to return the same value received in the first `get`
     /// call.
+    #[inline]
     pub fn get(&self) -> PoolGuard<'_, T, F> {
         PoolGuard(self.0.get())
     }
@@ -200,6 +201,7 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
     /// This circumvents the guard's `Drop` implementation. This can be useful
     /// in circumstances where the automatic `Drop` results in poorer codegen,
     /// such as calling non-inlined functions.
+    #[inline]
     pub fn put(this: PoolGuard<'_, T, F>) {
         inner::PoolGuard::put(this.0);
     }
@@ -208,12 +210,14 @@ impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
 impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> {
     type Target = T;
 
+    #[inline]
     fn deref(&self) -> &T {
         self.0.value()
     }
 }
 
 impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> {
+    #[inline]
     fn deref_mut(&mut self) -> &mut T {
         self.0.value_mut()
     }
@@ -451,11 +455,44 @@ mod inner {
         /// Create a new pool. The given closure is used to create values in
         /// the pool when necessary.
         pub(super) fn new(create: F) -> Pool<T, F> {
-            // MSRV(1.63): Mark this function as 'const'. I've arranged the
-            // code such that it should "just work." Then mark the public
-            // 'Pool::new' method as 'const' too. (The alloc-only Pool::new
-            // is already 'const', so that should "just work" too.) The only
-            // thing we're waiting for is Mutex::new to be const.
+            // FIXME: Now that we require 1.65+, Mutex::new is available as
+            // const... So we can almost mark this function as const. But of
+            // course, we're creating a Vec of stacks below (we didn't when I
+            // originally wrote this code). It seems like the best way to work
+            // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead
+            // of a `Vec`.
I refrained from making this change at time
+            // of writing (2023/10/08) because I was making a lot of other
+            // changes at the same time and wanted to do this more carefully.
+            // Namely, because of the cache line optimization, that `[Stack;
+            // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if
+            // at all) that would be.
+            //
+            // Another choice would be to lazily allocate the stacks, but...
+            // I'm not so sure about that. Seems like a fair bit of complexity?
+            //
+            // Maybe there's a simple solution I'm missing.
+            //
+            // ... OK, I tried to fix this. First, I did it by putting `stacks`
+            // in an `UnsafeCell` and using a `Once` to lazily initialize it.
+            // I benchmarked it and everything looked okay. I then made this
+            // function `const` and thought I was just about done. But the
+            // public pool type wraps its inner pool in a `Box` to keep its
+            // size down. Blech.
+            //
+            // So then I thought that I could push the box down into this
+            // type (and leave the non-std version unboxed) and use the same
+            // `UnsafeCell` technique to lazily initialize it. This has the
+            // downside of the `Once` now needing to get hit in the owner fast
+            // path, but maybe that's OK? However, I then realized that we can
+            // only lazily initialize `stacks`, `owner` and `owner_val`. The
+            // `create` function needs to be put somewhere outside of the box.
+            // So now the pool is a `Box`, `Once` and a function. Now we're
+            // starting to defeat the point of boxing in the first place. So I
+            // backed out that change too.
+            //
+            // Back to square one. Maybe we just don't make a pool's
+            // constructor const and live with it. It's probably not a huge
+            // deal.
             let mut stacks = Vec::with_capacity(MAX_POOL_STACKS);
             for _ in 0..stacks.capacity() {
                 stacks.push(CacheLine(Mutex::new(vec![])));
@@ -469,6 +506,7 @@ mod inner {
     impl<T: Send, F: Fn() -> T> Pool<T, F> {
         /// Get a value from the pool. This may block if another thread is also
         /// attempting to retrieve a value from the pool.
+        #[inline]
         pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
             // Our fast path checks if the caller is the thread that "owns"
             // this pool. Or stated differently, whether it is the first thread
@@ -562,6 +600,7 @@ mod inner {
         /// Puts a value back into the pool. Callers don't need to call this.
         /// Once the guard that's returned by 'get' is dropped, it is put back
         /// into the pool automatically.
+        #[inline]
         fn put_value(&self, value: Box<T>) {
             let caller = THREAD_ID.with(|id| *id);
             let stack_id = caller % self.stacks.len();
@@ -587,11 +626,13 @@ mod inner {
         }
 
         /// Create a guard that represents the special owned T.
+        #[inline]
         fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> {
             PoolGuard { pool: self, value: Err(caller), discard: false }
         }
 
         /// Create a guard that contains a value from the pool's stack.
+        #[inline]
         fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
             PoolGuard { pool: self, value: Ok(value), discard: false }
         }
@@ -599,6 +640,7 @@ mod inner {
         /// Create a guard that contains a value from the pool's stack with an
         /// instruction to throw away the value instead of putting it back
         /// into the pool.
+        #[inline]
         fn guard_stack_transient(&self, value: Box<T>) -> PoolGuard<'_, T, F> {
             PoolGuard { pool: self, value: Ok(value), discard: true }
         }
@@ -633,6 +675,7 @@ mod inner {
 
     impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
         /// Return the underlying value.
+        #[inline]
         pub(super) fn value(&self) -> &T {
             match self.value {
                 Ok(ref v) => &**v,
@@ -657,6 +700,7 @@ mod inner {
         }
 
         /// Return the underlying value as a mutable borrow.
+        #[inline]
         pub(super) fn value_mut(&mut self) -> &mut T {
             match self.value {
                 Ok(ref mut v) => &mut **v,
@@ -681,6 +725,7 @@ mod inner {
         }
 
         /// Consumes this guard and puts it back into the pool.
+        #[inline]
         pub(super) fn put(this: PoolGuard<'_, T, F>) {
             // Since this is effectively consuming the guard and putting the
             // value back into the pool, there's no reason to run its Drop
@@ -729,6 +774,7 @@ mod inner {
     }
 
     impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
+        #[inline]
         fn drop(&mut self) {
             self.put_imp();
         }
@@ -806,6 +852,7 @@ mod inner {
     impl<T: Send, F: Fn() -> T> Pool<T, F> {
         /// Get a value from the pool. This may block if another thread is also
         /// attempting to retrieve a value from the pool.
+        #[inline]
         pub(super) fn get(&self) -> PoolGuard<'_, T, F> {
             let mut stack = self.stack.lock();
             let value = match stack.pop() {
@@ -815,6 +862,7 @@ mod inner {
             PoolGuard { pool: self, value: Some(value) }
         }
 
+        #[inline]
         fn put(&self, guard: PoolGuard<'_, T, F>) {
             let mut guard = core::mem::ManuallyDrop::new(guard);
             if let Some(value) = guard.value.take() {
@@ -825,6 +873,7 @@ mod inner {
         /// Puts a value back into the pool. Callers don't need to call this.
         /// Once the guard that's returned by 'get' is dropped, it is put back
         /// into the pool automatically.
+        #[inline]
         fn put_value(&self, value: Box<T>) {
             let mut stack = self.stack.lock();
             stack.push(value);
@@ -847,16 +896,19 @@ mod inner {
 
     impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> {
         /// Return the underlying value.
+        #[inline]
         pub(super) fn value(&self) -> &T {
             self.value.as_deref().unwrap()
         }
 
         /// Return the underlying value as a mutable borrow.
+        #[inline]
         pub(super) fn value_mut(&mut self) -> &mut T {
             self.value.as_deref_mut().unwrap()
         }
 
         /// Consumes this guard and puts it back into the pool.
+        #[inline]
         pub(super) fn put(this: PoolGuard<'_, T, F>) {
             // Since this is effectively consuming the guard and putting the
             // value back into the pool, there's no reason to run its Drop
@@ -878,6 +930,7 @@ mod inner {
     }
 
     impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> {
+        #[inline]
         fn drop(&mut self) {
             self.put_imp();
         }
@@ -931,6 +984,7 @@ mod inner {
         /// Lock this mutex and return a guard providing exclusive access to
         /// `T`. This blocks if some other thread has already locked this
         /// mutex.
+        #[inline]
         fn lock(&self) -> MutexGuard<'_, T> {
             while self
                 .locked
@@ -963,18 +1017,21 @@ mod inner {
 
     impl<'a, T> core::ops::Deref for MutexGuard<'a, T> {
         type Target = T;
 
+        #[inline]
         fn deref(&self) -> &T {
             self.data
         }
     }
 
     impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> {
+        #[inline]
         fn deref_mut(&mut self) -> &mut T {
             self.data
         }
     }
 
     impl<'a, T> Drop for MutexGuard<'a, T> {
+        #[inline]
         fn drop(&mut self) {
             // Drop means 'data' is no longer accessible, so we can unlock
             // the mutex.
diff --git a/vendor/regex-automata/src/util/start.rs b/vendor/regex-automata/src/util/start.rs
index 4e360d083..27153780e 100644
--- a/vendor/regex-automata/src/util/start.rs
+++ b/vendor/regex-automata/src/util/start.rs
@@ -1,17 +1,195 @@
 /*!
-Provides some helpers for dealing with start state configurations in DFAs.
-
-[`Start`] represents the possible starting configurations, while
-[`StartByteMap`] represents a way to retrieve the `Start` configuration for a
-given position in a haystack.
+Provides helpers for dealing with start state configurations in DFAs.
 */
 
 use crate::util::{
     look::LookMatcher,
-    search::Input,
+    search::{Anchored, Input},
     wire::{self, DeserializeError, SerializeError},
 };
 
+/// The configuration used to determine a DFA's start state for a search.
+///
+/// A DFA has a single starting state in the typical textbook description. That
+/// is, it corresponds to the set of all starting states for the NFA that built
+/// it, along with their epsilon closures. In this crate, however, DFAs have
+/// many possible start states due to a few factors:
+///
+/// * DFAs support the ability to run either anchored or unanchored searches.
+/// Each type of search needs its own start state. For example, an unanchored
+/// search requires starting at a state corresponding to a regex with a
+/// `(?s-u:.)*?` prefix, which will match through anything.
+/// * DFAs also optionally support starting an anchored search for any one
+/// specific pattern. Each such pattern requires its own start state.
+/// * If a look-behind assertion like `^` or `\b` is used in the regex, then
+/// the DFA will need to inspect a single byte immediately before the start of
+/// the search to choose the correct start state.
+///
+/// Indeed, this configuration precisely encapsulates all of the above factors.
+/// The [`Config::anchored`] method sets which kind of anchored search to
+/// perform while the [`Config::look_behind`] method provides a way to set
+/// the byte that occurs immediately before the start of the search.
+///
+/// Generally speaking, this type is only useful when you want to run searches
+/// without using an [`Input`]. In particular, an `Input` wants a haystack
+/// slice, but callers may not have a contiguous sequence of bytes as a
+/// haystack in all cases. This type provides a lower level of control such
+/// that callers can provide their own anchored configuration and look-behind
+/// byte explicitly.
+///
+/// # Example
+///
+/// This shows basic usage that permits running a search with a DFA without
+/// using the `Input` abstraction.
+///
+/// ```
+/// use regex_automata::{
+///     dfa::{Automaton, dense},
+///     util::start,
+///     Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// let config = start::Config::new().anchored(Anchored::Yes);
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter() {
+///     state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// assert!(dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This example shows how to correctly run a search that doesn't begin at
+/// the start of a haystack. Notice how we set the look-behind byte, and as
+/// a result, the `\b` assertion does not match.
+///
+/// ```
+/// use regex_automata::{
+///     dfa::{Automaton, dense},
+///     util::start,
+///     Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// let config = start::Config::new()
+///     .anchored(Anchored::Yes)
+///     .look_behind(Some(b'q'));
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter().skip(1) {
+///     state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// // No match!
+/// assert!(!dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// If we had instead not set a look-behind byte, then the DFA would assume
+/// that it was starting at the beginning of the haystack, and thus `\b` should
+/// match.
This in turn would result in erroneously reporting a match:
+///
+/// ```
+/// use regex_automata::{
+///     dfa::{Automaton, dense},
+///     util::start,
+///     Anchored,
+/// };
+///
+/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
+/// let haystack = "quartz";
+///
+/// // Whoops, forgot the look-behind byte...
+/// let config = start::Config::new().anchored(Anchored::Yes);
+/// let mut state = dfa.start_state(&config)?;
+/// for &b in haystack.as_bytes().iter().skip(1) {
+///     state = dfa.next_state(state, b);
+/// }
+/// state = dfa.next_eoi_state(state);
+/// // And now we get a match unexpectedly.
+/// assert!(dfa.is_match_state(state));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Config {
+    look_behind: Option<u8>,
+    anchored: Anchored,
+}
+
+impl Config {
+    /// Create a new default start configuration.
+    ///
+    /// The default is an unanchored search that starts at the beginning of the
+    /// haystack.
+    pub fn new() -> Config {
+        Config { anchored: Anchored::No, look_behind: None }
+    }
+
+    /// A convenience routine for building a start configuration from an
+    /// [`Input`] for a forward search.
+    ///
+    /// This automatically sets the look-behind byte to the byte immediately
+    /// preceding the start of the search. If the start of the search is at
+    /// offset `0`, then no look-behind byte is set.
+    pub fn from_input_forward(input: &Input<'_>) -> Config {
+        let look_behind = input
+            .start()
+            .checked_sub(1)
+            .and_then(|i| input.haystack().get(i).copied());
+        Config { look_behind, anchored: input.get_anchored() }
+    }
+
+    /// A convenience routine for building a start configuration from an
+    /// [`Input`] for a reverse search.
+    ///
+    /// This automatically sets the look-behind byte to the byte immediately
+    /// following the end of the search. If the end of the search is at
+    /// offset `haystack.len()`, then no look-behind byte is set.
+    pub fn from_input_reverse(input: &Input<'_>) -> Config {
+        let look_behind = input.haystack().get(input.end()).copied();
+        Config { look_behind, anchored: input.get_anchored() }
+    }
+
+    /// Set the look-behind byte at the start of a search.
+    ///
+    /// Unless the search is intended to logically start at the beginning of a
+    /// haystack, this should _always_ be set to the byte immediately preceding
+    /// the start of the search. If no look-behind byte is set, then the start
+    /// configuration will assume it is at the beginning of the haystack. For
+    /// example, the anchor `^` will match.
+    ///
+    /// The default is that no look-behind byte is set.
+    pub fn look_behind(mut self, byte: Option<u8>) -> Config {
+        self.look_behind = byte;
+        self
+    }
+
+    /// Set the anchored mode of a search.
+    ///
+    /// The default is an unanchored search.
+    pub fn anchored(mut self, mode: Anchored) -> Config {
+        self.anchored = mode;
+        self
+    }
+
+    /// Return the look-behind byte in this configuration, if one exists.
+    pub fn get_look_behind(&self) -> Option<u8> {
+        self.look_behind
+    }
+
+    /// Return the anchored mode in this configuration.
+    pub fn get_anchored(&self) -> Anchored {
+        self.anchored
+    }
+}
+
 /// A map from every possible byte value to its corresponding starting
 /// configuration.
 ///
@@ -71,30 +249,11 @@ impl StartByteMap {
         StartByteMap { map }
     }
 
-    /// Return the forward starting configuration for the given `input`.
- #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn fwd(&self, input: &Input) -> Start { - match input - .start() - .checked_sub(1) - .and_then(|i| input.haystack().get(i)) - { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - - /// Return the reverse starting configuration for the given `input`. - #[cfg_attr(feature = "perf-inline", inline(always))] - pub(crate) fn rev(&self, input: &Input) -> Start { - match input.haystack().get(input.end()) { - None => Start::Text, - Some(&byte) => self.get(byte), - } - } - + /// Return the starting configuration for the given look-behind byte. + /// + /// If no look-behind exists, callers should use `Start::Text`. #[cfg_attr(feature = "perf-inline", inline(always))] - fn get(&self, byte: u8) -> Start { + pub(crate) fn get(&self, byte: u8) -> Start { self.map[usize::from(byte)] } @@ -253,21 +412,32 @@ mod tests { #[test] fn start_fwd_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.fwd(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_rev_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); - assert_eq!(Start::Text, smap.rev(&Input::new("").range(1..0))); + let input = Input::new("").range(1..0); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + assert_eq!(Start::Text, start); } #[test] fn start_fwd() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.fwd(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_forward(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); @@ -287,8 +457,11 @@ mod tests { fn start_rev() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); - let input = &Input::new(haystack).range(start..end); - smap.rev(input) + let input = Input::new(haystack).range(start..end); + let config = Config::from_input_reverse(&input); + let start = + config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); + start }; assert_eq!(Start::Text, f("", 0, 0)); diff --git a/vendor/regex-automata/tests/dfa/suite.rs b/vendor/regex-automata/tests/dfa/suite.rs index f3445e02a..8ed6dd007 100644 --- a/vendor/regex-automata/tests/dfa/suite.rs +++ b/vendor/regex-automata/tests/dfa/suite.rs @@ -9,7 +9,6 @@ use { util::{prefilter::Prefilter, syntax}, Anchored, Input, PatternSet, }, - regex_syntax::hir, regex_test::{ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner, @@ -285,10 +284,7 @@ fn compiler( // That is, Unicode word boundaries when searching non-ASCII text. 
     if !test.haystack().is_ascii() {
         for hir in hirs.iter() {
-            let looks = hir.properties().look_set();
-            if looks.contains(hir::Look::WordUnicode)
-                || looks.contains(hir::Look::WordUnicodeNegate)
-            {
+            if hir.properties().look_set().contains_word_unicode() {
                 return Ok(CompiledRegex::skip());
             }
         }
diff --git a/vendor/regex-automata/tests/hybrid/api.rs b/vendor/regex-automata/tests/hybrid/api.rs
index e82d808e3..4b04c4f8f 100644
--- a/vendor/regex-automata/tests/hybrid/api.rs
+++ b/vendor/regex-automata/tests/hybrid/api.rs
@@ -55,7 +55,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
     let mut cache = dfa.create_cache();
 
     let haystack = "a".repeat(101).into_bytes();
-    let err = MatchError::gave_up(25);
+    let err = MatchError::gave_up(24);
     // Notice that we make the same amount of progress in each search! That's
     // because the cache is reused and already has states to handle the first
     // N bytes.
@@ -83,7 +83,7 @@ fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
     // OK, if we reset the cache, then we should be able to create more states
     // and make more progress with searching for betas.
     cache.reset(&dfa);
-    let err = MatchError::gave_up(27);
+    let err = MatchError::gave_up(26);
     assert_eq!(
         Err(err),
         dfa.try_search_fwd(&mut cache, &Input::new(&haystack))
diff --git a/vendor/regex-automata/tests/lib.rs b/vendor/regex-automata/tests/lib.rs
index 1465e51eb..67c979aa8 100644
--- a/vendor/regex-automata/tests/lib.rs
+++ b/vendor/regex-automata/tests/lib.rs
@@ -61,6 +61,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
     load!("unicode");
     load!("utf8");
     load!("word-boundary");
+    load!("word-boundary-special");
     load!("fowler/basic");
     load!("fowler/nullsubexpr");
     load!("fowler/repetition");
-- 
cgit v1.2.3
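
A note on the determinize/state.rs offset changes at the top of this patch: they all follow mechanically from `LookSet` growing from 2 to 4 bytes. The sketch below reconstructs the state header layout implied by the new offsets; the flag byte and the name of the first look set are inferences, not taken from the patch itself.

```
// Inferred layout of a determinizer state's byte representation:
//
//   byte  0     : state flags (assumed)
//   bytes 1..5  : first LookSet (assumed to be the "look_have" set),
//                 now 4 bytes instead of 2
//   bytes 5..9  : the "look_need" LookSet, written by set_look_need at [5..]
//   bytes 9..13 : u32 count of encoded pattern IDs (match states only)
//   bytes 13..  : pattern IDs at 4 bytes each, hence
//                 pattern_offset_end = 4 * encoded + 13
```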
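Since `LookSet::bits` is now a `u32`, any code that persists a set must reserve four bytes rather than two. A minimal round-trip sketch, using only the public `util::look` API that appears in the hunks above:

```
use regex_automata::util::look::{Look, LookSet};

fn main() {
    // The new half word boundary assertions occupy bits 10 through 17, so
    // the set no longer fits in a u16.
    let set = LookSet::empty()
        .insert(Look::WordStartAscii)
        .insert(Look::WordEndHalfUnicode);
    assert!(set.contains(Look::WordStartAscii));

    // write_repr/read_repr now use a native endian 32-bit integer, so the
    // buffer must be at least 4 bytes where 2 sufficed before.
    let mut buf = [0u8; 4];
    set.write_repr(&mut buf);
    assert_eq!(set.bits, LookSet::read_repr(&buf).bits);
}
```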
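The new half-boundary predicates are one-sided: each checks only the byte before or only the byte after `at`, which is why position `0` and `haystack.len()` satisfy them unconditionally. A small sketch, assuming the public `LookMatcher` methods added above:

```
use regex_automata::util::look::LookMatcher;

fn main() {
    let matcher = LookMatcher::default();
    let hay = "quartz mineral".as_bytes();

    // Word start: previous byte is not a word byte, following byte is.
    assert!(matcher.is_word_start_ascii(hay, 7));
    // Word end: previous byte is a word byte, following byte is not.
    assert!(matcher.is_word_end_ascii(hay, 6));
    // The half variants inspect one side only, so the end of the haystack
    // always satisfies the end-half assertion...
    assert!(matcher.is_word_end_half_ascii(hay, hay.len()));
    // ...and position 0 always satisfies the start-half assertion.
    assert!(matcher.is_word_start_half_ascii(hay, 0));
}
```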
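The new `start::Config` takes over what the removed `StartByteMap::fwd`/`rev` helpers did internally. A sketch of deriving a configuration from an `Input`, mirroring the `from_input_forward` hunk above:

```
use regex_automata::{util::start, Anchored, Input};

fn main() {
    let input = Input::new("quartz").range(2..).anchored(Anchored::Yes);
    let config = start::Config::from_input_forward(&input);

    // The byte immediately preceding the search start ('u' at offset 1)
    // becomes the look-behind byte.
    assert_eq!(Some(b'u'), config.get_look_behind());
    assert_eq!(Anchored::Yes, config.get_anchored());

    // A search starting at offset 0 has no look-behind byte, so assertions
    // like ^ and \b treat it as the beginning of the haystack.
    let config = start::Config::from_input_forward(&Input::new("quartz"));
    assert_eq!(None, config.get_look_behind());
}
```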
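Finally, the `#[inline]` annotations sprinkled through util/pool.rs target the guard's hot paths. A usage sketch of the pool API they decorate; the buffer type here is arbitrary:

```
use regex_automata::util::pool::{Pool, PoolGuard};

fn main() {
    // Values are created on demand and recycled on drop. Note that values
    // go back into the pool as-is; they are not cleared.
    let pool = Pool::new(|| Vec::<u8>::with_capacity(1024));

    let mut guard = pool.get();
    guard.extend_from_slice(b"scratch");
    assert_eq!(&guard[..], b"scratch");

    // Explicitly putting the guard back skips its Drop impl, which is the
    // codegen-sensitive path the patch marks #[inline].
    PoolGuard::put(guard);
}
```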