Diffstat (limited to 'vendor/regex-automata-0.2.0')
88 files changed, 41532 insertions, 0 deletions
diff --git a/vendor/regex-automata-0.2.0/.cargo-checksum.json b/vendor/regex-automata-0.2.0/.cargo-checksum.json new file mode 100644 index 000000000..63e5b1a67 --- /dev/null +++ b/vendor/regex-automata-0.2.0/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"0122506f03800693bb58426493e7faa1ec90c002e542fcbfaf5dbd086e56f2be","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","PLANS.md":"405c695de578604ab6a425709709ba8cb69db0b7fed103f44aad2e2069bef7ac","README.md":"de887d97b46825f6fde7c9b1066619eb9a729178b93492d900bc7c183337dd81","TODO":"296f208a1c13fa55c449452e5e0df7aeee7431c0bc81497a3f0c7d2b01483ddb","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/dfa/accel.rs":"cac45cfa62a3521684aee5583aa519425cc0de57d01a23f511433ad46ca426dc","src/dfa/automaton.rs":"9bd295a5a6e7ee99046703e1f8dc02c621e6ddac85344f7f37bb020b71383118","src/dfa/dense.rs":"4739d4959e415a9191d0c0dd0a07d2cc95ce6243831a806f7103bcfd509e9e2c","src/dfa/determinize.rs":"14666440637e91bf2a16a074e666b92cbdbd0b39b4ce21180be9235af47f541e","src/dfa/error.rs":"1f057e5a2f2ae87031676e5cce70c8226514de181dbcce2e491a930d28751b9e","src/dfa/minimize.rs":"a5e85fe9570307a053592eefb3bf6a72d9fdbcfb8311f5a0dd22e39085e87811","src/dfa/mod.rs":"bb02f594cae82e28f2bddea1453c35d8f38ea7f97fb5ee8cc588f628b1fcc667","src/dfa/regex.rs":"18eded661f818be36ef175acd49198140317ecb36d9171c3ebbabdf17c8fcf86","src/dfa/search.rs":"b3526fc40846c71cf687bf4a2f1f0e682b2615d7e3f62707e9e68bc79f2fe9a3","src/dfa/search_unsafe.rs":"047cd4fcdc4129c704e7269c0af2f71d6b8a64b0de01ad7174843c3fb9fbe013","src/dfa/sparse.rs":"c6c7540347e04c2be9b4e0b1b4eed9dc661707bba7386382805e492b704d113d","src/dfa/special.rs":"51d4254e3fcaa711e0739fecd8ee64392426e3bd4a6a74b37096157dc4dbf107","src/dfa/transducer.rs":"ad5766b1e781a8ec1f6113c4eaa53373c9d29260d357de0f71f7cc19a76f1f21","src/hybrid/dfa.rs":"2f6176a317c64716af2ff210c404e712e5a2eac64ad30617c5fda728e1342be9","src/hybrid/error.rs":"99c1e1a7a8d8e88724faaeee5e56383a05b284b74f33d864124d411c52c77361","src/hybrid/id.rs":"051ef2cfeb613fd20a19b42515ce5de8e812313007db6030fd1aaec13cafbabf","src/hybrid/mod.rs":"4f145341030bc6fd1392fbaf916dc9ba5cd1912562e94b758a6458da17abeef8","src/hybrid/regex.rs":"7c0ca05c9801e91af718b50a2f685d0e61fdaad0e88d8c3c23afe71c0a13bb14","src/hybrid/search.rs":"0eb9f26400c9cd949038c8a4c96b96a7879dac994a86a4cf9ed8837f3328e4d5","src/lib.rs":"06641dff57899f19ab7929404c92e21bc48835a65e3e08f366821c7b9ccfe08f","src/macros.rs":"a73da3a0725a7c0afbaf36cd64a185af12db5707fd7740bf08b188c2ae9872db","src/nfa/mod.rs":"3ec8d362fd16e3cb1742930bae77ba128e592c7f574cd186d342b98f39abd06f","src/nfa/thompson/compiler.rs":"9548c025a9fb9d551af9829cf68251084c3b24e1c5db3b365d6463b07ca02164","src/nfa/thompson/error.rs":"7c0c556cdc52635345a0efcfecce3add05cd91770dd8b9353c854d41a9f4b862","src/nfa/thompson/map.rs":"03f88cd3ee01cb46b542918d8eba7fd927a4409c0cf7080f57a19bbc9733020b","src/nfa/thompson/mod.rs":"0b5b274b29ede0a552728579396d74114bfc049c34576fb3bd9358c306ac9dd3","src/nfa/thompson/pikevm.rs":"cf97a464e3c307ffed65530ebf4d17b1d3a9961525e14a49542373b818f47ad1","src/nfa/thompson/range_trie.rs":"8576bc8a4d9fa3f66c88f15b22b3dbbf26534c17c5e621cbbec40801c8141628","src/util/alphabet.rs":"350829d2abf132486086d1f331826177748106c4d8a1c7cff839a82e04f323df","src/util/bytes.rs":"273dbd419f4d561fa1204990abb0c25fa58045b1d9dfeaa8ea40a747e08bfa5
9","src/util/determinize/mod.rs":"8539e34529589cc56e53dac6f0d29e150da9291e9b72f28f7821c12153dff1e9","src/util/determinize/state.rs":"ccff32679266cd8f4b18b4bf0beba3563167df53ca4f5dc46061fbc1222ca420","src/util/id.rs":"b6b3efabcdfdc0e56a277b903e40c684ba1182547b7e33cc4fbc1ad6ea348664","src/util/lazy.rs":"7ead513dd094d6c30c7196271afbb346b2b3601bbe33488fcd5284d9e8426027","src/util/matchtypes.rs":"24b05d62a95c271029170e73f9ff2bd16f264b6298abf01bcd4660ae2a86a6cd","src/util/mod.rs":"0e054937cc1a84f70dffa4ace1f0111d0b9a177154b423915b411185194a3c8f","src/util/prefilter.rs":"3dcc4f4a75c38fc00435b7ea88cfa9bb3481c8e5655e8325b0f0e1f2b8d1c65f","src/util/sparse_set.rs":"04aac2d8ae2299b85494df85ebafaef2891d36d3b856155cffa3b59fcc8993b4","src/util/start.rs":"2f8c28712bb97265139aefa961cef1b40bb0cbaa73cbbd1e6115ba4cc2bfe196","src/util/syntax.rs":"09f93982215c9bea3200ec2efd21b3d7ec53d5200546eb48a56040eda026db9a","tests/data/bytes.toml":"aee9df19c5cdd52ddac44490c6df6226cef33077a979d6b964ffe73aaf724bbf","tests/data/crazy.toml":"759293076a76d7efe8eb87b3207a0587c7e969637cd985ca985aa15f71dc0c57","tests/data/earliest.toml":"6ba10ea322fc8939ca0b849812b364a0d0b7594a3df1efee62fd03b7d048c399","tests/data/empty.toml":"45f314d2f9c624056665ba80ebcb4626b551a0bc4780d9c7ca160dd5caa6abaf","tests/data/expensive.toml":"d046774120b99f9516fa7893a3e51fa76182133c16e20d4582956125623775fb","tests/data/flags.toml":"b415e2c48a2520bb182a2f795e11229e56b6e2bf93f7177d64e30118b858cef8","tests/data/fowler/basic.toml":"226ea90327f02c62ed673fc747493bc2bb0f647629f08f92ce26a130f653a4fd","tests/data/fowler/dat/README":"441bb1ed49be2b02d99d3f65974313d7d274b154e53bfa3da2a3df0538b24f04","tests/data/fowler/dat/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","tests/data/fowler/dat/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","tests/data/fowler/dat/repetition-expensive.dat":"040b869c8c3a02ed0497943af1860094fd8437892811180fb9624a41eb491f23","tests/data/fowler/dat/repetition.dat":"d8fa959c2be524c90f2092f7c41ff3a7b48232b4378cb3b090625e74e76fc625","tests/data/fowler/nullsubexpr.toml":"3e975fc6ca8e14d615ed8483c7a35158d70a8cfcc7676ef15ed68ec5eef63164","tests/data/fowler/repetition-expensive.toml":"9d9203b6c3c380947afb41c717b834eb35746de4f21e124f6f15119a6460b150","tests/data/fowler/repetition-long.toml":"a598f6682e71a8689630edb35d69f43a1305090f77cfc39dff3f60e7284618e2","tests/data/fowler/repetition.toml":"ce1888a6550fce8a7986686684ef3eff762430459d50726bc4918d0e211c3847","tests/data/iter.toml":"d1995a7b65b12aa34b4226c3ca039fcf52dcaf96a6f061064da84e981e6796e0","tests/data/misc.toml":"a32697c95595b0ad28f3c12422caddf79eaba35047f32583f2df1c6b17bc0eaf","tests/data/multiline.toml":"70dabae358d0701132d55b4378d7aa78ae5aa3fabad38ff2a6a91e83b44b78bf","tests/data/no-unicode.toml":"11be343498e0e834b422ead1168204dbaac1fb32a5384e66f0b98cdb63b39057","tests/data/overlapping.toml":"8394b104f24abd62ebed5c4b8b4708db8dba7f973a6fd10f1711d340bf0e5b5c","tests/data/regression.toml":"718d151906584f521b5bb65bae8f03a516da6a0e87312b652b96d63a9a4be64c","tests/data/set.toml":"c2412cf09030ff7ef034e44c2b051e91841f0e2cd990576bb636bd1d1da18827","tests/data/unicode.toml":"af0ee5ba8ec93fbafe4647bbac97287003743db8b7eac3e2d4dfd17f02912328","tests/data/word-boundary.toml":"20cdd14cd0cab146e0fc541dfdf913e764340997db8ab4e2d80f94dd2f9b309d","tests/dfa/api.rs":"9de253770e6bc9b2ca32f1533655740677f245fd61e9188358acb51c6655f98e","tests/dfa/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","tests/dfa/suite.rs":"2d300
7c970a05e2ed7babd120d9a5a4e01b034780fc05b9d905e857a8255ab08","tests/hybrid/api.rs":"c954cdcbbc04ef939ae38d32aae3dee1847c6ea2a36ec6e2a4bedb19aaa861e4","tests/hybrid/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","tests/hybrid/suite.rs":"1fd79a8699eb418a28897269daa3e86f7fc792ffa4fe9318c57aabfd10176f38","tests/nfa/mod.rs":"49055c358e38d97e42acb1602c671f97dddf24cafe089490f0e79ed208d74d9b","tests/nfa/thompson/mod.rs":"ab5f818ad62de599a2ddcedfd1774bf51e3245060ab8e3864bb07f146fe81a5a","tests/nfa/thompson/pikevm/api.rs":"af39a4787bb089060ee6b87e5ab1979c1863731ebbd9d1b0ba1ac6e93f6c0633","tests/nfa/thompson/pikevm/mod.rs":"dfa9fca1b57cdb4c5cba52686922c0c788c0be43c83af2b50a0d244d8012b031","tests/nfa/thompson/pikevm/suite.rs":"9d56601bb80a67c935f1f9aa4c4d130e1766e827bc34a62a48fb20297d8af2db","tests/regression.rs":"2d72466e872be88941a59582216823eb95bda461a5b2237b438a1fbfdcf813ac","tests/tests.rs":"7cf459df359f75fad2a44f7929521bcbc6fc78da6576af4306aec5386d35ffe3","tests/util.rs":"97573ea40567a62b54babe14a91b689f1d8ff663e2cb5e77103c7dede443e977"},"package":"e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782"}
\ No newline at end of file diff --git a/vendor/regex-automata-0.2.0/COPYING b/vendor/regex-automata-0.2.0/COPYING new file mode 100644 index 000000000..bb9c20a09 --- /dev/null +++ b/vendor/regex-automata-0.2.0/COPYING @@ -0,0 +1,3 @@ +This project is dual-licensed under the Unlicense and MIT licenses. + +You may use this code under the terms of either license. diff --git a/vendor/regex-automata-0.2.0/Cargo.toml b/vendor/regex-automata-0.2.0/Cargo.toml new file mode 100644 index 000000000..153f11fb3 --- /dev/null +++ b/vendor/regex-automata-0.2.0/Cargo.toml @@ -0,0 +1,88 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2018" +name = "regex-automata" +version = "0.2.0" +authors = ["Andrew Gallant <jamslam@gmail.com>"] +exclude = [ + "/.github", + "/scripts/*", + "/regex-cli", + "/regex-test", +] +autoexamples = false +autotests = false +description = "Automata construction and matching using regular expressions." +homepage = "https://github.com/BurntSushi/regex-automata" +documentation = "https://docs.rs/regex-automata" +readme = "README.md" +keywords = [ + "regex", + "dfa", + "automata", + "automaton", + "nfa", +] +categories = ["text-processing"] +license = "Unlicense/MIT" +repository = "https://github.com/BurntSushi/regex-automata" +resolver = "2" + +[profile.bench] +debug = true + +[profile.dev] +opt-level = 3 +debug = true + +[profile.release] +debug = true + +[profile.test] +opt-level = 3 +debug = true + +[lib] +bench = false + +[[test]] +name = "integration" +path = "tests/tests.rs" + +[dependencies.fst] +version = "0.4.5" +optional = true + +[dependencies.log] +version = "0.4.14" +optional = true + +[dependencies.memchr] +version = "2.4.0" +default-features = false + +[dependencies.regex-syntax] +version = "0.6.24" +optional = true + +[features] +alloc = ["syntax"] +default = [ + "std", + "alloc", + "syntax", +] +logging = ["log"] +std = [] +syntax = ["regex-syntax"] +transducer = ["fst"] diff --git a/vendor/regex-automata-0.2.0/LICENSE-MIT b/vendor/regex-automata-0.2.0/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/vendor/regex-automata-0.2.0/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/regex-automata-0.2.0/PLANS.md b/vendor/regex-automata-0.2.0/PLANS.md new file mode 100644 index 000000000..2fa9392ef --- /dev/null +++ b/vendor/regex-automata-0.2.0/PLANS.md @@ -0,0 +1,165 @@ +pattern_limit should not be defined inside nfa::thompson, but rather at the +top-level. + +----- + +Main problem right now is exemplified by the set60 and set70 failing tests. In +particular, when finding the starting position while matching multiple regexes +simultaneously, the reverse search is messed up. The reverse search doesn't +depend on which regex matched in the forward direction, which means it won't +always find the correct starting location. Unfortunately, the only way to +fix this, as far as I can tell, is to add a group of start states for every +regex in the DFA. Then once we do the reverse search, we need to choose the +correct start state based on which regex matched in the forward direction. + +This is a nasty change. + +So it looks like this only applies when doing an overlapping search in reverse +to find the start of a match. That means we should make this configurable +but enable it by default for the reverse automata. It should be configurable +so that folks can construct a regex that doesn't have the ability to do +overlapping searches correctly. If an overlapping search is attempted with +a reverse automaton that lacks starting states for each pattern, then the +implementation should panic. + +BUT! It is also convenient to provide this option in general for folks that +want a DFA that can match any pattern while also being able to match a specific +pattern. + +Straw man: + +* Update dense::Config to have a `starts_for_each_pattern` option. It should + be disabled by default. +* In `RegexBuilder::build_many_with_size` tweak the reverse DFA configuration + to have the aforementioned option enabled. +* It would be interesting to add new APIs to `Regex` that support matching + specific patterns, but I think this is a complication. If we did want to do + this, then we should just add it to the `_at` variants and leave the rest of + the API untouched. +* Add a `pattern_id: Option<PatternID>` parameter to each of the five + `*_at` methods on the `dfa::Automaton` trait. A value of `None` retains the + existing behavior. A `Some` value means that the starting state for that + specific pattern must be chosen, which in turn implies an anchored search. + (This means `starts_for_each_pattern` has utility for single-pattern DFAs + since it makes it possible to build a DFA that can do both unanchored and + anchored searches.) +* Thread this new parameter down into the various functions in `dfa::search` + all the way down into `init_fwd` and `init_rev`. These functions will then + pass it to `dfa.start_state_{forward,reverse}`. +* This is where things get gruesome since we now need to completely re-work how + start states are represented in dense and sparse DFAs _and_ it needs to be + configurable. It looks like the `Start` type from `dfa::automaton` can + basically remain unchanged, since it still represents one of the four + possible starting states that will need to be applied for every pattern. +* For `dfa::dense`, change `StartList` to `StartTable`. 
Currently, its only + header is the state ID count, which is always 4. We'll want to change this + to the stride and add a new header value that encodes the number of patterns. + When the number of patterns is zero, then existing behavior is preserved and + represents the case where `starts_for_each_pattern` is disabled (or in the + case of an empty DFA). When non-zero, a table of starting state IDs is + encoded with each row corresponding to the 4 starting states for each + pattern. Before this table (even if it's empty), the 4 starting states for + the entire DFA are encoded. +* For `dfa::sparse`, do the same as above. They are essentially the same right + now anyway, with the only difference being that sparse DFAs use `&[u8]` + instead of `&[S]` (because sparse DFAs don't have any alignment + requirements). +* Modify `DFA::empty` to accept a `starts_for_each_pattern` bool that, when + true, creates a start table with the header, the start states for the entire + DFA and a row of start states for each pattern. When false, no rows are + added. +* Expose whether there are starting states for each pattern via a predicate + on the DFA. +* Modify the determinizer's `add_starts` method to basically do what it does, + but also do it for each pattern when the DFA is configured for it. It should + continue to reuse states as appropriate or not generate new states if they + aren't needed. This will want to use the `NFA::start_pattern` method, which + provides the starting NFA state ID for the given pattern. +* Fix the dense->sparse conversion. At this point, this piece should be fairly + straightforward since the sparse representation of starting states is + basically identical to the dense representation. + +At this point, I think the bug should resolve itself. + +^^^^ DONE! IT WORKS! + +----- + + +Add top-level SyntaxConfig (or some such) that has all of the regex-syntax +options forwarded, but with automata oriented docs. Then use this for all of +the engines instead of having to repeat every option for every builder. + +----- + +These produce different results. PCRE2 looks correct. Basically, we should be +using the context around the `at` position correctly, which we aren't doing +right now. Seems tricky to get right, particularly when confirming the match +with a reverse DFA. + +Maybe our 'at' functions need to take a full range... Sigh. This is indeed what +RE2 does. GAH. + +fn main() { + let re = regex::Regex::new(r"(?-u)\b\sbar").unwrap(); + let s = "foo bar baz"; + println!("{:?}", re.find_at(s, 3).map(|m| m.as_str())); + + let re = pcre2::bytes::Regex::new(r"\b\sbar").unwrap(); + let s = "foo bar baz"; + println!("{:?}", re.find_at(s.as_bytes(), 3).unwrap()); +} + +^^^^ This is fixed now, but we still need to find a way to add test coverage +for "context" searches. It'd be nice to do this automatically, but we'll +probably just add a new 'context = [start, end]' option. + +----- + + +* Create regex-test crate, based on glob-test. Try to anticipate the needs for + the full regex test suite. + * See if we can clean up tests. + * Provide a way to mark a test as expensive. + * Provide a way to test is_match_at and find_at. + * Test shortest_match_at too? Huge pain. Add tests for it. + * Port ALL tests from the regex crate. Will probably need a way to mark a + test as skipped. + * Document tests better. +* Find a way to remove byteorder dependency. +* Reorganize crate API: + * Have errors contain `Box<Error+Send+Sync>` instead of `String`. + * Make errors non-exhaustive. 
+ * Audit `StateID` trait for safety. + * Brainstorm hard about `DFA` trait and the fact that DenseDFA and SparseDFA + have inefficient implementations of some methods. Maybe use multiple + traits? Answer: get rid of premultiply/classes knobs and just enable + them by default. Should remove a huge amount of code. + * Check whether `unsafe` is really needed to eliminate bounds checks. Use + micro-benchmarks and bigger CLI workloads using `regex-automata-debug`. + * Re-write module docs for `dfa` as they are no longer top-level. Keep most. + * Retain any pertinent top-level crate docs, but don't rewrite yet. + * Clean up builders if we can. e.g., Determinizer, minimizer, it's all a mess + right now. + * Clean up and add 'always_match' and 'never_match' constructors for every + regex engine. + * See about supporting ^, $, \A, \z, \b and \B in DFAs. Do the non-Unicode + version of \b unfortunately. Carefully scrutinize how the regex crate's + lazy DFA does it and try to make it comprehensible. Done! Except for the + part about making it comprehensible. +* Rethink prefilters? +* Add `regex-automata-generate` CLI tool. This should just be a copy of + the `ucd-generate dfa` and `ucd-generate regex` commands. + +Then build new public `nfa` sub-module. + * For Unicode \b, generate \w DFA (forwards and reverse) and embed it into + source for fast checking. That way, we don't need to ever do explicit UTF-8 + decoding anywhere. Yay. + +Then `lazy` sub-module. + +Then `onepass`. + +Then `jit`. + +... and beyond? CRAZY. But it can be done! Build STRONG base layers. diff --git a/vendor/regex-automata-0.2.0/README.md b/vendor/regex-automata-0.2.0/README.md new file mode 100644 index 000000000..23e0bffe0 --- /dev/null +++ b/vendor/regex-automata-0.2.0/README.md @@ -0,0 +1,222 @@ +regex-automata +============== +A low level regular expression library that uses deterministic finite automata. +It supports a rich syntax with Unicode support, has extensive options for +configuring the best space vs time trade off for your use case and provides +support for cheap deserialization of automata for use in `no_std` environments. + +Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). + + +### Documentation + +https://docs.rs/regex-automata + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +regex-automata = "0.1" +``` + +**WARNING**: The `master` branch currently contains code for the `0.2` release, +but this README still targets the `0.1` release. That is, it is recommended to +stick with the `0.1` release. The `0.2` release was made prematurely in order +to unblock some folks. + + +### Example: basic regex searching + +This example shows how to compile a regex using the default configuration +and then use it to find matches in a byte string: + +```rust +use regex_automata::Regex; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); +assert_eq!(matches, vec![(0, 10), (11, 21)]); +``` + +For more examples and information about the various knobs that can be turned, +please see the [docs](https://docs.rs/regex-automata/0.1). + + +### Support for `no_std` + +This crate comes with a `std` feature that is enabled by default. 
When the +`std` feature is enabled, the API of this crate will include the facilities +necessary for compiling, serializing, deserializing and searching with regular +expressions. When the `std` feature is disabled, the API of this crate will +shrink such that it only includes the facilities necessary for deserializing +and searching with regular expressions. + +The intended workflow for `no_std` environments is thus as follows: + +* Write a program with the `std` feature that compiles and serializes a + regular expression. Serialization should only happen after first converting + the DFAs to use a fixed size state identifier instead of the default `usize`. + You may also need to serialize both little and big endian versions of each + DFA. (So that's 4 DFAs in total for each regex.) +* In your `no_std` environment, follow the examples above for deserializing + your previously serialized DFAs into regexes. You can then search with them + as you would any regex. + +Deserialization can happen anywhere. For example, with bytes embedded into a +binary or with a file memory mapped at runtime. + +Note that the +[`ucd-generate`](https://github.com/BurntSushi/ucd-generate) +tool will do the first step for you with its `dfa` or `regex` sub-commands. + + +### Cargo features + +* `std` - **Enabled** by default. This enables the ability to compile finite + automata. This requires the `regex-syntax` dependency. Without this feature + enabled, finite automata can only be used for searching (using the approach + described above). +* `transducer` - **Disabled** by default. This provides implementations of the + `Automaton` trait found in the `fst` crate. This permits using finite + automata generated by this crate to search finite state transducers. This + requires the `fst` dependency. + + +### Differences with the regex crate + +The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a +general purpose regular expression engine. It aims to automatically balance low +compile times, fast search times and low memory usage, while also providing +a convenient API for users. In contrast, this crate provides a lower level +regular expression interface that is a bit less convenient while providing more +explicit control over memory usage and search times. + +Here are some specific negative differences: + +* **Compilation can take an exponential amount of time and space** in the size + of the regex pattern. While most patterns do not exhibit worst case + exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will + build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should + not be compiled with this library. (In the future, the API may expose an + option to return an error if the DFA gets too big.) +* This crate does not support sub-match extraction, which can be achieved with + the regex crate's "captures" API. This may be added in the future, but is + unlikely. +* While the regex crate doesn't necessarily sport fast compilation times, the + regexes in this crate are almost universally slow to compile, especially when + they contain large Unicode character classes. For example, on my system, + compiling `\w{3}` with byte classes enabled takes just over 1 second and + almost 5MB of memory! (Compiling a sparse regex takes about the same time + but only uses about 500KB of memory.) Conversely, compiling the same regex + without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and + less than 5KB of memory. 
For this reason, you should only use Unicode + character classes if you absolutely need them! +* This crate does not support regex sets. +* This crate does not support zero-width assertions such as `^`, `$`, `\b` or + `\B`. +* As a lower level crate, this library does not do literal optimizations. In + exchange, you get predictable performance regardless of input. The + philosophy here is that literal optimizations should be applied at a higher + level, although there is no easy support for this in the ecosystem yet. +* There is no `&str` API like in the regex crate. In this crate, all APIs + operate on `&[u8]`. By default, match indices are guaranteed to fall on + UTF-8 boundaries, unless `RegexBuilder::allow_invalid_utf8` is enabled. + +With some of the downsides out of the way, here are some positive differences: + +* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply + deserialized. Deserialization always takes constant time since searching can + be performed directly on the raw serialized bytes of a DFA. +* This crate was specifically designed so that the searching phase of a DFA has + minimal runtime requirements, and can therefore be used in `no_std` + environments. While `no_std` environments cannot compile regexes, they can + deserialize pre-compiled regexes. +* Since this crate builds DFAs ahead of time, it will generally outperform + the `regex` crate on equivalent tasks. The performance difference is likely + not large. However, because of a complex set of optimizations in the regex + crate (like literal optimizations), an accurate performance comparison may be + difficult to do. +* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search + performance a small amount, but uses much less storage space. Potentially + even less than what the regex crate uses. +* This crate exposes DFAs directly, such as `DenseDFA` and `SparseDFA`, + which enables one to do less work in some cases. For example, if you only + need the end of a match and not the start of a match, then you can use a DFA + directly without building a `Regex`, which always requires a second DFA to + find the start of a match. +* Aside from choosing between dense and sparse DFAs, there are several options + for configuring the space usage vs search time trade off. These range from + choosing a smaller state identifier representation to premultiplying state + identifiers and splitting a DFA's alphabet into equivalence classes. Finally, + DFA minimization is also provided, but can increase compilation times + dramatically. + + +### Future work + +* Look into being smarter about generating NFA states for large Unicode + character classes. These can create a lot of additional work for both the + determinizer and the minimizer, and I suspect this is the key thing we'll + want to improve if we want to make DFA compile times faster. I *believe* + it's possible to potentially build minimal or nearly minimal NFAs for the + special case of Unicode character classes by leveraging Daciuk's algorithms + for building minimal automata in linear time for sets of strings. See + https://blog.burntsushi.net/transducers/#construction for more details. The + key adaptation I think we need to make is to modify the algorithm to operate + on byte ranges instead of enumerating every codepoint in the set. Otherwise, + it might not be worth doing. +* Add support for regex sets. It should be possible to do this by "simply" + introducing more match states. 
I think we can also report the positions at + each match, similar to how Aho-Corasick works. I think the long pole in the + tent here is probably the API design work and arranging it so that we don't + introduce extra overhead into the non-regex-set case without duplicating a + lot of code. It seems doable. +* Stretch goal: support capturing groups by implementing "tagged" DFAs + (transducers). Laurikari's paper is the usual reference here, but Trofimovich + has a much more thorough treatment here: + https://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf + I've only read the paper once. I suspect it will require at least a few more + read throughs before I understand it. + See also: https://re2c.org +* Possibly less ambitious goal: can we select a portion of Trofimovich's work + to make small fixed length look-around work? It would be really nice to + support ^, $ and \b, especially the Unicode variant of \b and CRLF aware $. +* Experiment with code generating Rust code. There is an early experiment in + src/codegen.rs that is thoroughly bit-rotted. At the time, I was + experimenting with whether or not codegen would significantly decrease the + size of a DFA, since if you squint hard enough, it's kind of like a sparse + representation. However, it didn't shrink as much as I thought it would, so + I gave up. The other problem is that Rust doesn't support gotos, so I don't + even know whether the "match on each state" in a loop thing will be fast + enough. Either way, it's probably a good option to have. For one thing, it + would be endian independent whereas the serialization format of the DFAs in + this crate is endian dependent (so you need two versions of every DFA, but + you only need to compile one of them for any given arch). +* Experiment with unrolling the match loops and fill out the benchmarks. +* Add some kind of streaming API. I believe users of the library can already + implement something for this outside of the crate, but it would be good to + provide an official API. The key thing here is figuring out the API. I + suspect we might want to support several variants. +* Make a decision on whether or not there is room for literal optimizations + in this crate. My original intent was to not let this crate sink down into + that very very very deep rabbit hole. But instead, we might want to provide + some way for literal optimizations to hook into the match routines. The right + path forward here is to probably build something outside of the crate and + then see about integrating it. After all, users can implement their own + match routines just as efficiently as what the crate provides. +* A key downside of DFAs is that they can take up a lot of memory and can be + quite costly to build. Their worst case compilation time is O(2^n), where + n is the number of NFA states. A paper by Yang and Prasanna (2011) actually + seems to provide a way to characterize state blow up such that it is + detectable. + If we could know whether a regex will exhibit state explosion or not, then + we could make an intelligent decision about whether to ahead-of-time compile + a DFA. 
+ See: https://www.researchgate.net/profile/Xu-Shutu/publication/229032602_Characterization_of_a_global_germplasm_collection_and_its_potential_utilization_for_analysis_of_complex_quantitative_traits_in_maize/links/02bfe50f914d04c837000000/Characterization-of-a-global-germplasm-collection-and-its-potential-utilization-for-analysis-of-complex-quantitative-traits-in-maize.pdf diff --git a/vendor/regex-automata-0.2.0/TODO b/vendor/regex-automata-0.2.0/TODO new file mode 100644 index 000000000..68f018799 --- /dev/null +++ b/vendor/regex-automata-0.2.0/TODO @@ -0,0 +1,13 @@ +* Consider refactoring the NFA representation such that it can be instantly + loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this + could negatively impact using the NFA with deserialization costs. Before + doing this, we should write PikeVM and backtracking implementations so that + they can be benchmarked. +* Add captures to NFA. +* Once we're happy, re-organize the public API such that NFAs are exported + and usable on their own. + +* Investigate why NFA shrinking seems to produce bigger DFAs after + determinization, even though it makes determinization substantially + faster. This might be because of its use of sparse NFA states, which have + a lower constant overhead associated with them. diff --git a/vendor/regex-automata-0.2.0/UNLICENSE b/vendor/regex-automata-0.2.0/UNLICENSE new file mode 100644 index 000000000..68a49daad --- /dev/null +++ b/vendor/regex-automata-0.2.0/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/vendor/regex-automata-0.2.0/rustfmt.toml b/vendor/regex-automata-0.2.0/rustfmt.toml new file mode 100644 index 000000000..aa37a218b --- /dev/null +++ b/vendor/regex-automata-0.2.0/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 79 +use_small_heuristics = "max" diff --git a/vendor/regex-automata-0.2.0/src/dfa/accel.rs b/vendor/regex-automata-0.2.0/src/dfa/accel.rs new file mode 100644 index 000000000..dbfeb7932 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/accel.rs @@ -0,0 +1,507 @@ +// This module defines some core types for dealing with accelerated DFA states. +// Briefly, a DFA state can be "accelerated" if all of its transitions except +// for a few loop back to itself. 
This directly implies that the only way out +// of such a state is if a byte corresponding to one of those non-loopback +// transitions is found. Such states are often found in simple repetitions in +// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its +// DFA with regex-cli: +// +// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC +// dense::DFA( +// D 000000: +// Q 000001: +// *000002: +// A 000003: \x00-` => 3, a => 5, b-\xFF => 3 +// >000004: \x00-` => 3, a => 4, b-\xFF => 3 +// 000005: \x00-\xFF => 2, EOI => 2 +// ) +// +// In particular, state 3 is accelerated (shown via the 'A' indicator) since +// the only way to leave that state once entered is to see an 'a' byte. If +// there is a long run of non-'a' bytes, then using something like 'memchr' +// to find the next 'a' byte can be significantly faster than just using the +// standard byte-at-a-time state machine. +// +// Unfortunately, this optimization rarely applies when Unicode is enabled. +// For example, patterns like '[^a]' don't actually match any byte that isn't +// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't +// 'a'. This makes the state machine much more complex---far beyond a single +// state---and removes the ability to easily accelerate it. (Because if the +// machine sees a non-UTF-8 sequence, then the machine won't match through it.) +// +// In practice, we only consider accelerating states that have 3 or fewer +// non-loop transitions, partly because you get diminishing returns beyond +// that point, but also because that's what the memchr crate supports. The +// structures below hard-code this assumption and provide (de)serialization +// APIs for use inside a DFA. +// +// And finally, note that there is some trickery involved in making it very +// fast to not only check whether a state is accelerated at search time, but +// also to access the bytes to search for to implement the acceleration itself. +// dfa/special.rs provides more detail, but the short story is that all +// accelerated states appear contiguously in a DFA. This means we can represent +// the ID space of all accelerated DFA states with a single range. So given +// a state ID, we can determine whether it's accelerated via +// +// min_accel_id <= id <= max_accel_id +// +// And find its corresponding accelerator with: +// +// accels.get((id - min_accel_id) / dfa_stride) + +use core::convert::{TryFrom, TryInto}; + +#[cfg(feature = "alloc")] +use alloc::{vec, vec::Vec}; + +use crate::util::bytes::{self, DeserializeError, Endian, SerializeError}; + +/// The base type used to represent a collection of accelerators. +/// +/// While an `Accel` is represented as a fixed size array of bytes, a +/// *collection* of `Accel`s (called `Accels`) is represented internally as a +/// slice of u32. While it's a bit unnatural to do this and costs us a bit of +/// fairly low-risk `unsafe` code, it lets us remove the need for a second type +/// parameter in the definition of dense::DFA. (Which really wants everything +/// to be a slice of u32.) +type AccelTy = u32; + +/// The size of the unit of representation for accelerators. +/// +/// ACCEL_CAP *must* be a multiple of this size. +const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>(); + +/// The maximum length in bytes that a single Accel can be. This is distinct +/// from the capacity of an accelerator in that the length represents only the +/// bytes that should be read. +const ACCEL_LEN: usize = 4; + +/// The capacity of each accelerator, in bytes. 
We set this to 8 since it's a +/// multiple of 4 (our ID size) and because it gives us a little wiggle room +/// if we want to support more accel bytes in the future without a breaking +/// change. +/// +/// This MUST be a multiple of ACCEL_TY_SIZE. +const ACCEL_CAP: usize = 8; + +/// Search for between 1 and 3 needle bytes in the given haystack, starting the +/// search at the given position. If `needles` has a length other than 1-3, +/// then this panics. +#[inline(always)] +pub(crate) fn find_fwd( + needles: &[u8], + haystack: &[u8], + at: usize, +) -> Option<usize> { + let bs = needles; + let i = match needles.len() { + 1 => memchr::memchr(bs[0], &haystack[at..])?, + 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?, + 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?, + 0 => panic!("cannot find with empty needles"), + n => panic!("invalid needles length: {}", n), + }; + Some(at + i) +} + +/// Search for between 1 and 3 needle bytes in the given haystack in reverse, +/// starting the search at the given position. If `needles` has a length other +/// than 1-3, then this panics. +#[inline(always)] +pub(crate) fn find_rev( + needles: &[u8], + haystack: &[u8], + at: usize, +) -> Option<usize> { + let bs = needles; + match needles.len() { + 1 => memchr::memrchr(bs[0], &haystack[..at]), + 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]), + 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]), + 0 => panic!("cannot find with empty needles"), + n => panic!("invalid needles length: {}", n), + } +} + +/// Represents the accelerators for all accelerated states in a dense DFA. +/// +/// The `A` type parameter represents the type of the underlying bytes. +/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`. +#[derive(Clone)] +pub(crate) struct Accels<A> { + /// A length prefixed slice of contiguous accelerators. See the top comment + /// in this module for more details on how we can jump from a DFA's state + /// ID to an accelerator in this list. + /// + /// The first 4 bytes always correspond to the number of accelerators + /// that follow. + accels: A, +} + +#[cfg(feature = "alloc")] +impl Accels<Vec<AccelTy>> { + /// Create an empty sequence of accelerators for a DFA. + pub fn empty() -> Accels<Vec<AccelTy>> { + Accels { accels: vec![0] } + } + + /// Add an accelerator to this sequence. + /// + /// This adds the accelerator to the end of the sequence and therefore + /// should be done in correspondence with its state in the DFA. + /// + /// This panics if this results in more accelerators than AccelTy::MAX. + pub fn add(&mut self, accel: Accel) { + self.accels.extend_from_slice(&accel.as_accel_tys()); + let len = self.len(); + self.set_len(len + 1); + } + + /// Set the number of accelerators in this sequence, which is encoded in + /// the first 4 bytes of the underlying bytes. + fn set_len(&mut self, new_len: usize) { + // The only way an accelerator gets added is if a state exists for + // it, and if a state exists, then its index is guaranteed to be + // representable by an AccelTy by virtue of the guarantees provided by + // StateID. + let new_len = AccelTy::try_from(new_len).unwrap(); + self.accels[0] = new_len; + } +} + +impl<'a> Accels<&'a [AccelTy]> { + /// Deserialize a sequence of accelerators from the given bytes. If there + /// was a problem deserializing, then an error is returned. + /// + /// This is guaranteed to run in constant time. This does not guarantee + /// that every accelerator in the returned collection is valid. 
Thus, + /// accessing one may panic, or unsafe code that relies on accelerators + /// being correct may result in UB. + /// + /// Callers may check the validity of every accelerator with the `validate` + /// method. + pub unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + let (count, _) = + bytes::try_read_u32_as_usize(slice, "accelerators count")?; + // The accelerator count is part of the accel_tys slice that + // we deserialize. This is perhaps a bit idiosyncratic. It would + // probably be better to split out the count into a real field. + + let accel_tys_count = bytes::add( + bytes::mul(count, 2, "total number of accelerator accel_tys")?, + 1, + "total number of accel_tys", + )?; + let accel_tys_len = bytes::mul( + ACCEL_TY_SIZE, + accel_tys_count, + "total number of bytes in accelerators", + )?; + bytes::check_slice_len(slice, accel_tys_len, "accelerators")?; + bytes::check_alignment::<AccelTy>(slice)?; + let accel_tys = &slice[..accel_tys_len]; + slice = &slice[accel_tys_len..]; + // SAFETY: We've checked the length and alignment above, and since + // slice is just bytes, we can safely cast to a slice of &[AccelTy]. + #[allow(unused_unsafe)] + let accels = unsafe { + core::slice::from_raw_parts( + accel_tys.as_ptr() as *const AccelTy, + accel_tys_count, + ) + }; + Ok((Accels { accels }, slice.as_ptr() as usize - slice_start)) + } +} + +impl<A: AsRef<[AccelTy]>> Accels<A> { + /// Return an owned version of the accelerators. + #[cfg(feature = "alloc")] + pub fn to_owned(&self) -> Accels<Vec<AccelTy>> { + Accels { accels: self.accels.as_ref().to_vec() } + } + + /// Return a borrowed version of the accelerators. + pub fn as_ref(&self) -> Accels<&[AccelTy]> { + Accels { accels: self.accels.as_ref() } + } + + /// Return the bytes representing the serialization of the accelerators. + pub fn as_bytes(&self) -> &[u8] { + let accels = self.accels.as_ref(); + // SAFETY: This is safe because accels is just a slice of AccelTy, + // and u8 always has a smaller alignment. + unsafe { + core::slice::from_raw_parts( + accels.as_ptr() as *const u8, + accels.len() * ACCEL_TY_SIZE, + ) + } + } + + /// Returns the memory usage, in bytes, of these accelerators. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent all of the accelerators. + /// + /// This does **not** include the stack size used by this value. + pub fn memory_usage(&self) -> usize { + self.as_bytes().len() + } + + /// Return the bytes to search for corresponding to the accelerator in this + /// sequence at index `i`. If no such accelerator exists, then this panics. + /// + /// The significance of the index is that it should be in correspondence + /// with the index of the corresponding DFA. That is, accelerated DFA + /// states are stored contiguously in the DFA and have an ordering implied + /// by their respective state IDs. The state's index in that sequence + /// corresponds to the index of its corresponding accelerator. + #[inline(always)] + pub fn needles(&self, i: usize) -> &[u8] { + if i >= self.len() { + panic!("invalid accelerator index {}", i); + } + let bytes = self.as_bytes(); + let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; + let len = bytes[offset] as usize; + &bytes[offset + 1..offset + 1 + len] + } + + /// Return the total number of accelerators in this sequence. 
+ pub fn len(&self) -> usize { + // This should never panic since deserialization checks that the + // length can fit into a usize. + usize::try_from(self.accels.as_ref()[0]).unwrap() + } + + /// Return the accelerator in this sequence at index `i`. If no such + /// accelerator exists, then this returns None. + /// + /// See the docs for `needles` on the significance of the index. + fn get(&self, i: usize) -> Option<Accel> { + if i >= self.len() { + return None; + } + let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; + let accel = Accel::from_slice(&self.as_bytes()[offset..]) + .expect("Accels must contain valid accelerators"); + Some(accel) + } + + /// Returns an iterator of accelerators in this sequence. + fn iter(&self) -> IterAccels<'_, A> { + IterAccels { accels: self, i: 0 } + } + + /// Writes these accelerators to the given byte buffer using the indicated + /// endianness. If the given buffer is too small, then an error is + /// returned. Upon success, the total number of bytes written is returned. + /// The number of bytes written is guaranteed to be a multiple of 8. + pub fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + assert_eq!( + nwrite % ACCEL_TY_SIZE, + 0, + "expected accelerator bytes written to be a multiple of {}", + ACCEL_TY_SIZE, + ); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("accelerators")); + } + + // The number of accelerators can never exceed AccelTy::MAX. + E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst); + // The actual accelerators are just raw bytes and thus their endianness + // is irrelevant. So we can copy them as bytes. + dst[ACCEL_TY_SIZE..nwrite] + .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]); + Ok(nwrite) + } + + /// Validates that every accelerator in this collection can be successfully + /// deserialized as a valid accelerator. + pub fn validate(&self) -> Result<(), DeserializeError> { + for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) { + let _ = Accel::from_slice(chunk)?; + } + Ok(()) + } + + /// Returns the total number of bytes written by `write_to`. + pub fn write_to_len(&self) -> usize { + self.as_bytes().len() + } +} + +impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Accels(")?; + let mut list = f.debug_list(); + for a in self.iter() { + list.entry(&a); + } + list.finish()?; + write!(f, ")") + } +} + +#[derive(Debug)] +struct IterAccels<'a, A: AsRef<[AccelTy]>> { + accels: &'a Accels<A>, + i: usize, +} + +impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> { + type Item = Accel; + + fn next(&mut self) -> Option<Accel> { + let accel = self.accels.get(self.i)?; + self.i += 1; + Some(accel) + } +} + +/// Accel represents a structure for determining how to "accelerate" a DFA +/// state. +/// +/// Namely, it contains zero or more bytes that must be seen in order for the +/// DFA to leave the state it is associated with. In practice, the actual range +/// is 1 to 3 bytes. +/// +/// The purpose of acceleration is to identify states whose vast majority +/// of transitions are just loops back to the same state. For example, +/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state +/// (corresponding to `[^a]+`) where all transitions *except* for `a` and +/// `b` loop back to itself. 
Thus, this state can be "accelerated" by simply +/// looking for the next occurrence of either `a` or `b` instead of explicitly +/// following transitions. (In this case, `b` transitions to the next state +/// whereas `a` would transition to the dead state.) +#[derive(Clone)] +pub(crate) struct Accel { + /// The first byte is the length. Subsequent bytes are the accelerated + /// bytes. + /// + /// Note that we make every accelerator 8 bytes as a slightly wasteful + /// way of making sure alignment is always correct for state ID sizes of + /// 1, 2, 4 and 8. This should be okay since accelerated states aren't + /// particularly common, especially when Unicode is enabled. + bytes: [u8; ACCEL_CAP], +} + +impl Accel { + /// Returns an empty accel, where no bytes are accelerated. + #[cfg(feature = "alloc")] + pub fn new() -> Accel { + Accel { bytes: [0; ACCEL_CAP] } + } + + /// Returns a verified accelerator derived from the beginning of the given + /// slice. + /// + /// If the slice is not long enough or contains invalid bytes for an + /// accelerator, then this returns an error. + pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> { + slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())]; + let bytes = slice + .try_into() + .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?; + Accel::from_bytes(bytes) + } + + /// Returns a verified accelerator derived from raw bytes. + /// + /// If the given bytes are invalid, then this returns an error. + fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> { + if bytes[0] as usize >= ACCEL_LEN { + return Err(DeserializeError::generic( + "accelerator bytes cannot have length more than 3", + )); + } + Ok(Accel::from_bytes_unchecked(bytes)) + } + + /// Returns an accelerator derived from raw bytes. + /// + /// This does not check whether the given bytes are valid. Invalid bytes + /// cannot sacrifice memory safety, but may result in panics or silent + /// logic bugs. + fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel { + Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] } + } + + /// Attempts to add the given byte to this accelerator. If the accelerator + /// is already full then this returns false. Otherwise, returns true. + /// + /// If the given byte is already in this accelerator, then it panics. + #[cfg(feature = "alloc")] + pub fn add(&mut self, byte: u8) -> bool { + if self.len() >= 3 { + return false; + } + assert!( + !self.contains(byte), + "accelerator already contains {:?}", + crate::util::DebugByte(byte) + ); + self.bytes[self.len() + 1] = byte; + self.bytes[0] += 1; + true + } + + /// Return the number of bytes in this accelerator. + pub fn len(&self) -> usize { + self.bytes[0] as usize + } + + /// Returns true if and only if there are no bytes in this accelerator. + #[cfg(feature = "alloc")] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the slice of bytes to accelerate. + /// + /// If this accelerator is empty, then this returns an empty slice. + fn needles(&self) -> &[u8] { + &self.bytes[1..1 + self.len()] + } + + /// Returns true if and only if this accelerator will accelerate the given + /// byte. + #[cfg(feature = "alloc")] + fn contains(&self, byte: u8) -> bool { + self.needles().iter().position(|&b| b == byte).is_some() + } + + /// Returns the accelerator bytes as an array of AccelTys. + #[cfg(feature = "alloc")] + fn as_accel_tys(&self) -> [AccelTy; 2] { + assert_eq!(ACCEL_CAP, 8); + // These unwraps are OK since ACCEL_CAP is set to 8. 
+ let first = + AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap()); + let second = + AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap()); + [first, second] + } +} + +impl core::fmt::Debug for Accel { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Accel(")?; + let mut set = f.debug_set(); + for &b in self.needles() { + set.entry(&crate::util::DebugByte(b)); + } + set.finish()?; + write!(f, ")") + } +} diff --git a/vendor/regex-automata-0.2.0/src/dfa/automaton.rs b/vendor/regex-automata-0.2.0/src/dfa/automaton.rs new file mode 100644 index 000000000..08bd6722a --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/automaton.rs @@ -0,0 +1,1903 @@ +use crate::{ + dfa::search, + util::{ + id::{PatternID, StateID}, + matchtypes::{HalfMatch, MatchError}, + prefilter, + }, +}; + +/// A trait describing the interface of a deterministic finite automaton (DFA). +/// +/// The complexity of this trait probably means that it's unlikely for others +/// to implement it. The primary purpose of the trait is to provide for a way +/// of abstracting over different types of DFAs. In this crate, that means +/// dense DFAs and sparse DFAs. (Dense DFAs are fast but memory hungry, whereas +/// sparse DFAs are slower but come with a smaller memory footprint. But +/// they otherwise provide exactly equivalent expressive power.) For example, a +/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) is generic over this trait. +/// +/// Normally, a DFA's execution model is very simple. You might have a single +/// start state, zero or more final or "match" states and a function that +/// transitions from one state to the next given the next byte of input. +/// Unfortunately, the interface described by this trait is significantly +/// more complicated than this. The complexity has a number of different +/// reasons, mostly motivated by performance, functionality or space savings: +/// +/// * A DFA can search for multiple patterns simultaneously. This +/// means extra information is returned when a match occurs. Namely, +/// a match is not just an offset, but an offset plus a pattern ID. +/// [`Automaton::pattern_count`] returns the number of patterns compiled into +/// the DFA, [`Automaton::match_count`] returns the total number of patterns +/// that match in a particular state and [`Automaton::match_pattern`] permits +/// iterating over the patterns that match in a particular state. +/// * A DFA can have multiple start states, and the choice of which start +/// state to use depends on the content of the string being searched and +/// position of the search, as well as whether the search is an anchored +/// search for a specific pattern in the DFA. Moreover, computing the start +/// state also depends on whether you're doing a forward or a reverse search. +/// [`Automaton::start_state_forward`] and [`Automaton::start_state_reverse`] +/// are used to compute the start state for forward and reverse searches, +/// respectively. +/// * All matches are delayed by one byte to support things like `$` and `\b` +/// at the end of a pattern. Therefore, every use of a DFA is required to use +/// [`Automaton::next_eoi_state`] +/// at the end of the search to compute the final transition. +/// * For optimization reasons, some states are treated specially. Every +/// state is either special or not, which can be determined via the +/// [`Automaton::is_special_state`] method. If it's special, then the state +/// must be at least one of a few possible types of states. 
+/// types can overlap, for example, a match state can also be an accel state.
+/// But some types can't. If a state is a dead state, then it can never be any
+/// other type of state.) Those types are:
+/// * A dead state. A dead state means the DFA will never enter a match
+/// state. This can be queried via the [`Automaton::is_dead_state`] method.
+/// * A quit state. A quit state occurs if the DFA had to stop the search
+/// prematurely for some reason. This can be queried via the
+/// [`Automaton::is_quit_state`] method.
+/// * A match state. A match state occurs when a match is found. When a DFA
+/// enters a match state, the search may stop immediately (when looking
+/// for the earliest match), or it may continue to find the leftmost-first
+/// match. This can be queried via the [`Automaton::is_match_state`]
+/// method.
+/// * A start state. A start state is where a search begins. For every
+/// search, there is exactly one start state that is used; however, a
+/// DFA may contain many start states. When the search is in a start
+/// state, it may use a prefilter to quickly skip to candidate matches
+/// without executing the DFA on every byte. This can be queried via the
+/// [`Automaton::is_start_state`] method.
+/// * An accel state. An accel state is a state that is accelerated.
+/// That is, it is a state where _most_ of its transitions loop back to
+/// itself and only a small number of transitions lead to other states.
+/// This kind of state is said to be accelerated because a search routine
+/// can quickly look for the bytes leading out of the state instead of
+/// continuing to execute the DFA on each byte. This can be queried via the
+/// [`Automaton::is_accel_state`] method. And the bytes that lead out of
+/// the state can be queried via the [`Automaton::accelerator`] method.
+///
+/// There are a number of provided methods on this trait that implement
+/// efficient searching (for forwards and backwards) with a DFA using all of
+/// the above features of this trait. In particular, given the complexity of
+/// all these features, implementing a search routine with this trait is not
+/// straightforward. If you need to do this for specialized reasons, then
+/// it's recommended to look at the source of this crate. It is intentionally
+/// well commented to help with this. With that said, it is possible to
+/// somewhat simplify the search routine. For example, handling accelerated
+/// states is strictly optional, since it is always correct to assume that
+/// `Automaton::is_accel_state` returns false. However, one complex part of
+/// writing a search routine using this trait is handling the 1-byte delay of a
+/// match. That is not optional.
+///
+/// # Safety
+///
+/// This trait is unsafe to implement because DFA searching may rely on the
+/// correctness of the implementation for memory safety. For example, DFA
+/// searching may use explicit bounds check elision, which will in turn rely
+/// on the correctness of every function that returns a state ID.
+///
+/// When implementing this trait, one must uphold the documented correctness
+/// guarantees. Otherwise, undefined behavior may occur.
+pub unsafe trait Automaton {
+    /// Transitions from the current state to the next state, given the next
+    /// byte of input.
+    ///
+    /// Implementations must guarantee that the returned ID is always a valid
+    /// ID when `current` refers to a valid ID. Moreover, the transition
+    /// function must be defined for all possible values of `input`.
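+    ///
+    /// As a rough sketch (and not necessarily how any DFA in this crate is
+    /// actually represented), a dense, premultiplied representation can
+    /// satisfy this contract with a single table lookup:
+    ///
+    /// ```
+    /// // Hypothetical dense representation: `classes` maps every input byte
+    /// // to its equivalence class and `trans` is a flattened transition
+    /// // table. Because state IDs are premultiplied by the alphabet stride,
+    /// // a transition is one index computation away.
+    /// fn next_state(trans: &[u32], classes: &[u8; 256], current: u32, input: u8) -> u32 {
+    ///     trans[current as usize + classes[input as usize] as usize]
+    /// }
+    ///
+    /// // A single state (ID `0`) whose transitions all loop back to itself.
+    /// let (trans, classes) = (vec![0u32; 1], [0u8; 256]);
+    /// assert_eq!(next_state(&trans, &classes, 0, b'a'), 0);
+    /// ```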
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic, but it may also simply return an invalid ID. However, even
+    /// when the caller provides an invalid ID, this must never sacrifice
+    /// memory safety.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a DFA for a given haystack
+    /// by using the `next_state` method.
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, dense};
+    ///
+    /// let dfa = dense::DFA::new(r"[a-z]+r")?;
+    /// let haystack = "bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut state = dfa.start_state_forward(
+    ///     None, haystack, 0, haystack.len(),
+    /// );
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     state = dfa.next_state(state, b);
+    /// }
+    /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+    /// // special "EOI" transition at the end of the search.
+    /// state = dfa.next_eoi_state(state);
+    /// assert!(dfa.is_match_state(state));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn next_state(&self, current: StateID, input: u8) -> StateID;
+
+    /// Transitions from the current state to the next state, given the next
+    /// byte of input.
+    ///
+    /// Unlike [`Automaton::next_state`], implementations may implement this
+    /// more efficiently by assuming that the `current` state ID is valid.
+    /// Typically, this manifests by eliding bounds checks.
+    ///
+    /// # Safety
+    ///
+    /// Callers of this method must guarantee that `current` refers to a valid
+    /// state ID. If `current` is not a valid state ID for this automaton, then
+    /// calling this routine may result in undefined behavior.
+    ///
+    /// If `current` is valid, then implementations must guarantee that the ID
+    /// returned is valid for all possible values of `input`.
+    unsafe fn next_state_unchecked(
+        &self,
+        current: StateID,
+        input: u8,
+    ) -> StateID;
+
+    /// Transitions from the current state to the next state for the special
+    /// EOI symbol.
+    ///
+    /// Implementations must guarantee that the returned ID is always a valid
+    /// ID when `current` refers to a valid ID.
+    ///
+    /// This routine must be called at the end of every search in a correct
+    /// implementation of search. Namely, DFAs in this crate delay matches
+    /// by one byte in order to support look-around operators. Thus, after
+    /// reaching the end of a haystack, a search implementation must follow one
+    /// last EOI transition.
+    ///
+    /// It is best to think of EOI as an additional symbol in the alphabet of
+    /// a DFA that is distinct from every other symbol. That is, the alphabet
+    /// of DFAs in this crate has a logical size of 257 instead of 256, where
+    /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+    /// physical alphabet size may be smaller because of alphabet compression
+    /// via equivalence classes, but EOI is always represented somehow in the
+    /// alphabet.)
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic, but it may also simply return an invalid ID. However, even
+    /// when the caller provides an invalid ID, this must never sacrifice
+    /// memory safety.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a DFA for a given haystack,
+    /// and then finishing the search with the final EOI transition.
+ /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense}; + /// + /// let dfa = dense::DFA::new(r"[a-z]+r")?; + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut state = dfa.start_state_forward( + /// None, haystack, 0, haystack.len(), + /// ); + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// state = dfa.next_state(state, b); + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. Without this + /// // final transition, the assert below will fail since the DFA will not + /// // have entered a match state yet! + /// state = dfa.next_eoi_state(state); + /// assert!(dfa.is_match_state(state)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn next_eoi_state(&self, current: StateID) -> StateID; + + /// Return the ID of the start state for this DFA when executing a forward + /// search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The pattern ID, if present. When the underlying DFA has been compiled + /// with multiple patterns _and_ the DFA has been configured to compile + /// an anchored start state for each pattern, then a pattern ID may be + /// specified to execute an anchored search for that specific pattern. + /// If `pattern_id` is invalid or if the DFA doesn't have start states + /// compiled for each pattern, then implementations must panic. DFAs in + /// this crate can be configured to compile start states for each pattern + /// via + /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern). + /// * When `start > 0`, the byte at index `start - 1` may influence the + /// start state if the regex uses `^` or `\b`. + /// * Similarly, when `start == 0`, it may influence the start state when + /// the regex uses `^` or `\A`. + /// * Currently, `end` is unused. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for forward searches. + /// + /// # Panics + /// + /// Implementations must panic if `start..end` is not a valid sub-slice of + /// `bytes`. Implementations must also panic if `pattern_id` is non-None + /// and does not refer to a valid pattern, or if the DFA was not compiled + /// with anchored start states for each pattern. + fn start_state_forward( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID; + + /// Return the ID of the start state for this DFA when executing a reverse + /// search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The pattern ID, if present. When the underlying DFA has been compiled + /// with multiple patterns _and_ the DFA has been configured to compile an + /// anchored start state for each pattern, then a pattern ID may be + /// specified to execute an anchored search for that specific pattern. If + /// `pattern_id` is invalid or if the DFA doesn't have start states compiled + /// for each pattern, then implementations must panic. DFAs in this crate + /// can be configured to compile start states for each pattern via + /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern). 
+ /// * When `end < bytes.len()`, the byte at index `end` may influence the + /// start state if the regex uses `$` or `\b`. + /// * Similarly, when `end == bytes.len()`, it may influence the start + /// state when the regex uses `$` or `\z`. + /// * Currently, `start` is unused. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for reverse searches. + /// + /// # Panics + /// + /// Implementations must panic if `start..end` is not a valid sub-slice of + /// `bytes`. Implementations must also panic if `pattern_id` is non-None + /// and does not refer to a valid pattern, or if the DFA was not compiled + /// with anchored start states for each pattern. + fn start_state_reverse( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID; + + /// Returns true if and only if the given identifier corresponds to a + /// "special" state. A special state is one or more of the following: + /// a dead state, a quit state, a match state, a start state or an + /// accelerated state. + /// + /// A correct implementation _may_ always return false for states that + /// are either start states or accelerated states, since that information + /// is only intended to be used for optimization purposes. Correct + /// implementations must return true if the state is a dead, quit or match + /// state. This is because search routines using this trait must be able + /// to rely on `is_special_state` as an indicator that a state may need + /// special treatment. (For example, when a search routine sees a dead + /// state, it must terminate.) + /// + /// This routine permits search implementations to use a single branch to + /// check whether a state needs special attention before executing the next + /// transition. The example below shows how to do this. + /// + /// # Example + /// + /// This example shows how `is_special_state` can be used to implement a + /// correct search routine with minimal branching. In particular, this + /// search routine implements "leftmost" matching, which means that it + /// doesn't immediately stop once a match is found. Instead, it continues + /// until it reaches a dead state. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, MatchError, PatternID, + /// }; + /// + /// fn find_leftmost_first<A: Automaton>( + /// dfa: &A, + /// haystack: &[u8], + /// ) -> Result<Option<HalfMatch>, MatchError> { + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. Note that start states can never + /// // be match states (since DFAs in this crate delay matches by 1 + /// // byte), so we don't need to check if the start state is a match. + /// let mut state = dfa.start_state_forward( + /// None, haystack, 0, haystack.len(), + /// ); + /// let mut last_match = None; + /// // Walk all the bytes in the haystack. We can quit early if we see + /// // a dead or a quit state. The former means the automaton will + /// // never transition to any other state. The latter means that the + /// // automaton entered a condition in which its search failed. 
+ /// for (i, &b) in haystack.iter().enumerate() { + /// state = dfa.next_state(state, b); + /// if dfa.is_special_state(state) { + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// i, + /// )); + /// } else if dfa.is_dead_state(state) { + /// return Ok(last_match); + /// } else if dfa.is_quit_state(state) { + /// // It is possible to enter into a quit state after + /// // observing a match has occurred. In that case, we + /// // should return the match instead of an error. + /// if last_match.is_some() { + /// return Ok(last_match); + /// } + /// return Err(MatchError::Quit { byte: b, offset: i }); + /// } + /// // Implementors may also want to check for start or accel + /// // states and handle them differently for performance + /// // reasons. But it is not necessary for correctness. + /// } + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. + /// state = dfa.next_eoi_state(state); + /// if dfa.is_match_state(state) { + /// last_match = Some(HalfMatch::new( + /// dfa.match_pattern(state, 0), + /// haystack.len(), + /// )); + /// } + /// Ok(last_match) + /// } + /// + /// // We use a greedy '+' operator to show how the search doesn't just + /// // stop once a match is detected. It continues extending the match. + /// // Using '[a-z]+?' would also work as expected and stop the search + /// // early. Greediness is built into the automaton. + /// let dfa = dense::DFA::new(r"[a-z]+")?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 10); + /// + /// // Here's another example that tests our handling of the special EOI + /// // transition. This will fail to find a match if we don't call + /// // 'next_eoi_state' at the end of the search since the match isn't + /// // found until the final byte in the haystack. + /// let dfa = dense::DFA::new(r"[0-9]{4}")?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 15); + /// + /// // And note that our search implementation above automatically works + /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects + /// // the appropriate pattern ID for us. + /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; + /// let haystack = "123 foobar 4567".as_bytes(); + /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 1); + /// assert_eq!(mat.offset(), 3); + /// let mat = find_leftmost_first(&dfa, &haystack[3..])?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 0); + /// assert_eq!(mat.offset(), 7); + /// let mat = find_leftmost_first(&dfa, &haystack[10..])?.unwrap(); + /// assert_eq!(mat.pattern().as_usize(), 1); + /// assert_eq!(mat.offset(), 5); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn is_special_state(&self, id: StateID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a dead + /// state. When a DFA enters a dead state, it is impossible to leave. That + /// is, every transition on a dead state by definition leads back to the + /// same dead state. + /// + /// In practice, the dead state always corresponds to the identifier `0`. 
+    /// Moreover, in practice, there is only one dead state.
+    ///
+    /// The existence of a dead state is not strictly required in the classical
+    /// model of finite state machines, where one generally only cares about
+    /// the question of whether an input sequence matches or not. Dead states
+    /// are not needed to answer that question, since one can immediately quit
+    /// as soon as one enters a final or "match" state. However, we don't just
+    /// care about whether a match occurred, but also about where it occurred,
+    /// and more specifically, about semantics like "greedy" matching.
+    ///
+    /// For example, given the pattern `a+` and the input `aaaz`, the dead
+    /// state won't be entered until the state machine reaches `z` in the
+    /// input, at which point, the search routine can quit. But without the
+    /// dead state, the search routine wouldn't know when to quit. In a
+    /// classical representation, the search routine would stop after seeing
+    /// the first `a` (which is when the search would enter a match state). But
+    /// this wouldn't implement "greedy" matching where `a+` matches as many
+    /// `a`'s as possible.
+    ///
+    /// # Example
+    ///
+    /// See the example for [`Automaton::is_special_state`] for how to use this
+    /// method correctly.
+    fn is_dead_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to a quit
+    /// state. A quit state is like a dead state (it has no transitions other
+    /// than to itself), except it indicates that the DFA failed to complete
+    /// the search. When this occurs, callers can neither accept nor reject
+    /// that a match occurred.
+    ///
+    /// In practice, the quit state always corresponds to the state immediately
+    /// following the dead state. (Which is not usually represented by `1`,
+    /// since state identifiers are pre-multiplied by the state machine's
+    /// alphabet stride, and the alphabet stride varies between DFAs.)
+    ///
+    /// By default, state machines created by this crate will never enter a
+    /// quit state. Since entering a quit state is the only way for a DFA
+    /// in this crate to fail at search time, it follows that the default
+    /// configuration can never produce a match error. Nevertheless, handling
+    /// quit states is necessary to correctly support all configurations in
+    /// this crate.
+    ///
+    /// The typical way in which a quit state can occur is when heuristic
+    /// support for Unicode word boundaries is enabled via the
+    /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary)
+    /// option. But other options, like the lower-level
+    /// [`dense::Config::quit`](crate::dfa::dense::Config::quit)
+    /// configuration, can also result in a quit state being entered. The
+    /// purpose of the quit state is to provide a way to execute a fast DFA
+    /// in common cases while delegating to slower routines when the DFA quits.
+    ///
+    /// The default search implementations provided by this crate will return
+    /// a [`MatchError::Quit`](crate::MatchError::Quit) error when a quit state
+    /// is entered.
+    ///
+    /// # Example
+    ///
+    /// See the example for [`Automaton::is_special_state`] for how to use this
+    /// method correctly.
+    fn is_quit_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to a
+    /// match state. A match state is also referred to as a "final" state and
+    /// indicates that a match has been found.
+    ///
+    /// If all you care about is whether a particular pattern matches in the
+    /// input sequence, then a search routine can quit early as soon as the
+    /// machine enters a match state. However, if you're looking for the
+    /// standard "leftmost-first" match location, then search _must_ continue
+    /// until either the end of the input or until the machine enters a dead
+    /// state. (Since either condition implies that no other useful work can
+    /// be done.) Namely, when looking for the location of a match, search
+    /// implementations should record the most recent location in
+    /// which a match state was entered, but otherwise continue executing the
+    /// search as normal. (The search may even leave the match state.) Once
+    /// the termination condition is reached, the most recently recorded match
+    /// location should be returned.
+    ///
+    /// Finally, one additional power given to match states in this crate
+    /// is that they are always associated with a specific pattern in order
+    /// to support multi-DFAs. See [`Automaton::match_pattern`] for more
+    /// details and an example of how to query the pattern associated with a
+    /// particular match state.
+    ///
+    /// # Example
+    ///
+    /// See the example for [`Automaton::is_special_state`] for how to use this
+    /// method correctly.
+    fn is_match_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to a
+    /// start state. A start state is a state in which a DFA begins a search.
+    /// All searches begin in a start state. Moreover, since all matches are
+    /// delayed by one byte, a start state can never be a match state.
+    ///
+    /// The main role of a start state is, as mentioned, to be a starting
+    /// point for a DFA. This starting point is determined via one of
+    /// [`Automaton::start_state_forward`] or
+    /// [`Automaton::start_state_reverse`], depending on whether one is doing
+    /// a forward or a reverse search, respectively.
+    ///
+    /// A secondary use of start states is for prefix acceleration. Namely,
+    /// while executing a search, if you detect that you're in a start state,
+    /// then it may be faster to look for the next match of a prefix of the
+    /// pattern, if one exists. If such a prefix exists, then, since all
+    /// matches must begin with it, skipping ahead to occurrences of that
+    /// prefix may be much faster than executing the DFA.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to implement your own search routine that does
+    /// a prefix search whenever the search enters a start state.
+    ///
+    /// Note that you do not need to implement your own search routine to
+    /// make use of prefilters like this. The search routines provided
+    /// by this crate already implement prefilter support via the
+    /// [`Prefilter`](crate::util::prefilter::Prefilter) trait. The various
+    /// `find_*_at` routines on this trait support the `Prefilter` trait
+    /// through [`Scanner`](crate::util::prefilter::Scanner)s. This example is
+    /// meant to show how you might deal with prefilters in a simplified case
+    /// if you are implementing your own search routine.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     MatchError, PatternID,
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> {
+    ///     // Would be faster to use the memchr crate, but this is still
+    ///     // faster than running through the DFA.
+    ///     slice[at..].iter().position(|&b| b == byte).map(|i| at + i)
+    /// }
+    ///
+    /// fn find_leftmost_first<A: Automaton>(
+    ///     dfa: &A,
+    ///     haystack: &[u8],
+    ///     prefix_byte: Option<u8>,
+    /// ) -> Result<Option<HalfMatch>, MatchError> {
+    ///     // See the Automaton::is_special_state example for similar code
+    ///     // with more comments.
+    ///
+    ///     let mut state = dfa.start_state_forward(
+    ///         None, haystack, 0, haystack.len(),
+    ///     );
+    ///     let mut last_match = None;
+    ///     let mut pos = 0;
+    ///     while pos < haystack.len() {
+    ///         let b = haystack[pos];
+    ///         state = dfa.next_state(state, b);
+    ///         pos += 1;
+    ///         if dfa.is_special_state(state) {
+    ///             if dfa.is_match_state(state) {
+    ///                 last_match = Some(HalfMatch::new(
+    ///                     dfa.match_pattern(state, 0),
+    ///                     pos - 1,
+    ///                 ));
+    ///             } else if dfa.is_dead_state(state) {
+    ///                 return Ok(last_match);
+    ///             } else if dfa.is_quit_state(state) {
+    ///                 // It is possible to enter into a quit state after
+    ///                 // observing a match has occurred. In that case, we
+    ///                 // should return the match instead of an error.
+    ///                 if last_match.is_some() {
+    ///                     return Ok(last_match);
+    ///                 }
+    ///                 return Err(MatchError::Quit {
+    ///                     byte: b, offset: pos - 1,
+    ///                 });
+    ///             } else if dfa.is_start_state(state) {
+    ///                 // If we're in a start state and know all matches begin
+    ///                 // with a particular byte, then we can quickly skip to
+    ///                 // candidate matches without running the DFA through
+    ///                 // every byte in between.
+    ///                 if let Some(prefix_byte) = prefix_byte {
+    ///                     pos = match find_byte(haystack, pos, prefix_byte) {
+    ///                         Some(pos) => pos,
+    ///                         None => break,
+    ///                     };
+    ///                 }
+    ///             }
+    ///         }
+    ///     }
+    ///     // Matches are always delayed by 1 byte, so we must explicitly walk
+    ///     // the special "EOI" transition at the end of the search.
+    ///     state = dfa.next_eoi_state(state);
+    ///     if dfa.is_match_state(state) {
+    ///         last_match = Some(HalfMatch::new(
+    ///             dfa.match_pattern(state, 0),
+    ///             haystack.len(),
+    ///         ));
+    ///     }
+    ///     Ok(last_match)
+    /// }
+    ///
+    /// // In this example, it's obvious that all occurrences of our pattern
+    /// // begin with 'Z', so we pass in 'Z'.
+    /// let dfa = dense::DFA::new(r"Z[a-z]+")?;
+    /// let haystack = "123 foobar Zbaz quux".as_bytes();
+    /// let mat = find_leftmost_first(&dfa, haystack, Some(b'Z'))?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 0);
+    /// assert_eq!(mat.offset(), 15);
+    ///
+    /// // But note that we don't need to pass in a prefix byte. If we don't,
+    /// // then the search routine does no acceleration.
+    /// let mat = find_leftmost_first(&dfa, haystack, None)?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 0);
+    /// assert_eq!(mat.offset(), 15);
+    ///
+    /// // However, if we pass an incorrect byte, then the prefix search will
+    /// // produce incorrect results.
+    /// assert_eq!(find_leftmost_first(&dfa, haystack, Some(b'X'))?, None);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn is_start_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to an
+    /// accelerated state.
+    ///
+    /// An accelerated state is a special optimization
+    /// trick implemented by this crate. Namely, if
+    /// [`dense::Config::accelerate`](crate::dfa::dense::Config::accelerate) is
+    /// enabled (and it is by default), then DFAs generated by this crate will
+    /// tag states meeting certain characteristics as accelerated. States meet
+    /// these criteria whenever most of their transitions are self-transitions.
+ /// That is, transitions that loop back to the same state. When a small + /// number of transitions aren't self-transitions, then it follows that + /// there are only a small number of bytes that can cause the DFA to leave + /// that state. Thus, there is an opportunity to look for those bytes + /// using more optimized routines rather than continuing to run through + /// the DFA. This trick is similar to the prefilter idea described in + /// the documentation of [`Automaton::is_start_state`] with two main + /// differences: + /// + /// 1. It is more limited since acceleration only applies to single bytes. + /// This means states are rarely accelerated when Unicode mode is enabled + /// (which is enabled by default). + /// 2. It can occur anywhere in the DFA, which increases optimization + /// opportunities. + /// + /// Like the prefilter idea, the main downside (and a possible reason to + /// disable it) is that it can lead to worse performance in some cases. + /// Namely, if a state is accelerated for very common bytes, then the + /// overhead of checking for acceleration and using the more optimized + /// routines to look for those bytes can cause overall performance to be + /// worse than if acceleration wasn't enabled at all. + /// + /// A simple example of a regex that has an accelerated state is + /// `(?-u)[^a]+a`. Namely, the `[^a]+` sub-expression gets compiled down + /// into a single state where all transitions except for `a` loop back to + /// itself, and where `a` is the only transition (other than the special + /// EOI transition) that goes to some other state. Thus, this state can + /// be accelerated and implemented more efficiently by calling an + /// optimized routine like `memchr` with `a` as the needle. Notice that + /// the `(?-u)` to disable Unicode is necessary here, as without it, + /// `[^a]` will match any UTF-8 encoding of any Unicode scalar value other + /// than `a`. This more complicated expression compiles down to many DFA + /// states and the simple acceleration optimization is no longer available. + /// + /// Typically, this routine is used to guard calls to + /// [`Automaton::accelerator`], which returns the accelerated bytes for + /// the specified state. + fn is_accel_state(&self, id: StateID) -> bool; + + /// Returns the total number of patterns compiled into this DFA. + /// + /// In the case of a DFA that contains no patterns, this must return `0`. + /// + /// # Example + /// + /// This example shows the pattern count for a DFA that never matches: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa: DFA<Vec<u32>> = DFA::never_match()?; + /// assert_eq!(dfa.pattern_count(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a DFA that matches at every position: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa: DFA<Vec<u32>> = DFA::always_match()?; + /// assert_eq!(dfa.pattern_count(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a DFA that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(dfa.pattern_count(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn pattern_count(&self) -> usize; + + /// Returns the total number of patterns that match in this state. 
+ /// + /// If the given state is not a match state, then implementations may + /// panic. + /// + /// If the DFA was compiled with one pattern, then this must necessarily + /// always return `1` for all match states. + /// + /// Implementations must guarantee that [`Automaton::match_pattern`] can + /// be called with indices up to (but not including) the count returned by + /// this routine without panicking. + /// + /// # Panics + /// + /// Implementations are permitted to panic if the provided state ID does + /// not correspond to a match state. + /// + /// # Example + /// + /// This example shows a simple instance of implementing overlapping + /// matches. In particular, it shows not only how to determine how many + /// patterns have matched in a particular state, but also how to access + /// which specific patterns have matched. + /// + /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All) + /// when building the DFA. If we used + /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst) + /// instead, then the DFA would not be constructed in a way that supports + /// overlapping matches. (It would only report a single pattern that + /// matches at any particular point in time.) + /// + /// Another thing to take note of is the patterns used and the order in + /// which the pattern IDs are reported. In the example below, pattern `3` + /// is yielded first. Why? Because it corresponds to the match that + /// appears first. Namely, the `@` symbol is part of `\S+` but not part + /// of any of the other patterns. Since the `\S+` pattern has a match that + /// starts to the left of any other pattern, its ID is returned before any + /// other. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[ + /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+", + /// ])?; + /// let haystack = "@bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut state = dfa.start_state_forward( + /// None, haystack, 0, haystack.len(), + /// ); + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// state = dfa.next_state(state, b); + /// } + /// state = dfa.next_eoi_state(state); + /// + /// assert!(dfa.is_match_state(state)); + /// assert_eq!(dfa.match_count(state), 3); + /// // The following calls are guaranteed to not panic since `match_count` + /// // returned `3` above. + /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3); + /// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0); + /// assert_eq!(dfa.match_pattern(state, 2).as_usize(), 1); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn match_count(&self, id: StateID) -> usize; + + /// Returns the pattern ID corresponding to the given match index in the + /// given state. + /// + /// See [`Automaton::match_count`] for an example of how to use this + /// method correctly. Note that if you know your DFA is compiled with a + /// single pattern, then this routine is never necessary since it will + /// always return a pattern ID of `0` for an index of `0` when `id` + /// corresponds to a match state. + /// + /// Typically, this routine is used when implementing an overlapping + /// search, as the example for `Automaton::match_count` does. 
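+    ///
+    /// As a brief sketch (a compressed variant of the
+    /// [`Automaton::match_count`] example, with hypothetical patterns chosen
+    /// just for illustration), the IDs of every pattern that matches in a
+    /// match state can be collected like so:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, MatchKind};
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"[a-z]+", r"[a-z0-9]+"])?;
+    /// let haystack = "abc".as_bytes();
+    ///
+    /// // Walk the DFA, including the final EOI transition.
+    /// let mut state = dfa.start_state_forward(None, haystack, 0, haystack.len());
+    /// for &b in haystack {
+    ///     state = dfa.next_state(state, b);
+    /// }
+    /// state = dfa.next_eoi_state(state);
+    /// assert!(dfa.is_match_state(state));
+    ///
+    /// // Indices strictly less than `match_count` are guaranteed not to
+    /// // panic, so this collects the ID of every matching pattern.
+    /// let pids: Vec<usize> = (0..dfa.match_count(state))
+    ///     .map(|i| dfa.match_pattern(state, i).as_usize())
+    ///     .collect();
+    /// assert!(pids.contains(&0) && pids.contains(&1));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```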
+    ///
+    /// # Panics
+    ///
+    /// If the state ID is not a match state or if the match index is out
+    /// of bounds for the given state, then this routine may either panic
+    /// or produce an incorrect result. If the state ID is correct and the
+    /// match index is correct, then this routine must always produce a valid
+    /// `PatternID`.
+    fn match_pattern(&self, id: StateID, index: usize) -> PatternID;
+
+    /// Return a slice of bytes to accelerate for the given state, if possible.
+    ///
+    /// If the given state has no accelerator, then an empty slice must be
+    /// returned. If `Automaton::is_accel_state` returns true for the given
+    /// ID, then this routine _may_ return a non-empty slice, but it is not
+    /// required to do so.
+    ///
+    /// If the given ID is not a valid state ID for this automaton, then
+    /// implementations may panic or produce incorrect results.
+    ///
+    /// See [`Automaton::is_accel_state`] for more details on state
+    /// acceleration.
+    ///
+    /// By default, this method will always return an empty slice.
+    ///
+    /// # Example
+    ///
+    /// This example shows a contrived case in which we build a regex that we
+    /// know is accelerated and extract the accelerator from a state.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    ///     util::id::StateID,
+    ///     SyntaxConfig,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     // We disable Unicode everywhere and permit the regex to match
+    ///     // invalid UTF-8. e.g., `[^abc]` matches `\xFF`, which is not valid
+    ///     // UTF-8.
+    ///     .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+    ///     // This makes the implicit `(?s:.)*?` prefix added to the regex
+    ///     // match through arbitrary bytes instead of being UTF-8 aware. This
+    ///     // isn't necessary to get acceleration to work in this case, but
+    ///     // it does make the DFA substantially simpler.
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build("[^abc]+a")?;
+    ///
+    /// // Here we just pluck out the state that we know is accelerated.
+    /// // While the stride calculations are something that can be relied
+    /// // on by callers, the specific position of the accelerated state is
+    /// // implementation defined.
+    /// //
+    /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
+    /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`.
+    /// let id = StateID::new(3 * dfa.stride()).unwrap();
+    /// let accelerator = dfa.accelerator(id);
+    /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
+    /// assert_eq!(accelerator, &[b'a', b'b', b'c']);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn accelerator(&self, _id: StateID) -> &[u8] {
+        &[]
+    }
+
+    /// Executes a forward search and returns the end position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state. This is useful for implementing boolean `is_match`-like
+    /// routines, where as little work as possible is done.
+    ///
+    /// See [`Automaton::find_earliest_fwd_at`] for additional functionality,
+    /// such as providing a prefilter, a specific pattern to match and the
+    /// bounds of the search within the haystack. This routine is meant as
+    /// a convenience for common cases where the additional functionality is
+    /// not needed.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates
+    /// how the position returned might differ from what one might expect when
+    /// executing a traditional leftmost search.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = dense::DFA::new("foo[0-9]+")?;
+    /// // Normally, the end of the leftmost first match here would be 8,
+    /// // corresponding to the end of the input. But the "earliest" semantics
+    /// // of this routine cause it to stop as soon as a match is known, which
+    /// // occurs once 'foo[0-9]' has matched.
+    /// let expected = HalfMatch::must(0, 4);
+    /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"foo12345")?);
+    ///
+    /// let dfa = dense::DFA::new("abc|a")?;
+    /// // Normally, the end of the leftmost first match here would be 3,
+    /// // but the shortest match semantics detect a match earlier.
+    /// let expected = HalfMatch::must(0, 1);
+    /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"abc")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_earliest_fwd(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_earliest_fwd_at(None, None, bytes, 0, bytes.len())
+    }
+
+    /// Executes a reverse search and returns the start position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state.
+    ///
+    /// Note that while it is not technically necessary to build a reverse
+    /// automaton to use a reverse search, it is likely that you'll want to do
+    /// so. Namely, the typical use of a reverse search is to find the starting
+    /// location of a match once its end is discovered from a forward search. A
+    /// reverse DFA can be built by configuring the intermediate NFA
+    /// to be reversed via
+    /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates
+    /// how the position returned might differ from what one might expect when
+    /// executing a traditional leftmost reverse search.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("[a-z]+[0-9]+")?;
+    /// // Normally, the end of the leftmost first match here would be 0,
+    /// // corresponding to the beginning of the input. But the "earliest"
But the "earliest" + /// // semantics of this routine cause it to stop as soon as a match is + /// // known, which occurs once '[a-z][0-9]+' has matched. + /// let expected = HalfMatch::must(0, 2); + /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"foo12345")?); + /// + /// let dfa = dense::Builder::new() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// // Normally, the end of the leftmost first match here would be 0, + /// // but the shortest match semantics detect a match earlier. + /// let expected = HalfMatch::must(0, 2); + /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"abc")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn find_earliest_rev( + &self, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_earliest_rev_at(None, bytes, 0, bytes.len()) + } + + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Notes for implementors + /// + /// Implementors of this trait are not required to implement any particular + /// match semantics (such as leftmost-first), which are instead manifest in + /// the DFA's transitions. + /// + /// In particular, this method must continue searching even after it enters + /// a match state. The search should only terminate once it has reached + /// the end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// Since this trait provides an implementation for this method by default, + /// it's unlikely that one will need to implement this. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`dense::DFA`](crate::dfa::dense::DFA). By default, a dense DFA uses + /// "leftmost first" match semantics. + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// }; + /// + /// let dfa = dense::DFA::new("foo[0-9]+")?; + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. 
+    /// let dfa = dense::DFA::new("abc|a")?;
+    /// let expected = HalfMatch::must(0, 3);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"abc")?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_leftmost_fwd(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_leftmost_fwd_at(None, None, bytes, 0, bytes.len())
+    }
+
+    /// Executes a reverse search and returns the start position of the
+    /// leftmost match that is found. If no match exists, then `None` is
+    /// returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Notes for implementors
+    ///
+    /// Implementors of this trait are not required to implement any particular
+    /// match semantics (such as leftmost-first), which are instead manifest in
+    /// the DFA's transitions.
+    ///
+    /// In particular, this method must continue searching even after it enters
+    /// a match state. The search should only terminate once it has reached
+    /// the end of the input or when it has entered a dead or quit state. Upon
+    /// termination, the position of the last byte seen while still in a match
+    /// state is returned.
+    ///
+    /// Since this trait provides an implementation for this method by default,
+    /// it's unlikely that one will need to implement this.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this routine
+    /// is principally useful when used in conjunction with the
+    /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+    /// configuration. In general, it's unlikely to be correct to use both
+    /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since any
+    /// particular DFA will only support searching in one direction with
+    /// respect to the pattern.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("foo[0-9]+")?;
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"foo12345")?);
+    ///
+    /// // Even though a match is found after reading the last byte (`c`),
+    /// // the leftmost first match semantics demand that we find the earliest
+    /// // match that prefers earlier parts of the pattern over later parts.
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("abc|c")?;
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"abc")?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_leftmost_rev(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_leftmost_rev_at(None, bytes, 0, bytes.len())
+    }
+
+    /// Executes an overlapping forward search and returns the end position of
+    /// matches as they are found. If no match exists, then `None` is returned.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search
+    /// state from prior calls so that the implementation knows where the last
+    /// match occurred.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run a basic overlapping search with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). Notice that we build the
+    /// automaton with a `MatchKind::All` configuration. Overlapping searches
+    /// are unlikely to work as one would expect when using the default
+    /// `MatchKind::LeftmostFirst` match semantics, since leftmost-first
+    /// matching is fundamentally incompatible with overlapping searches.
+    /// Namely, overlapping searches need to report matches as they are seen,
+    /// whereas leftmost-first searches will continue searching even after a
+    /// match has been observed in order to find the conventional end position
+    /// of the match. More concretely, leftmost-first searches use dead states
+    /// to terminate a search after a specific match can no longer be extended.
+    /// Overlapping searches instead do the opposite by continuing the search
+    /// to find totally new matches (potentially of other patterns).
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, OverlappingState, dense},
+    ///     HalfMatch,
+    ///     MatchKind,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let haystack = "@foo".as_bytes();
+    /// let mut state = OverlappingState::start();
+    ///
+    /// let expected = Some(HalfMatch::must(1, 4));
+    /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // The first pattern also matches at the same position, so re-running
+    /// // the search will yield another match. Notice also that the first
+    /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, is therefore an earlier
+    /// // match and is thus reported first.
+    /// let expected = Some(HalfMatch::must(0, 4));
+    /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_overlapping_fwd(
+        &self,
+        bytes: &[u8],
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_overlapping_fwd_at(None, None, bytes, 0, bytes.len(), state)
+    }
+
+    /// Executes a forward search and returns the end position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state. This is useful for implementing boolean `is_match`-like
+    /// routines, where as little work as possible is done.
+    ///
+    /// This is like [`Automaton::find_earliest_fwd`], except it provides some
+    /// additional control over how the search is executed:
+    ///
+    /// * `pre` is a prefilter scanner that, when given, is used whenever the
+    /// DFA enters its starting state. This is meant to speed up searches where
+    /// one or a small number of literal prefixes are known.
+    /// * `pattern_id` specifies a specific pattern in the DFA to run an
+    /// anchored search for. If not given, then a search for any pattern is
+    /// performed. For DFAs built by this crate,
+    /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern)
+    /// must be enabled to use this functionality.
+    /// * `start` and `end` permit searching a specific region of the haystack
+    /// `bytes`. This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `bytes`. (This is because look-around
+    /// operations such as `\b`, `^` and `$` need to take the surrounding
+    /// context into account, and that context is lost when the haystack is
+    /// sub-sliced.)
+    ///
+    /// The examples below demonstrate each of these additional parameters.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
+    ///
+    /// # Example: prefilter
+    ///
+    /// This example shows how to provide a prefilter for a pattern where all
+    /// matches start with a `z` byte.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     util::prefilter::{Candidate, Prefilter, Scanner, State},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// #[derive(Debug)]
+    /// pub struct ZPrefilter;
+    ///
+    /// impl Prefilter for ZPrefilter {
+    ///     fn next_candidate(
+    ///         &self,
+    ///         _: &mut State,
+    ///         haystack: &[u8],
+    ///         at: usize,
+    ///     ) -> Candidate {
+    ///         // Try changing b'z' to b'q' and observe this test fail since
+    ///         // the prefilter will skip right over the match.
+    ///         match haystack[at..].iter().position(|&b| b == b'z') {
+    ///             None => Candidate::None,
+    ///             Some(i) => Candidate::PossibleStartOfMatch(at + i),
+    ///         }
+    ///     }
+    ///
+    ///     fn heap_bytes(&self) -> usize {
+    ///         0
+    ///     }
+    /// }
+    ///
+    /// let dfa = dense::DFA::new("z[0-9]{3}")?;
+    /// let haystack = "foobar z123 q123".as_bytes();
+    /// // A scanner executes a prefilter while tracking some state that helps
+    /// // determine whether a prefilter is still "effective" or not.
+    /// let mut scanner = Scanner::new(&ZPrefilter);
+    ///
+    /// let expected = Some(HalfMatch::must(0, 11));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     Some(&mut scanner),
+    ///     None,
+    ///     haystack,
+    ///     0,
+    ///     haystack.len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: specific pattern search
+    ///
+    /// This example shows how to build a multi-DFA that permits searching for
+    /// specific patterns.
+ /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// PatternID, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().starts_for_each_pattern(true)) + /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let haystack = "foo123".as_bytes(); + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.find_earliest_fwd_at( + /// None, + /// None, + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(HalfMatch::must(1, 6)); + /// let got = dfa.find_earliest_fwd_at( + /// None, + /// Some(PatternID::must(1)), + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, + /// }; + /// + /// // N.B. We disable Unicode here so that we use a simple ASCII word + /// // boundary. Alternatively, we could enable heuristic support for + /// // Unicode word boundaries. + /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?; + /// let haystack = "foo123bar".as_bytes(); + /// + /// // Since we sub-slice the haystack, the search doesn't know about the + /// // larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `3` instead of `6`. + /// let expected = Some(HalfMatch::must(0, 3)); + /// let got = dfa.find_earliest_fwd_at( + /// None, + /// None, + /// &haystack[3..6], + /// 0, + /// haystack[3..6].len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let got = dfa.find_earliest_fwd_at( + /// None, + /// None, + /// haystack, + /// 3, + /// 6, + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn find_earliest_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_earliest_fwd(pre, self, pattern_id, bytes, start, end) + } + + /// Executes a reverse search and returns the start position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. + /// + /// This is like [`Automaton::find_earliest_rev`], except it provides some + /// additional control over how the search is executed. 
See the documentation of [`Automaton::find_earliest_fwd_at`] for more details on the additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
+    #[inline]
+    fn find_earliest_rev_at(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_earliest_rev(self, pattern_id, bytes, start, end)
+    }
+
+    /// Executes a forward search and returns the end position of the leftmost
+    /// match that is found. If no match exists, then `None` is returned.
+    ///
+    /// This is like [`Automaton::find_leftmost_fwd`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
+    #[inline]
+    fn find_leftmost_fwd_at(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_leftmost_fwd(pre, self, pattern_id, bytes, start, end)
+    }
+
+    /// Executes a reverse search and returns the start position of the
+    /// leftmost match that is found. If no match exists, then `None` is
+    /// returned.
+    ///
+    /// This is like [`Automaton::find_leftmost_rev`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
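+    ///
+    /// # Example
+    ///
+    /// The following is a sketch (the pattern and offsets are illustrative
+    /// choices, not canonical ones) of how the positional parameters combine
+    /// with a reverse search. A reverse leftmost search reports the *start*
+    /// of a match, so it pairs naturally with explicit bounds when the end
+    /// of a match is already known.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     nfa::thompson,
+    ///     HalfMatch, MatchKind,
+    /// };
+    ///
+    /// // Build a reverse DFA with 'All' semantics so that the longest
+    /// // match, and hence the leftmost starting position, is found.
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new()
+    ///         .anchored(true)
+    ///         .match_kind(MatchKind::All)
+    ///     )
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build(r"[0-9]+")?;
+    /// let haystack = "abc123xyz".as_bytes();
+    ///
+    /// // Search the range [0, 6), i.e., up through the end of '123'. The
+    /// // reverse search scans backwards from position 6 and reports the
+    /// // start of the match at offset 3.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// let got = dfa.find_leftmost_rev_at(None, haystack, 0, 6)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```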
+ #[inline] + fn find_leftmost_rev_at( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_leftmost_rev(self, pattern_id, bytes, start, end) + } + + /// Executes an overlapping forward search and returns the end position of + /// matches as they are found. If no match exists, then `None` is returned. + /// + /// This routine is principally only useful when searching for multiple + /// patterns on inputs where multiple patterns may match the same regions + /// of text. In particular, callers must preserve the automaton's search + /// state from prior calls so that the implementation knows where the last + /// match occurred. + /// + /// This is like [`Automaton::find_overlapping_fwd`], except it provides + /// some additional control over how the search is executed. See the + /// documentation of [`Automaton::find_earliest_fwd_at`] for more details + /// on the additional parameters along with examples of their usage. + /// + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should always be set to the end + /// of the last match. If more patterns match at the previous location, + /// then they will be immediately returned. (This is tracked by the given + /// overlapping state.) Otherwise, the search continues at the starting + /// position given. + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFAs generated by this crate, this only occurs in a non-default + /// configuration where quit bytes are used or Unicode word boundaries are + /// heuristically enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine must panic if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It must also panic if the given haystack range is not valid. 
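+    ///
+    /// # Example
+    ///
+    /// The following is a sketch of the iteration protocol described above,
+    /// using an assumed pair of patterns where one is a prefix of the other.
+    /// Note how the same state is threaded through every call and how
+    /// `start` is advanced to the end of the previous match.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, OverlappingState, dense},
+    ///     HalfMatch, MatchKind,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"@", r"@foo"])?;
+    /// let haystack = "@foo".as_bytes();
+    ///
+    /// let mut state = OverlappingState::start();
+    /// let mut start = 0;
+    /// let mut got = vec![];
+    /// while let Some(m) = dfa.find_overlapping_fwd_at(
+    ///     None, None, haystack, start, haystack.len(), &mut state,
+    /// )? {
+    ///     // Resume the next search at the end of this match.
+    ///     start = m.offset();
+    ///     got.push(m);
+    /// }
+    /// let expected = vec![HalfMatch::must(0, 1), HalfMatch::must(1, 4)];
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```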
+ #[inline] + fn find_overlapping_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_overlapping_fwd( + pre, self, pattern_id, bytes, start, end, state, + ) + } +} + +unsafe impl<'a, T: Automaton> Automaton for &'a T { + #[inline] + fn next_state(&self, current: StateID, input: u8) -> StateID { + (**self).next_state(current, input) + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID { + (**self).next_state_unchecked(current, input) + } + + #[inline] + fn next_eoi_state(&self, current: StateID) -> StateID { + (**self).next_eoi_state(current) + } + + #[inline] + fn start_state_forward( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + (**self).start_state_forward(pattern_id, bytes, start, end) + } + + #[inline] + fn start_state_reverse( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + (**self).start_state_reverse(pattern_id, bytes, start, end) + } + + #[inline] + fn is_special_state(&self, id: StateID) -> bool { + (**self).is_special_state(id) + } + + #[inline] + fn is_dead_state(&self, id: StateID) -> bool { + (**self).is_dead_state(id) + } + + #[inline] + fn is_quit_state(&self, id: StateID) -> bool { + (**self).is_quit_state(id) + } + + #[inline] + fn is_match_state(&self, id: StateID) -> bool { + (**self).is_match_state(id) + } + + #[inline] + fn is_start_state(&self, id: StateID) -> bool { + (**self).is_start_state(id) + } + + #[inline] + fn is_accel_state(&self, id: StateID) -> bool { + (**self).is_accel_state(id) + } + + #[inline] + fn pattern_count(&self) -> usize { + (**self).pattern_count() + } + + #[inline] + fn match_count(&self, id: StateID) -> usize { + (**self).match_count(id) + } + + #[inline] + fn match_pattern(&self, id: StateID, index: usize) -> PatternID { + (**self).match_pattern(id, index) + } + + #[inline] + fn accelerator(&self, id: StateID) -> &[u8] { + (**self).accelerator(id) + } + + #[inline] + fn find_earliest_fwd( + &self, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).find_earliest_fwd(bytes) + } + + #[inline] + fn find_earliest_rev( + &self, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).find_earliest_rev(bytes) + } + + #[inline] + fn find_leftmost_fwd( + &self, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).find_leftmost_fwd(bytes) + } + + #[inline] + fn find_leftmost_rev( + &self, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).find_leftmost_rev(bytes) + } + + #[inline] + fn find_overlapping_fwd( + &self, + bytes: &[u8], + state: &mut OverlappingState, + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).find_overlapping_fwd(bytes, state) + } + + #[inline] + fn find_earliest_fwd_at( + &self, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).find_earliest_fwd_at(pre, pattern_id, bytes, start, end) + } + + #[inline] + fn find_earliest_rev_at( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + (**self).find_earliest_rev_at(pattern_id, bytes, start, end) + } + + #[inline] + fn find_leftmost_fwd_at( 
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_leftmost_fwd_at(pre, pattern_id, bytes, start, end)
+    }
+
+    #[inline]
+    fn find_leftmost_rev_at(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_leftmost_rev_at(pattern_id, bytes, start, end)
+    }
+
+    #[inline]
+    fn find_overlapping_fwd_at(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self)
+            .find_overlapping_fwd_at(pre, pattern_id, bytes, start, end, state)
+    }
+}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides no introspection capabilities. The only thing a caller
+/// can do is construct it and pass it around to permit search routines to use
+/// it to track state.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may result in incorrect results.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+    /// The ID of the state that the search was in when the call terminated.
+    /// When this is a match state, `last_match` must be set to a non-None
+    /// value.
+    ///
+    /// A `None` value indicates the start state of the corresponding
+    /// automaton. We cannot use the actual ID, since any one automaton may
+    /// have many start states, and which one is in use depends on several
+    /// search-time factors.
+    id: Option<StateID>,
+    /// Information associated with a match when `id` corresponds to a match
+    /// state.
+    last_match: Option<StateMatch>,
+}
+
+/// Internal state about the last match that occurred. This records both the
+/// offset of the match and the match index.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) struct StateMatch {
+    /// The index into the matching patterns for the current match state.
+    pub(crate) match_index: usize,
+    /// The offset in the haystack at which the match occurred. This is used
+    /// when reporting multiple matches at the same offset. That is, when
+    /// an overlapping search runs, the first thing it checks is whether it's
+    /// already in a match state, and if so, whether there are more patterns
+    /// to report as matches in that state. If so, it increments `match_index`
+    /// and returns the pattern and this offset. Once `match_index` exceeds the
+    /// number of matching patterns in the current state, the search continues.
+    pub(crate) offset: usize,
+}
+
+impl OverlappingState {
+    /// Create a new overlapping state that begins at the start state of any
+    /// automaton.
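+    ///
+    /// # Example
+    ///
+    /// A fresh state is all that is needed to begin or restart an
+    /// overlapping search. As a sketch:
+    ///
+    /// ```
+    /// use regex_automata::dfa::OverlappingState;
+    ///
+    /// let mut state = OverlappingState::start();
+    /// // ... run any number of overlapping search calls with `state` ...
+    /// // To forget all prior match state and start over, make a new one:
+    /// state = OverlappingState::start();
+    /// ```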
+    pub fn start() -> OverlappingState {
+        OverlappingState { id: None, last_match: None }
+    }
+
+    pub(crate) fn id(&self) -> Option<StateID> {
+        self.id
+    }
+
+    pub(crate) fn set_id(&mut self, id: StateID) {
+        self.id = Some(id);
+    }
+
+    pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
+        self.last_match.as_mut()
+    }
+
+    pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
+        self.last_match = Some(last_match);
+    }
+}
+
+/// Write a prefix "state" indicator for fmt::Debug impls.
+///
+/// Specifically, this tries to succinctly distinguish the different types of
+/// states: dead states, quit states, accelerated states, start states and
+/// match states. It even accounts for the possible overlap of different
+/// state types.
+pub(crate) fn fmt_state_indicator<A: Automaton>(
+    f: &mut core::fmt::Formatter<'_>,
+    dfa: A,
+    id: StateID,
+) -> core::fmt::Result {
+    if dfa.is_dead_state(id) {
+        write!(f, "D")?;
+        if dfa.is_start_state(id) {
+            write!(f, ">")?;
+        } else {
+            write!(f, " ")?;
+        }
+    } else if dfa.is_quit_state(id) {
+        write!(f, "Q ")?;
+    } else if dfa.is_start_state(id) {
+        if dfa.is_accel_state(id) {
+            write!(f, "A>")?;
+        } else {
+            write!(f, " >")?;
+        }
+    } else if dfa.is_match_state(id) {
+        if dfa.is_accel_state(id) {
+            write!(f, "A*")?;
+        } else {
+            write!(f, " *")?;
+        }
+    } else if dfa.is_accel_state(id) {
+        write!(f, "A ")?;
+    } else {
+        write!(f, "  ")?;
+    }
+    Ok(())
+}
diff --git a/vendor/regex-automata-0.2.0/src/dfa/dense.rs b/vendor/regex-automata-0.2.0/src/dfa/dense.rs
new file mode 100644
index 000000000..07c135098
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/dfa/dense.rs
@@ -0,0 +1,4470 @@
+/*!
+Types and routines specific to dense DFAs.
+
+This module is the home of [`dense::DFA`](DFA).
+
+This module also contains a [`dense::Builder`](Builder) and a
+[`dense::Config`](Config) for configuring and building a dense DFA.
+*/
+
+#[cfg(feature = "alloc")]
+use core::cmp;
+use core::{convert::TryFrom, fmt, iter, mem::size_of, slice};
+
+#[cfg(feature = "alloc")]
+use alloc::{
+    collections::{BTreeMap, BTreeSet},
+    vec,
+    vec::Vec,
+};
+
+#[cfg(feature = "alloc")]
+use crate::{
+    dfa::{
+        accel::Accel, determinize, error::Error, minimize::Minimizer, sparse,
+    },
+    nfa::thompson,
+    util::alphabet::ByteSet,
+    MatchKind,
+};
+use crate::{
+    dfa::{
+        accel::Accels,
+        automaton::{fmt_state_indicator, Automaton},
+        special::Special,
+        DEAD,
+    },
+    util::{
+        alphabet::{self, ByteClasses},
+        bytes::{self, DeserializeError, Endian, SerializeError},
+        id::{PatternID, StateID},
+        start::Start,
+    },
+};
+
+/// The label that is prepended to a serialized DFA.
+const LABEL: &str = "rust-regex-automata-dfa-dense";
+
+/// The format version of dense regexes. This version gets incremented when a
+/// change occurs. A change may not necessarily be a breaking change, but the
+/// version does permit good error messages in the case where a breaking
+/// change is made.
+const VERSION: u32 = 2;
+
+/// The configuration used for compiling a dense DFA.
+///
+/// A dense DFA configuration is a simple data object that is typically used
+/// with [`dense::Builder::configure`](self::Builder::configure).
+///
+/// The default configuration guarantees that a search will _never_ return a
+/// [`MatchError`](crate::MatchError) for any haystack or pattern. Setting a
+/// quit byte with [`Config::quit`] or enabling heuristic support for Unicode
+/// word boundaries with [`Config::unicode_word_boundary`] can in turn cause a
+/// search to return an error.
+/// See the corresponding configuration options for more details on when
+/// those error conditions arise.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    // As with other configuration types in this crate, we put all our knobs
+    // in options so that we can distinguish between "default" and "not set."
+    // This makes it possible to easily combine multiple configurations
+    // without default values overwriting explicitly specified values. See the
+    // 'overwrite' method.
+    //
+    // For docs on the fields below, see the corresponding method setters.
+    anchored: Option<bool>,
+    accelerate: Option<bool>,
+    minimize: Option<bool>,
+    match_kind: Option<MatchKind>,
+    starts_for_each_pattern: Option<bool>,
+    byte_classes: Option<bool>,
+    unicode_word_boundary: Option<bool>,
+    quit: Option<ByteSet>,
+    dfa_size_limit: Option<Option<usize>>,
+    determinize_size_limit: Option<Option<usize>>,
+}
+
+#[cfg(feature = "alloc")]
+impl Config {
+    /// Return a new default dense DFA compiler configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Set whether matching must be anchored at the beginning of the input.
+    ///
+    /// When enabled, a match must begin at the start of a search. When
+    /// disabled, the DFA will act as if the pattern started with a
+    /// `(?s:.)*?`, which enables a match to appear anywhere.
+    ///
+    /// Note that if you want to run both anchored and unanchored
+    /// searches without building multiple automatons, you can enable the
+    /// [`Config::starts_for_each_pattern`] configuration instead. This will
+    /// permit unanchored any-pattern searches and pattern-specific anchored
+    /// searches. See the documentation for that configuration for an example.
+    ///
+    /// By default this is disabled.
+    ///
+    /// **WARNING:** this is subtly different than using a `^` at the start of
+    /// your regex. A `^` forces a regex to match exclusively at the start of
+    /// input, regardless of where you begin your search. In contrast, enabling
+    /// this option will allow your regex to match anywhere in your input,
+    /// but the match must start at the beginning of a search. (Most of the
+    /// higher level convenience search routines make "start of input" and
+    /// "start of search" equivalent, but some routines allow treating these as
+    /// orthogonal.)
+    ///
+    /// For example, consider the haystack `aba` and the following searches:
+    ///
+    /// 1. The regex `^a` is compiled with `anchored=false` and searches
+    ///    `aba` starting at position `2`. Since `^` requires the match to
+    ///    start at the beginning of the input and `2 > 0`, no match is found.
+    /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
+    ///    starting at position `2`. This reports a match at `[2, 3]` since
+    ///    the match starts where the search started. Since there is no `^`,
+    ///    there is no requirement for the match to start at the beginning of
+    ///    the input.
+    /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
+    ///    starting at position `1`. Since `b` corresponds to position `1` and
+    ///    since the regex is anchored, it finds no match.
+    /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
+    ///    starting at position `1`. Since the regex is neither anchored nor
+    ///    starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
+    ///    prefix that permits it to match anywhere. Thus, it reports a match
+    ///    at `[2, 3]`.
+    ///
+    /// # Example
+    ///
+    /// This demonstrates the differences between an anchored search and
+    /// a pattern that begins with `^` (as described in the above warning
+    /// message).
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+    ///
+    /// let haystack = "aba".as_bytes();
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().anchored(false)) // default
+    ///     .build(r"^a")?;
+    /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
+    /// // No match is found because 2 is not the beginning of the haystack,
+    /// // which is what ^ requires.
+    /// let expected = None;
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().anchored(true))
+    ///     .build(r"a")?;
+    /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
+    /// // An anchored search can still match anywhere in the haystack; it
+    /// // just must begin at the start of the search, which is '2' in this
+    /// // case.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().anchored(true))
+    ///     .build(r"a")?;
+    /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
+    /// // No match is found since we start searching at offset 1 which
+    /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
+    /// // is found.
+    /// let expected = None;
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().anchored(false)) // default
+    ///     .build(r"a")?;
+    /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
+    /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to
+    /// // the pattern. Even though the search starts at 'b', the 'match
+    /// // anything' prefix allows the search to match 'a'.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn anchored(mut self, yes: bool) -> Config {
+        self.anchored = Some(yes);
+        self
+    }
+
+    /// Enable state acceleration.
+    ///
+    /// When enabled, DFA construction will analyze each state to determine
+    /// whether it is eligible for simple acceleration. Acceleration typically
+    /// occurs when most of a state's transitions loop back to itself, leaving
+    /// only a select few bytes that will exit the state. When this occurs,
+    /// other routines like `memchr` can be used to look for those bytes which
+    /// may be much faster than traversing the DFA.
+    ///
+    /// Callers may elect to disable this if consistent performance is more
+    /// desirable than variable performance. Namely, acceleration can
+    /// sometimes make searching slower than it otherwise would be if the
+    /// transitions that leave accelerated states are traversed frequently.
+    ///
+    /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for
+    /// an example.
+    ///
+    /// This is enabled by default.
+    pub fn accelerate(mut self, yes: bool) -> Config {
+        self.accelerate = Some(yes);
+        self
+    }
+
+    /// Minimize the DFA.
+    ///
+    /// When enabled, the DFA built will be minimized such that it is as small
+    /// as possible.
+    ///
+    /// Whether one enables minimization or not depends on the types of costs
+    /// you're willing to pay and how much you care about its benefits.
+    /// In particular, minimization has worst case `O(n * k * log n)` time
+    /// and `O(k * n)` space, where `n` is the number of DFA states and `k` is
+    /// the alphabet size. In practice, minimization can be quite costly in
+    /// terms of both space and time, so it should only be done if you're
+    /// willing to wait longer to produce a DFA. In general, you might want a
+    /// minimal DFA in the following circumstances:
+    ///
+    /// 1. You would like to optimize for the size of the automaton. This can
+    ///    manifest in one of two ways. Firstly, if you're converting the
+    ///    DFA into Rust code (or a table embedded in the code), then a minimal
+    ///    DFA will translate into a corresponding reduction in code size, and
+    ///    thus, also the final compiled binary size. Secondly, if you are
+    ///    building many DFAs and putting them on the heap, you'll be able to
+    ///    fit more if they are smaller. Note though that building a minimal
+    ///    DFA itself requires additional space; you only realize the space
+    ///    savings once the minimal DFA is constructed (at which point, the
+    ///    space used for minimization is freed).
+    /// 2. You've observed that a smaller DFA results in faster match
+    ///    performance. Naively, this isn't guaranteed since there is no
+    ///    inherent difference between matching with a bigger-than-minimal
+    ///    DFA and a minimal DFA. However, a smaller DFA may make use of your
+    ///    CPU's cache more efficiently.
+    /// 3. You are trying to establish an equivalence between regular
+    ///    languages. The standard method for this is to build a minimal DFA
+    ///    for each language and then compare them. If the DFAs are equivalent
+    ///    (up to state renaming), then the languages are equivalent.
+    ///
+    /// Typically, minimization only makes sense as an offline process. That
+    /// is, one might minimize a DFA before serializing it to persistent
+    /// storage. In practical terms, minimization can take around an order of
+    /// magnitude more time than compiling the initial DFA via determinization.
+    ///
+    /// This option is disabled by default.
+    pub fn minimize(mut self, yes: bool) -> Config {
+        self.minimize = Some(yes);
+        self
+    }
+
+    /// Set the desired match semantics.
+    ///
+    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+    /// match semantics of Perl-like regex engines. That is, when multiple
+    /// patterns would match at the same leftmost position, the pattern that
+    /// appears first in the concrete syntax is chosen.
+    ///
+    /// Currently, the only other kind of match semantics supported is
+    /// [`MatchKind::All`]. This corresponds to classical DFA construction
+    /// where all possible matches are added to the DFA.
+    ///
+    /// Typically, `All` is used when one wants to execute an overlapping
+    /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+    /// sense to use `All` with the various "leftmost" find routines, since the
+    /// leftmost routines depend on the `LeftmostFirst` automata construction
+    /// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA
+    /// as a way to terminate the search and report a match. `LeftmostFirst`
+    /// also supports non-greedy matches using this strategy whereas `All`
+    /// does not.
+    ///
+    /// # Example: overlapping search
+    ///
+    /// This example shows the typical use of `MatchKind::All`, which is to
+    /// report overlapping matches.
+ /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, OverlappingState, dense}, + /// HalfMatch, MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: reverse automaton to find start of match + /// + /// Another example for using `MatchKind::All` is for constructing a + /// reverse automaton to find the start of a match. `All` semantics are + /// used for this in order to find the longest possible match, which + /// corresponds to the leftmost starting position. + /// + /// Note that if you need the starting position then + /// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for + /// you, so it's usually not necessary to do this yourself. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, MatchKind}; + /// + /// let haystack = "123foobar456".as_bytes(); + /// let pattern = r"[a-z]+"; + /// + /// let dfa_fwd = dense::DFA::new(pattern)?; + /// let dfa_rev = dense::Builder::new() + /// .configure(dense::Config::new() + /// .anchored(true) + /// .match_kind(MatchKind::All) + /// ) + /// .build(pattern)?; + /// let expected_fwd = HalfMatch::must(0, 9); + /// let expected_rev = HalfMatch::must(0, 3); + /// let got_fwd = dfa_fwd.find_leftmost_fwd(haystack)?.unwrap(); + /// // Here we don't specify the pattern to search for since there's only + /// // one pattern and we're doing a leftmost search. But if this were an + /// // overlapping search, you'd need to specify the pattern that matched + /// // in the forward direction. (Otherwise, you might wind up finding the + /// // starting position of a match of some other pattern.) That in turn + /// // requires building the reverse automaton with starts_for_each_pattern + /// // enabled. Indeed, this is what Regex does internally. + /// let got_rev = dfa_rev.find_leftmost_rev_at( + /// None, haystack, 0, got_fwd.offset(), + /// )?.unwrap(); + /// assert_eq!(expected_fwd, got_fwd); + /// assert_eq!(expected_rev, got_rev); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); + self + } + + /// Whether to compile a separate start state for each pattern in the + /// automaton. + /// + /// When enabled, a separate **anchored** start state is added for each + /// pattern in the DFA. When this start state is used, then the DFA will + /// only search for matches for the pattern specified, even if there are + /// other patterns in the DFA. 
+    ///
+    /// The main downside of this option is that it can potentially increase
+    /// the size of the DFA and/or increase the time it takes to build the
+    /// DFA.
+    ///
+    /// There are a few reasons one might want to enable this (it's disabled
+    /// by default):
+    ///
+    /// 1. When looking for the start of an overlapping match (using a
+    ///    reverse DFA), doing it correctly requires starting the reverse
+    ///    search using the starting state of the pattern that matched in the
+    ///    forward direction. Indeed, when building a
+    ///    [`Regex`](crate::dfa::regex::Regex), it will automatically enable
+    ///    this option when building the reverse DFA internally.
+    /// 2. When you want to use a DFA with multiple patterns both to search
+    ///    for matches of any pattern and to search for anchored matches of
+    ///    one particular pattern while using the same DFA. (Otherwise, you
+    ///    would need to compile a new DFA for each pattern.)
+    /// 3. Since the start states added for each pattern are anchored, if you
+    ///    compile an unanchored DFA with one pattern while also enabling this
+    ///    option, then you can use the same DFA to perform anchored or
+    ///    unanchored searches. The latter you get with the standard search
+    ///    APIs. The former you get from the various `_at` search methods that
+    ///    allow you to specify a pattern ID to search for.
+    ///
+    /// By default this is disabled.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this option to permit the same DFA to
+    /// run both anchored and unanchored searches for a single pattern.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch, PatternID,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().starts_for_each_pattern(true))
+    ///     .build(r"foo[0-9]+")?;
+    /// let haystack = b"quux foo123";
+    ///
+    /// // Here's a normal unanchored search. Notice that we use 'None' for
+    /// // the pattern ID. Since the DFA was built as an unanchored machine,
+    /// // it uses its default unanchored starting state.
+    /// let expected = HalfMatch::must(0, 11);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+    ///     None, None, haystack, 0, haystack.len(),
+    /// )?);
+    /// // But now if we explicitly specify the pattern to search ('0' being
+    /// // the only pattern in the DFA), then it will use the starting state
+    /// // for that specific pattern which is always anchored. Since the
+    /// // pattern doesn't have a match at the beginning of the haystack, we
+    /// // find nothing.
+    /// assert_eq!(None, dfa.find_leftmost_fwd_at(
+    ///     None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
+    /// )?);
+    /// // And finally, an anchored search is not the same as putting a '^'
+    /// // at the beginning of the pattern. An anchored search can only match
+    /// // at the beginning of the *search*, which we can change:
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+    ///     None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
+    /// )?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
+        self.starts_for_each_pattern = Some(yes);
+        self
+    }
+
+    /// Whether to attempt to shrink the size of the DFA's alphabet or not.
+    ///
+    /// This option is enabled by default and should never be disabled unless
+    /// one is debugging a generated DFA.
+    ///
+    /// When enabled, the DFA will use a map from all possible bytes to their
+    /// corresponding equivalence class.
+    /// Each equivalence class represents a set of bytes that does not
+    /// discriminate between a match and a non-match in the DFA. For example,
+    /// the pattern `[ab]+` has at least two equivalence classes: a set
+    /// containing `a` and `b` and a set containing every byte except for `a`
+    /// and `b`. `a` and `b` are in the same equivalence class because they
+    /// never discriminate between a match and a non-match.
+    ///
+    /// The advantage of this map is that the size of the transition table
+    /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to
+    /// `#states * k * sizeof(StateID)` where `k` is the number of equivalence
+    /// classes (rounded up to the nearest power of 2). As a result, total
+    /// space usage can decrease substantially. Moreover, since a smaller
+    /// alphabet is used, DFA compilation becomes faster as well.
+    ///
+    /// **WARNING:** This is only useful for debugging DFAs. Disabling this
+    /// does not yield any speed advantages. Namely, even when this is
+    /// disabled, a byte class map is still used while searching. The only
+    /// difference is that every byte will be forced into its own distinct
+    /// equivalence class. This is useful for debugging the actual generated
+    /// transitions because it lets one see the transitions defined on actual
+    /// bytes instead of the equivalence classes.
+    pub fn byte_classes(mut self, yes: bool) -> Config {
+        self.byte_classes = Some(yes);
+        self
+    }
+
+    /// Heuristically enable Unicode word boundaries.
+    ///
+    /// When set, this will attempt to implement Unicode word boundaries as if
+    /// they were ASCII word boundaries. This only works when the search input
+    /// is ASCII only. If a non-ASCII byte is observed while searching, then a
+    /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
+    ///
+    /// A possible alternative to enabling this option is to simply use an
+    /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
+    /// option is if you absolutely need Unicode support. This option lets one
+    /// use a fast search implementation (a DFA) for some potentially very
+    /// common cases, while providing the option to fall back to some other
+    /// regex engine to handle the general case when an error is returned.
+    ///
+    /// If the pattern provided has no Unicode word boundary in it, then this
+    /// option has no effect. (That is, quitting on a non-ASCII byte only
+    /// occurs when this option is enabled _and_ a Unicode word boundary is
+    /// present in the pattern.)
+    ///
+    /// This is almost equivalent to setting all non-ASCII bytes to be quit
+    /// bytes. The only difference is that this will cause non-ASCII bytes to
+    /// be quit bytes _only_ when a Unicode word boundary is present in the
+    /// pattern.
+    ///
+    /// When enabling this option, callers _must_ be prepared to handle
+    /// a [`MatchError`](crate::MatchError) error during search.
+    /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds
+    /// to using the `try_` suite of methods. Alternatively, if
+    /// callers can guarantee that their input is ASCII only, then a
+    /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
+    /// returned while searching.
+    ///
+    /// This is disabled by default.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to heuristically enable Unicode word boundaries
+    /// in a pattern. It also shows what happens when a search comes across a
+    /// non-ASCII byte.
+ /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// HalfMatch, MatchError, MatchKind, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().unicode_word_boundary(true)) + /// .build(r"\b[0-9]+\b")?; + /// + /// // The match occurs before the search ever observes the snowman + /// // character, so no error occurs. + /// let haystack = "foo 123 ☃".as_bytes(); + /// let expected = Some(HalfMatch::must(0, 7)); + /// let got = dfa.find_leftmost_fwd(haystack)?; + /// assert_eq!(expected, got); + /// + /// // Notice that this search fails, even though the snowman character + /// // occurs after the ending match offset. This is because search + /// // routines read one byte past the end of the search to account for + /// // look-around, and indeed, this is required here to determine whether + /// // the trailing \b matches. + /// let haystack = "foo 123☃".as_bytes(); + /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 }; + /// let got = dfa.find_leftmost_fwd(haystack); + /// assert_eq!(Err(expected), got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn unicode_word_boundary(mut self, yes: bool) -> Config { + // We have a separate option for this instead of just setting the + // appropriate quit bytes here because we don't want to set quit bytes + // for every regex. We only want to set them when the regex contains a + // Unicode word boundary. + self.unicode_word_boundary = Some(yes); + self + } + + /// Add a "quit" byte to the DFA. + /// + /// When a quit byte is seen during search time, then search will return + /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the + /// offset at which the search stopped. + /// + /// A quit byte will always overrule any other aspects of a regex. For + /// example, if the `x` byte is added as a quit byte and the regex `\w` is + /// used, then observing `x` will cause the search to quit immediately + /// despite the fact that `x` is in the `\w` class. + /// + /// This mechanism is primarily useful for heuristically enabling certain + /// features like Unicode word boundaries in a DFA. Namely, if the input + /// to search is ASCII, then a Unicode word boundary can be implemented + /// via an ASCII word boundary with no change in semantics. Thus, a DFA + /// can attempt to match a Unicode word boundary but give up as soon as it + /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes + /// to be quit bytes, then Unicode word boundaries will be permitted when + /// building DFAs. Of course, callers should enable + /// [`Config::unicode_word_boundary`] if they want this behavior instead. + /// (The advantage being that non-ASCII quit bytes will only be added if a + /// Unicode word boundary is in the pattern.) + /// + /// When enabling this option, callers _must_ be prepared to handle a + /// [`MatchError`](crate::MatchError) error during search. When using a + /// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the + /// `try_` suite of methods. + /// + /// By default, there are no quit bytes set. + /// + /// # Panics + /// + /// This panics if heuristic Unicode word boundaries are enabled and any + /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling + /// Unicode word boundaries requires setting every non-ASCII byte to a quit + /// byte. So if the caller attempts to undo any of that, then this will + /// panic. 
+    ///
+    /// # Example
+    ///
+    /// This example shows how to cause a search to terminate if it sees a
+    /// `\n` byte. This could be useful if, for example, you wanted to prevent
+    /// a user-supplied pattern from matching across a line boundary.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch, MatchError,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().quit(b'\n', true))
+    ///     .build(r"foo\p{any}+bar")?;
+    ///
+    /// let haystack = "foo\nbar".as_bytes();
+    /// // Normally this would produce a match, since \p{any} contains '\n'.
+    /// // But since we instructed the automaton to enter a quit state if a
+    /// // '\n' is observed, this produces a match error instead.
+    /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+    /// let got = dfa.find_leftmost_fwd(haystack).unwrap_err();
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn quit(mut self, byte: u8, yes: bool) -> Config {
+        if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
+            panic!(
+                "cannot set non-ASCII byte to be non-quit when \
+                 Unicode word boundaries are enabled"
+            );
+        }
+        if self.quit.is_none() {
+            self.quit = Some(ByteSet::empty());
+        }
+        if yes {
+            self.quit.as_mut().unwrap().add(byte);
+        } else {
+            self.quit.as_mut().unwrap().remove(byte);
+        }
+        self
+    }
+
+    /// Set a size limit on the total heap used by a DFA.
+    ///
+    /// This size limit is expressed in bytes and is applied during
+    /// determinization of an NFA into a DFA. If the heap usage of the DFA
+    /// itself (and only the DFA) exceeds this configured limit, then
+    /// determinization is stopped and an error is returned.
+    ///
+    /// This limit does not apply to auxiliary storage used during
+    /// determinization that isn't part of the generated DFA.
+    ///
+    /// This limit is only applied during determinization. Currently, there is
+    /// no way to postpone this check to after minimization if minimization
+    /// was enabled.
+    ///
+    /// The total limit on heap used during determinization is the sum of the
+    /// DFA and determinization size limits.
+    ///
+    /// The default is no limit.
+    ///
+    /// # Example
+    ///
+    /// This example shows a DFA that fails to build because of a configured
+    /// size limit. This particular example also serves as a cautionary tale
+    /// demonstrating just how big DFAs with large Unicode character classes
+    /// can get.
+    ///
+    /// ```
+    /// use regex_automata::dfa::{dense, Automaton};
+    ///
+    /// // 3MB isn't enough!
+    /// dense::Builder::new()
+    ///     .configure(dense::Config::new().dfa_size_limit(Some(3_000_000)))
+    ///     .build(r"\w{20}")
+    ///     .unwrap_err();
+    ///
+    /// // ... but 4MB probably is!
+    /// // (Note that DFA sizes aren't necessarily stable between releases.)
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().dfa_size_limit(Some(4_000_000)))
+    ///     .build(r"\w{20}")?;
+    /// let haystack = "A".repeat(20).into_bytes();
+    /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// While one needs a little more than 3MB to represent `\w{20}`, it
+    /// turns out that you only need a little more than 4KB to represent
+    /// `(?-u:\w{20})`. So only use Unicode if you need it!
+    pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config {
+        self.dfa_size_limit = Some(bytes);
+        self
+    }
+
+    /// Set a size limit on the total heap used by determinization.
+    ///
+    /// This size limit is expressed in bytes and is applied during
+    /// determinization of an NFA into a DFA. If the heap used for auxiliary
+    /// storage during determinization (memory that is not in the DFA but
+    /// necessary for building the DFA) exceeds this configured limit, then
+    /// determinization is stopped and an error is returned.
+    ///
+    /// This limit does not apply to heap used by the DFA itself.
+    ///
+    /// The total limit on heap used during determinization is the sum of the
+    /// DFA and determinization size limits.
+    ///
+    /// The default is no limit.
+    ///
+    /// # Example
+    ///
+    /// This example shows a DFA that fails to build because of a
+    /// configured size limit on the amount of heap space used by
+    /// determinization. This particular example complements the example for
+    /// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode
+    /// potentially make DFAs themselves big, but it also results in more
+    /// auxiliary storage during determinization. (Although the auxiliary
+    /// storage is still not as big as the DFA itself.)
+    ///
+    /// ```
+    /// use regex_automata::dfa::{dense, Automaton};
+    ///
+    /// // 300KB isn't enough!
+    /// dense::Builder::new()
+    ///     .configure(dense::Config::new()
+    ///         .determinize_size_limit(Some(300_000))
+    ///     )
+    ///     .build(r"\w{20}")
+    ///     .unwrap_err();
+    ///
+    /// // ... but 400KB probably is!
+    /// // (Note that auxiliary storage sizes aren't necessarily stable
+    /// // between releases.)
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new()
+    ///         .determinize_size_limit(Some(400_000))
+    ///     )
+    ///     .build(r"\w{20}")?;
+    /// let haystack = "A".repeat(20).into_bytes();
+    /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn determinize_size_limit(mut self, bytes: Option<usize>) -> Config {
+        self.determinize_size_limit = Some(bytes);
+        self
+    }
+
+    /// Returns whether this configuration has enabled anchored searches.
+    pub fn get_anchored(&self) -> bool {
+        self.anchored.unwrap_or(false)
+    }
+
+    /// Returns whether this configuration has enabled simple state
+    /// acceleration.
+    pub fn get_accelerate(&self) -> bool {
+        self.accelerate.unwrap_or(true)
+    }
+
+    /// Returns whether this configuration has enabled the expensive process
+    /// of minimizing a DFA.
+    pub fn get_minimize(&self) -> bool {
+        self.minimize.unwrap_or(false)
+    }
+
+    /// Returns the match semantics set in this configuration.
+    pub fn get_match_kind(&self) -> MatchKind {
+        self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+    }
+
+    /// Returns whether this configuration has enabled anchored starting states
+    /// for every pattern in the DFA.
+    pub fn get_starts_for_each_pattern(&self) -> bool {
+        self.starts_for_each_pattern.unwrap_or(false)
+    }
+
+    /// Returns whether this configuration has enabled byte classes or not.
+    /// This is typically a debugging oriented option, as disabling it confers
+    /// no speed benefit.
+    pub fn get_byte_classes(&self) -> bool {
+        self.byte_classes.unwrap_or(true)
+    }
+
+    /// Returns whether this configuration has enabled heuristic Unicode word
+    /// boundary support. When enabled, it is possible for a search to return
+    /// an error.
+    pub fn get_unicode_word_boundary(&self) -> bool {
+        self.unicode_word_boundary.unwrap_or(false)
+    }
+
+    /// Returns whether this configuration will instruct the DFA to enter a
+    /// quit state whenever the given byte is seen during a search.
+    /// When at least one byte has this enabled, it is possible for a search
+    /// to return an error.
+    pub fn get_quit(&self, byte: u8) -> bool {
+        self.quit.map_or(false, |q| q.contains(byte))
+    }
+
+    /// Returns the DFA size limit of this configuration if one was set.
+    /// The size limit is the total number of bytes on the heap that a DFA is
+    /// permitted to use. If the DFA exceeds this limit during construction,
+    /// then construction is stopped and an error is returned.
+    pub fn get_dfa_size_limit(&self) -> Option<usize> {
+        self.dfa_size_limit.unwrap_or(None)
+    }
+
+    /// Returns the determinization size limit of this configuration if one
+    /// was set. The size limit is the total number of bytes on the heap that
+    /// determinization is permitted to use. If determinization exceeds this
+    /// limit during construction, then construction is stopped and an error is
+    /// returned.
+    ///
+    /// This is different from the DFA size limit in that this only applies to
+    /// the auxiliary storage used during determinization. Once determinization
+    /// is complete, this memory is freed.
+    ///
+    /// The limit on the total heap memory used is the sum of the DFA and
+    /// determinization size limits.
+    pub fn get_determinize_size_limit(&self) -> Option<usize> {
+        self.determinize_size_limit.unwrap_or(None)
+    }
+
+    /// Overwrite the default configuration such that the options in `o` are
+    /// always used. If an option in `o` is not set, then the corresponding
+    /// option in `self` is used. If it's not set in `self` either, then it
+    /// remains not set.
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config {
+            anchored: o.anchored.or(self.anchored),
+            accelerate: o.accelerate.or(self.accelerate),
+            minimize: o.minimize.or(self.minimize),
+            match_kind: o.match_kind.or(self.match_kind),
+            starts_for_each_pattern: o
+                .starts_for_each_pattern
+                .or(self.starts_for_each_pattern),
+            byte_classes: o.byte_classes.or(self.byte_classes),
+            unicode_word_boundary: o
+                .unicode_word_boundary
+                .or(self.unicode_word_boundary),
+            quit: o.quit.or(self.quit),
+            dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
+            determinize_size_limit: o
+                .determinize_size_limit
+                .or(self.determinize_size_limit),
+        }
+    }
+}
+
+/// A builder for constructing a deterministic finite automaton from regular
+/// expressions.
+///
+/// This builder provides two main things:
+///
+/// 1. It provides a few different `build` routines for actually constructing
+/// a DFA from different kinds of inputs. The most convenient is
+/// [`Builder::build`], which builds a DFA directly from a pattern string. The
+/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
+/// from an NFA.
+/// 2. The builder permits configuring a number of things.
+/// [`Builder::configure`] is used with [`Config`] to configure aspects of
+/// the DFA and the construction process itself. [`Builder::syntax`] and
+/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
+/// construction, respectively. The syntax and thompson configurations only
+/// apply when building from a pattern string.
+///
+/// This builder always constructs a *single* DFA. As such, this builder
+/// can only be used to construct regexes that either detect the presence
+/// of a match or find the end location of a match. A single DFA cannot
+/// produce both the start and end of a match.
+/// For that information, use a
+/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured
+/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to
+/// use a DFA directly is if the end location of a match is enough for your use
+/// case. Namely, a `Regex` will construct two DFAs instead of one, since a
+/// second reverse DFA is needed to find the start of a match.
+///
+/// Note that if you want to build a sparse DFA, you must first build a dense
+/// DFA and convert it to a sparse DFA. There is no way to build a sparse
+/// DFA without first building a dense DFA.
+///
+/// # Example
+///
+/// This example shows how to build a minimized DFA that completely disables
+/// Unicode. That is:
+///
+/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
+/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
+/// (instead of any UTF-8 encoding of a Unicode scalar value except for
+/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
+/// * The pattern itself is permitted to match invalid UTF-8. For example,
+/// things like `[^a]` that match any byte except for `a` are permitted.
+/// * Unanchored patterns can search through invalid UTF-8. That is, for
+/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
+/// `(?s:.)*?`.
+///
+/// ```
+/// use regex_automata::{
+///     dfa::{Automaton, dense},
+///     nfa::thompson,
+///     HalfMatch, SyntaxConfig,
+/// };
+///
+/// let dfa = dense::Builder::new()
+///     .configure(dense::Config::new().minimize(false))
+///     .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+///     .thompson(thompson::Config::new().utf8(false))
+///     .build(r"foo[^b]ar.*")?;
+///
+/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
+/// let expected = Some(HalfMatch::must(0, 10));
+/// let got = dfa.find_leftmost_fwd(haystack)?;
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct Builder {
+    config: Config,
+    thompson: thompson::Builder,
+}
+
+#[cfg(feature = "alloc")]
+impl Builder {
+    /// Create a new dense DFA builder with the default configuration.
+    pub fn new() -> Builder {
+        Builder {
+            config: Config::default(),
+            thompson: thompson::Builder::new(),
+        }
+    }
+
+    /// Build a DFA from the given pattern.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build(&self, pattern: &str) -> Result<OwnedDFA, Error> {
+        self.build_many(&[pattern])
+    }
+
+    /// Build a DFA from the given patterns.
+    ///
+    /// When matches are returned, the pattern ID corresponds to the index of
+    /// the pattern in the slice given.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<OwnedDFA, Error> {
+        let nfa = self.thompson.build_many(patterns).map_err(Error::nfa)?;
+        self.build_from_nfa(&nfa)
+    }
+
+    /// Build a DFA from the given NFA.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to build a DFA if you already have an NFA in
+    /// hand.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     nfa::thompson,
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let haystack = "foo123bar".as_bytes();
+    ///
+    /// // This shows how to set non-default options for building an NFA.
+    /// let nfa = thompson::Builder::new()
+    ///     .configure(thompson::Config::new().shrink(false))
+    ///     .build(r"[0-9]+")?;
+    /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
+    /// let expected = Some(HalfMatch::must(0, 6));
+    /// let got = dfa.find_leftmost_fwd(haystack)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn build_from_nfa(
+        &self,
+        nfa: &thompson::NFA,
+    ) -> Result<OwnedDFA, Error> {
+        let mut quit = self.config.quit.unwrap_or(ByteSet::empty());
+        if self.config.get_unicode_word_boundary()
+            && nfa.has_word_boundary_unicode()
+        {
+            for b in 0x80..=0xFF {
+                quit.add(b);
+            }
+        }
+        let classes = if !self.config.get_byte_classes() {
+            // DFAs will always use the equivalence class map, but enabling
+            // this option is useful for debugging. Namely, this will cause all
+            // transitions to be defined over their actual bytes instead of an
+            // opaque equivalence class identifier. The former is much easier
+            // to grok as a human.
+            ByteClasses::singletons()
+        } else {
+            let mut set = nfa.byte_class_set().clone();
+            // It is important to distinguish any "quit" bytes from all other
+            // bytes. Otherwise, a non-quit byte may end up in the same class
+            // as a quit byte, and thus cause the DFA to stop when it
+            // shouldn't.
+            if !quit.is_empty() {
+                set.add_set(&quit);
+            }
+            set.byte_classes()
+        };
+
+        let mut dfa = DFA::initial(
+            classes,
+            nfa.pattern_len(),
+            self.config.get_starts_for_each_pattern(),
+        )?;
+        determinize::Config::new()
+            .anchored(self.config.get_anchored())
+            .match_kind(self.config.get_match_kind())
+            .quit(quit)
+            .dfa_size_limit(self.config.get_dfa_size_limit())
+            .determinize_size_limit(self.config.get_determinize_size_limit())
+            .run(nfa, &mut dfa)?;
+        if self.config.get_minimize() {
+            dfa.minimize();
+        }
+        if self.config.get_accelerate() {
+            dfa.accelerate();
+        }
+        Ok(dfa)
+    }
+
+    /// Apply the given dense DFA configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](crate::SyntaxConfig).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and
+    /// multi-line mode.
+    ///
+    /// These settings only apply when constructing a DFA directly from a
+    /// pattern.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        self.thompson.syntax(config);
+        self
+    }
+
+    /// Set the Thompson NFA configuration for this builder using
+    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+    ///
+    /// This permits setting things like whether the DFA should match the regex
+    /// in reverse or if additional time should be spent shrinking the size of
+    /// the NFA.
+    ///
+    /// These settings only apply when constructing a DFA directly from a
+    /// pattern.
+    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+        self.thompson.configure(config);
+        self
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl Default for Builder {
+    fn default() -> Builder {
+        Builder::new()
+    }
+}
+
+/// A convenience alias for an owned DFA. We use this particular instantiation
+/// a lot in this crate, so it's worth giving it a name. This instantiation
+/// is commonly used for mutable APIs on the DFA while building it. The main
+/// reason for making DFAs generic is no_std support, and more generally,
+/// making it possible to load a DFA from an arbitrary slice of bytes.
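+///
+/// For example, an owned DFA can be serialized with one of the `to_bytes_*`
+/// methods and later reloaded as a `DFA<&[u32]>` without allocating. The
+/// following is only a rough sketch of that workflow (see the serialization
+/// methods on `DFA` for the precise details of padding and alignment):
+///
+/// ```
+/// use regex_automata::dfa::{dense, Automaton};
+///
+/// let original = dense::DFA::new("foo[0-9]+")?;
+/// // The second element is the number of padding bytes added to the front
+/// // of the buffer in order to guarantee alignment.
+/// let (buf, pad) = original.to_bytes_native_endian();
+/// // Deserialization borrows the buffer instead of copying it.
+/// let (loaded, _) = dense::DFA::from_bytes(&buf[pad..])?;
+/// assert_eq!(
+///     original.find_leftmost_fwd(b"foo123")?,
+///     loaded.find_leftmost_fwd(b"foo123")?,
+/// );
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```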
+#[cfg(feature = "alloc")] +pub(crate) type OwnedDFA = DFA<Vec<u32>>; + +/// A dense table-based deterministic finite automaton (DFA). +/// +/// All dense DFAs have one or more start states, zero or more match states +/// and a transition table that maps the current state and the current byte +/// of input to the next state. A DFA can use this information to implement +/// fast searching. In particular, the use of a dense DFA generally makes the +/// trade off that match speed is the most valuable characteristic, even if +/// building the DFA may take significant time *and* space. (More concretely, +/// building a DFA takes time and space that is exponential in the size of the +/// pattern in the worst case.) As such, the processing of every byte of input +/// is done with a small constant number of operations that does not vary with +/// the pattern, its size or the size of the alphabet. If your needs don't line +/// up with this trade off, then a dense DFA may not be an adequate solution to +/// your problem. +/// +/// In contrast, a [`sparse::DFA`] makes the opposite +/// trade off: it uses less space but will execute a variable number of +/// instructions per byte at match time, which makes it slower for matching. +/// (Note that space usage is still exponential in the size of the pattern in +/// the worst case.) +/// +/// A DFA can be built using the default configuration via the +/// [`DFA::new`] constructor. Otherwise, one can +/// configure various aspects via [`dense::Builder`](Builder). +/// +/// A single DFA fundamentally supports the following operations: +/// +/// 1. Detection of a match. +/// 2. Location of the end of a match. +/// 3. In the case of a DFA with multiple patterns, which pattern matched is +/// reported as well. +/// +/// A notable absence from the above list of capabilities is the location of +/// the *start* of a match. In order to provide both the start and end of +/// a match, *two* DFAs are required. This functionality is provided by a +/// [`Regex`](crate::dfa::regex::Regex). +/// +/// # Type parameters +/// +/// A `DFA` has one type parameter, `T`, which is used to represent state IDs, +/// pattern IDs and accelerators. `T` is typically a `Vec<u32>` or a `&[u32]`. +/// +/// # The `Automaton` trait +/// +/// This type implements the [`Automaton`] trait, which means it can be used +/// for searching. For example: +/// +/// ``` +/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let expected = HalfMatch::must(0, 8); +/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct DFA<T> { + /// The transition table for this DFA. This includes the transitions + /// themselves, along with the stride, number of states and the equivalence + /// class mapping. + tt: TransitionTable<T>, + /// The set of starting state identifiers for this DFA. The starting state + /// IDs act as pointers into the transition table. The specific starting + /// state chosen for each search is dependent on the context at which the + /// search begins. + st: StartTable<T>, + /// The set of match states and the patterns that match for each + /// corresponding match state. + /// + /// This structure is technically only needed because of support for + /// multi-regexes. Namely, multi-regexes require answering not just whether + /// a match exists, but _which_ patterns match. 
So we need to store the + /// matching pattern IDs for each match state. We do this even when there + /// is only one pattern for the sake of simplicity. In practice, this uses + /// up very little space for the case of one pattern. + ms: MatchStates<T>, + /// Information about which states are "special." Special states are states + /// that are dead, quit, matching, starting or accelerated. For more info, + /// see the docs for `Special`. + special: Special, + /// The accelerators for this DFA. + /// + /// If a state is accelerated, then there exist only a small number of + /// bytes that can cause the DFA to leave the state. This permits searching + /// to use optimized routines to find those specific bytes instead of using + /// the transition table. + /// + /// All accelerated states exist in a contiguous range in the DFA's + /// transition table. See dfa/special.rs for more details on how states are + /// arranged. + accels: Accels<T>, +} + +#[cfg(feature = "alloc")] +impl OwnedDFA { + /// Parse the given regular expression using a default configuration and + /// return the corresponding DFA. + /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](Builder) to set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let dfa = dense::DFA::new("foo[0-9]+bar")?; + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new(pattern: &str) -> Result<OwnedDFA, Error> { + Builder::new().build(pattern) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding multi-DFA. + /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](Builder) to set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let expected = HalfMatch::must(1, 3); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<OwnedDFA, Error> { + Builder::new().build_many(patterns) + } +} + +#[cfg(feature = "alloc")] +impl OwnedDFA { + /// Create a new DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let dfa = dense::DFA::always_match()?; + /// + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<OwnedDFA, Error> { + let nfa = thompson::NFA::always_match(); + Builder::new().build_from_nfa(&nfa) + } + + /// Create a new DFA that never matches any input. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::{Automaton, dense}; + /// + /// let dfa = dense::DFA::never_match()?; + /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?); + /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<OwnedDFA, Error> { + let nfa = thompson::NFA::never_match(); + Builder::new().build_from_nfa(&nfa) + } + + /// Create an initial DFA with the given equivalence classes, pattern count + /// and whether anchored starting states are enabled for each pattern. An + /// initial DFA can be further mutated via determinization. + fn initial( + classes: ByteClasses, + pattern_count: usize, + starts_for_each_pattern: bool, + ) -> Result<OwnedDFA, Error> { + let start_pattern_count = + if starts_for_each_pattern { pattern_count } else { 0 }; + Ok(DFA { + tt: TransitionTable::minimal(classes), + st: StartTable::dead(start_pattern_count)?, + ms: MatchStates::empty(pattern_count), + special: Special::new(), + accels: Accels::empty(), + }) + } +} + +impl<T: AsRef<[u32]>> DFA<T> { + /// Cheaply return a borrowed version of this dense DFA. Specifically, + /// the DFA returned always uses `&[u32]` for its transition table. + pub fn as_ref(&self) -> DFA<&'_ [u32]> { + DFA { + tt: self.tt.as_ref(), + st: self.st.as_ref(), + ms: self.ms.as_ref(), + special: self.special, + accels: self.accels(), + } + } + + /// Return an owned version of this dense DFA. Specifically, the DFA + /// returned always uses `Vec<u32>` for its transition table. + /// + /// Effectively, this returns a dense DFA whose transition table lives on + /// the heap. + #[cfg(feature = "alloc")] + pub fn to_owned(&self) -> OwnedDFA { + DFA { + tt: self.tt.to_owned(), + st: self.st.to_owned(), + ms: self.ms.to_owned(), + special: self.special, + accels: self.accels().to_owned(), + } + } + + /// Returns true only if this DFA has starting states for each pattern. + /// + /// When a DFA has starting states for each pattern, then a search with the + /// DFA can be configured to only look for anchored matches of a specific + /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`] + /// can accept a non-None `pattern_id` if and only if this method returns + /// true. Otherwise, calling `find_earliest_fwd_at` will panic. + /// + /// Note that if the DFA has no patterns, this always returns false. + pub fn has_starts_for_each_pattern(&self) -> bool { + self.st.patterns > 0 + } + + /// Returns the total number of elements in the alphabet for this DFA. + /// + /// That is, this returns the total number of transitions that each state + /// in this DFA must have. Typically, a normal byte oriented DFA would + /// always have an alphabet size of 256, corresponding to the number of + /// unique values in a single byte. However, this implementation has two + /// peculiarities that impact the alphabet length: + /// + /// * Every state has a special "EOI" transition that is only followed + /// after the end of some haystack is reached. This EOI transition is + /// necessary to account for one byte of look-ahead when implementing + /// things like `\b` and `$`. + /// * Bytes are grouped into equivalence classes such that no two bytes in + /// the same class can distinguish a match from a non-match. For example, + /// in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the + /// same equivalence class. This leads to a massive space savings, as the + /// sketch below illustrates. 
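+ /// + /// As a quick illustration (a sketch, not part of the original + /// documentation), the effect of equivalence classes can be observed + /// directly: + /// + /// ``` + /// use regex_automata::dfa::dense::DFA; + /// + /// let dfa = DFA::new("^[a-z]+$")?; + /// // With byte classes enabled (the default), the alphabet is far + /// // smaller than the maximum of 257 (256 byte values plus EOI). + /// assert!(dfa.alphabet_len() < 257); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` 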
+ /// + /// Note though that the alphabet length does _not_ necessarily equal the + /// total stride space taken up by a single DFA state in the transition + /// table. Namely, for performance reasons, the stride is always the + /// smallest power of two that is greater than or equal to the alphabet + /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are + /// often more useful. The alphabet length is typically useful only for + /// informational purposes. + pub fn alphabet_len(&self) -> usize { + self.tt.alphabet_len() + } + + /// Returns the total stride for every state in this DFA, expressed as the + /// exponent of a power of 2. The stride is the amount of space each state + /// takes up in the transition table, expressed as a number of transitions. + /// (Unused transitions map to dead states.) + /// + /// The stride of a DFA is always equivalent to the smallest power of 2 + /// that is greater than or equal to the DFA's alphabet length. This + /// definition uses extra space, but permits faster translation between + /// premultiplied state identifiers and contiguous indices (by using shifts + /// instead of relying on integer division). + /// + /// For example, if the DFA's stride is 16 transitions, then its `stride2` + /// is `4` since `2^4 = 16`. + /// + /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) + /// while the maximum `stride2` value is `9` (corresponding to a stride of + /// `512`). The maximum is not `8` since the maximum alphabet size is `257` + /// when accounting for the special EOI transition. However, an alphabet + /// length of that size is exceptionally rare since the alphabet is shrunk + /// into equivalence classes. + pub fn stride2(&self) -> usize { + self.tt.stride2 + } + + /// Returns the total stride for every state in this DFA. This corresponds + /// to the total number of transitions used by each state in this DFA's + /// transition table. + /// + /// Please see [`DFA::stride2`] for more information. In particular, this + /// returns the stride as the number of transitions, whereas `stride2` + /// returns it as the exponent of a power of 2. + pub fn stride(&self) -> usize { + self.tt.stride() + } + + /// Returns the "universal" start state for this DFA. + /// + /// A universal start state occurs only when all of the starting states + /// for this DFA are precisely the same. This occurs when there are no + /// look-around assertions at the beginning (or end for a reverse DFA) of + /// the pattern. + /// + /// Using this as a starting state for a DFA without a universal starting + /// state has unspecified behavior. This condition is not checked, so the + /// caller must guarantee it themselves. + pub(crate) fn universal_start_state(&self) -> StateID { + // We choose 'NonWordByte' for no particular reason, other than + // the fact that this is the 'main' starting configuration used in + // determinization. But in essence, it doesn't really matter. + // + // Also, we might consider exposing this routine, but it seems + // a little tricky to use correctly. Maybe if we also expose a + // 'has_universal_start_state' method? + self.st.start(Start::NonWordByte, None) + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::<dense::DFA>()`. 
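+ /// + /// # Example + /// + /// A minimal sketch (not part of the original documentation) of checking + /// a DFA's heap usage: + /// + /// ``` + /// use regex_automata::dfa::dense::DFA; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// // A dense DFA trades memory for search speed, so even a small + /// // pattern occupies a non-trivial amount of heap memory. + /// assert!(dfa.memory_usage() > 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` 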
+ pub fn memory_usage(&self) -> usize { + self.tt.memory_usage() + + self.st.memory_usage() + + self.ms.memory_usage() + + self.accels.memory_usage() + } +} + +/// Routines for converting a dense DFA to other representations, such as +/// sparse DFAs or raw bytes suitable for persistent storage. +impl<T: AsRef<[u32]>> DFA<T> { + /// Convert this dense DFA to a sparse DFA. + /// + /// If a `StateID` is too small to represent all states in the sparse + /// DFA, then this returns an error. In most cases, if a dense DFA is + /// constructible with `StateID` then a sparse DFA will be as well. + /// However, it is not guaranteed. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// let dense = dense::DFA::new("foo[0-9]+")?; + /// let sparse = dense.to_sparse()?; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), sparse.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, Error> { + sparse::DFA::from_dense(self) + } + + /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian + /// format. Upon success, the `Vec<u8>` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs have been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec<u8>` starts at + /// an address that does not have the same alignment as `u32`. The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec<u8>`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_little_endian would work on a little endian target. + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) { + self.to_bytes::<bytes::LE>() + } + + /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian + /// format. Upon success, the `Vec<u8>` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs have been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec<u8>` starts at + /// an address that does not have the same alignment as `u32`. 
The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec<u8>`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // N.B. We use native endianness here to make the example work, but + /// // using to_bytes_big_endian would work on a big endian target. + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) { + self.to_bytes::<bytes::BE>() + } + + /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian + /// format. Upon success, the `Vec<u8>` and the initial padding length are + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs have been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// The padding returned is non-zero if the returned `Vec<u8>` starts at + /// an address that does not have the same alignment as `u32`. The padding + /// corresponds to the number of leading bytes written to the returned + /// `Vec<u8>`. + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling the DFA. For + /// example, if serialization and deserialization happen in the same + /// process or on the same machine. Otherwise, when serializing a DFA for + /// use in a portable environment, you'll almost certainly want to + /// serialize _both_ a little endian and a big endian version and then + /// load the correct one based on the target's configuration. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let (buf, _) = original_dfa.to_bytes_native_endian(); + /// // Even if buf has initial padding, DFA::from_bytes will automatically + /// // ignore it. + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] + pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) { + self.to_bytes::<bytes::NE>() + } + + /// The implementation of the public `to_bytes` serialization methods, + /// which is generic over endianness. 
+ #[cfg(feature = "alloc")] + fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) { + let len = self.write_to_len(); + let (mut buf, padding) = bytes::alloc_aligned_buffer::<u32>(len); + // This should always succeed since the only possible serialization + // error is providing a buffer that's too small, but we've ensured that + // `buf` is big enough here. + self.as_ref().write_to::<E>(&mut buf[padding..]).unwrap(); + (buf, padding) + } + + /// Serialize this DFA as raw bytes to the given slice, in little endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs have been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike the various `to_bytes_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_little_endian would work on a little endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_little_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.as_ref().write_to::<bytes::LE>(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in big endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs have been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Note that unlike the various `to_bytes_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. 
+ /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// // N.B. We use native endianness here to make the example work, but + /// // using write_to_big_endian would work on a big endian target. + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_big_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.as_ref().write_to::<bytes::BE>(dst) + } + + /// Serialize this DFA as raw bytes to the given slice, in native endian + /// format. Upon success, the total number of bytes written to `dst` is + /// returned. + /// + /// The written bytes are guaranteed to be deserialized correctly and + /// without errors in a semver compatible release of this crate by a + /// `DFA`'s deserialization APIs (assuming all other criteria for the + /// deserialization APIs have been satisfied): + /// + /// * [`DFA::from_bytes`] + /// * [`DFA::from_bytes_unchecked`] + /// + /// Generally speaking, native endian format should only be used when + /// you know that the target you're compiling the DFA for matches the + /// endianness of the target on which you're compiling the DFA. For + /// example, if serialization and deserialization happen in the same + /// process or on the same machine. Otherwise, when serializing a DFA for + /// use in a portable environment, you'll almost certainly want to + /// serialize _both_ a little endian and a big endian version and then + /// load the correct one based on the target's configuration. + /// + /// Note that unlike the various `to_bytes_*` routines, this does not write + /// any padding. Callers are responsible for handling alignment correctly. + /// + /// # Errors + /// + /// This returns an error if the given destination slice is not big enough + /// to contain the full serialized DFA. If an error occurs, then nothing + /// is written to `dst`. + /// + /// # Example + /// + /// This example shows how to serialize and deserialize a DFA without + /// dynamic memory allocation. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// // Create a 4KB buffer on the stack to store our serialized DFA. + /// let mut buf = [0u8; 4 * (1<<10)]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_native_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.as_ref().write_to::<bytes::NE>(dst) + } + + /// Return the total number of bytes required to serialize this DFA. + /// + /// This is useful for determining the size of the buffer required to pass + /// to one of the serialization routines: + /// + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// Passing a buffer smaller than the size returned by this method will + /// result in a serialization error. 
Serialization routines are guaranteed + /// to succeed when the buffer is big enough. + /// + /// # Example + /// + /// This example shows how to dynamically allocate enough room to serialize + /// a DFA. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let mut buf = vec![0; original_dfa.write_to_len()]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Note that this example isn't actually guaranteed to work! In + /// particular, if `buf` is not aligned to a 4-byte boundary, then the + /// `DFA::from_bytes` call will fail. If you need this to work, then you + /// either need to deal with adding some initial padding yourself, or use + /// one of the `to_bytes` methods, which will do it for you. + pub fn write_to_len(&self) -> usize { + bytes::write_label_len(LABEL) + + bytes::write_endianness_check_len() + + bytes::write_version_len() + + size_of::<u32>() // unused, intended for future flexibility + + self.tt.write_to_len() + + self.st.write_to_len() + + self.ms.write_to_len() + + self.special.write_to_len() + + self.accels.write_to_len() + } +} + +impl<'a> DFA<&'a [u32]> { + /// Safely deserialize a DFA with a specific state identifier + /// representation. Upon success, this returns both the deserialized DFA + /// and the number of bytes read from the given slice. Namely, the contents + /// of the slice beyond the DFA are not read. + /// + /// Deserializing a DFA using this routine will never allocate heap memory. + /// For safety purposes, the DFA's transition table will be verified such + /// that every transition points to a valid state. If this verification is + /// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which + /// will always execute in constant time. + /// + /// The bytes given must be generated by one of the serialization APIs + /// of a `DFA` using a semver compatible release of this crate. Those + /// include: + /// + /// * [`DFA::to_bytes_little_endian`] + /// * [`DFA::to_bytes_big_endian`] + /// * [`DFA::to_bytes_native_endian`] + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// The `to_bytes` methods allocate and return a `Vec<u8>` for you, along + /// with handling alignment correctly. The `write_to` methods do not + /// allocate and instead write to an existing slice (which may be on the + /// stack). Since deserialization always uses the native endianness of the + /// target platform, the serialization API you use should match the + /// endianness of the target platform. (It's often a good idea to generate + /// serialized DFAs for both forms of endianness and then load the correct + /// one based on endianness.) + /// + /// # Errors + /// + /// Generally speaking, it's easier to state the conditions in which an + /// error is _not_ returned. All of the following must be true: + /// + /// * The bytes given must be produced by one of the serialization APIs + /// on this DFA, as mentioned above. + /// * The endianness of the target platform matches the endianness used to + /// serialize the provided DFA. + /// * The slice given must have the same alignment as `u32`. 
+ /// + /// If any of the above are not true, then an error will be returned. + /// + /// # Panics + /// + /// This routine will never panic for any input. + /// + /// # Example + /// + /// This example shows how to serialize a DFA to raw bytes, deserialize it + /// and then use it for searching. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let (bytes, _) = initial.to_bytes_native_endian(); + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: dealing with alignment and padding + /// + /// In the above example, we used the `to_bytes_native_endian` method to + /// serialize a DFA, but we ignored part of its return value corresponding + /// to padding added to the beginning of the serialized DFA. This is OK + /// because deserialization will skip this initial padding. What matters + /// is that the address immediately following the padding has an alignment + /// that matches `u32`. That is, the following is an equivalent but + /// alternative way to write the above example: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// // Serialization returns the number of leading padding bytes added to + /// // the returned Vec<u8>. + /// let (bytes, pad) = initial.to_bytes_native_endian(); + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This padding is necessary because Rust's standard library does + /// not expose any safe and robust way of creating a `Vec<u8>` with a + /// guaranteed alignment other than 1. Now, in practice, the underlying + /// allocator is likely to provide a `Vec<u8>` that meets our alignment + /// requirements, which means `pad` is zero in practice most of the time. + /// + /// The purpose of exposing the padding like this is flexibility for the + /// caller. For example, if one wants to embed a serialized DFA into a + /// compiled program, then it's important to guarantee that it starts at a + /// `u32`-aligned address. The simplest way to do this is to discard the + /// padding bytes and set it up so that the serialized DFA itself begins at + /// a properly aligned address. We can show this in two parts. The first + /// part is serializing the DFA to a file: + /// + /// ```no_run + /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// + /// let (bytes, pad) = dfa.to_bytes_big_endian(); + /// // Write the contents of the DFA *without* the initial padding. + /// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?; + /// + /// // Do it again, but this time for little endian. + /// let (bytes, pad) = dfa.to_bytes_little_endian(); + /// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?; + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And now the second part is embedding the DFA into the compiled program + /// and deserializing it at runtime on first use. We use conditional + /// compilation to choose the correct endianness. 
+ /// + /// ```no_run + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// + /// type S = u32; + /// type DFA = dense::DFA<&'static [S]>; + /// + /// fn get_foo() -> &'static DFA { + /// use std::cell::Cell; + /// use std::mem::MaybeUninit; + /// use std::sync::Once; + /// + /// // This struct with a generic B is used to permit unsizing + /// // coercions, specifically, where B winds up being a [u8]. We also + /// // need repr(C) to guarantee that _align comes first, which forces + /// // a correct alignment. + /// #[repr(C)] + /// struct Aligned<B: ?Sized> { + /// _align: [S; 0], + /// bytes: B, + /// } + /// + /// # const _: &str = stringify! { + /// // This assignment is made possible (implicitly) via the + /// // CoerceUnsized trait. + /// static ALIGNED: &Aligned<[u8]> = &Aligned { + /// _align: [], + /// #[cfg(target_endian = "big")] + /// bytes: *include_bytes!("foo.bigendian.dfa"), + /// #[cfg(target_endian = "little")] + /// bytes: *include_bytes!("foo.littleendian.dfa"), + /// }; + /// # }; + /// # static ALIGNED: &Aligned<[u8]> = &Aligned { + /// # _align: [], + /// # bytes: [], + /// # }; + /// + /// struct Lazy(Cell<MaybeUninit<DFA>>); + /// // SAFETY: This is safe because DFA impls Sync. + /// unsafe impl Sync for Lazy {} + /// + /// static INIT: Once = Once::new(); + /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit())); + /// + /// INIT.call_once(|| { + /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes) + /// .expect("serialized DFA should be valid"); + /// // SAFETY: This is guaranteed to only execute once, and all + /// // we do with the pointer is write the DFA to it. + /// unsafe { + /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa); + /// } + /// }); + /// // SAFETY: DFA is guaranteed to be initialized via INIT and is + /// // stored in static memory. + /// unsafe { + /// let dfa = (*DFA.0.as_ptr()).as_ptr(); + /// std::mem::transmute::<*const DFA, &'static DFA>(dfa) + /// } + /// } + /// + /// let dfa = get_foo(); + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345")); + /// ``` + /// + /// Alternatively, consider using + /// [`lazy_static`](https://crates.io/crates/lazy_static) + /// or + /// [`once_cell`](https://crates.io/crates/once_cell), + /// which will guarantee safety for you. You will still need to use the + /// `Aligned` trick above to force correct alignment, but this is safe to + /// do and `from_bytes` will return an error if you get it wrong. + pub fn from_bytes( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> { + // SAFETY: This is safe because we validate the transition table, + // start state ID list and the match states below. If any validation + // fails, then we return an error. + let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; + dfa.tt.validate()?; + dfa.st.validate(&dfa.tt)?; + dfa.ms.validate(&dfa)?; + dfa.accels.validate()?; + // N.B. dfa.special doesn't have a way to do unchecked deserialization, + // so it has already been validated. + Ok((dfa, nread)) + } + + /// Deserialize a DFA with a specific state identifier representation in + /// constant time by omitting the verification of the validity of the + /// transition table and other data inside the DFA. + /// + /// This is just like [`DFA::from_bytes`], except it can potentially return + /// a DFA that exhibits undefined behavior if its transition table contains + /// invalid state identifiers. 
+ /// + /// This routine is useful if you need to deserialize a DFA cheaply + /// and cannot afford the transition table validation performed by + /// `from_bytes`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// + /// let initial = DFA::new("foo[0-9]+")?; + /// let (bytes, _) = initial.to_bytes_native_endian(); + /// // SAFETY: This is guaranteed to be safe since the bytes given come + /// // directly from a compatible serialization routine. + /// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub unsafe fn from_bytes_unchecked( + slice: &'a [u8], + ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> { + let mut nr = 0; + + nr += bytes::skip_initial_padding(slice); + bytes::check_alignment::<StateID>(&slice[nr..])?; + nr += bytes::read_label(&slice[nr..], LABEL)?; + nr += bytes::read_endianness_check(&slice[nr..])?; + nr += bytes::read_version(&slice[nr..], VERSION)?; + + let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?; + nr += size_of::<u32>(); + + let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + let (special, nread) = Special::from_bytes(&slice[nr..])?; + nr += nread; + special.validate_state_count(tt.count(), tt.stride2)?; + + let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?; + nr += nread; + + Ok((DFA { tt, st, ms, special, accels }, nr)) + } + + /// The implementation of the public `write_to` serialization methods, + /// which is generic over endianness. + /// + /// This is defined only for &[u32] to reduce binary size/compilation time. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("dense DFA")); + } + dst = &mut dst[..nwrite]; + + let mut nw = 0; + nw += bytes::write_label(LABEL, &mut dst[nw..])?; + nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?; + nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?; + nw += { + // Currently unused, intended for future flexibility + E::write_u32(0, &mut dst[nw..]); + size_of::<u32>() + }; + nw += self.tt.write_to::<E>(&mut dst[nw..])?; + nw += self.st.write_to::<E>(&mut dst[nw..])?; + nw += self.ms.write_to::<E>(&mut dst[nw..])?; + nw += self.special.write_to::<E>(&mut dst[nw..])?; + nw += self.accels.write_to::<E>(&mut dst[nw..])?; + Ok(nw) + } +} + +/// The following methods implement mutable routines on the internal +/// representation of a DFA. As such, we must fix the first type parameter to a +/// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We +/// can get away with this because these methods are internal to the crate and +/// are exclusively used during construction of the DFA. +#[cfg(feature = "alloc")] +impl OwnedDFA { + /// Add a start state of this DFA. + pub(crate) fn set_start_state( + &mut self, + index: Start, + pattern_id: Option<PatternID>, + id: StateID, + ) { + assert!(self.tt.is_valid(id), "invalid start state"); + self.st.set_start(index, pattern_id, id); + } + + /// Set the given transition to this DFA. 
Both the `from` and `to` states + /// must already exist. + pub(crate) fn set_transition( + &mut self, + from: StateID, + byte: alphabet::Unit, + to: StateID, + ) { + self.tt.set(from, byte, to); + } + + /// Add an empty state (a state where all transitions lead to a dead state) + /// and return its identifier. The identifier returned is guaranteed to + /// not point to any other existing state. + /// + /// If adding a state would exceed `StateID::LIMIT`, then this returns an + /// error. + pub(crate) fn add_empty_state(&mut self) -> Result<StateID, Error> { + self.tt.add_empty_state() + } + + /// Swap the two states given in the transition table. + /// + /// This routine does not do anything to check the correctness of this + /// swap. Callers must ensure that other states pointing to id1 and id2 are + /// updated appropriately. + pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) { + self.tt.swap(id1, id2); + } + + /// Truncate the states in this DFA to the given count. + /// + /// This routine does not do anything to check the correctness of this + /// truncation. Callers must ensure that other states pointing to truncated + /// states are updated appropriately. + pub(crate) fn truncate_states(&mut self, count: usize) { + self.tt.truncate(count); + } + + /// Return a mutable representation of the state corresponding to the given + /// id. This is useful for implementing routines that manipulate DFA states + /// (e.g., swapping states). + pub(crate) fn state_mut(&mut self, id: StateID) -> StateMut<'_> { + self.tt.state_mut(id) + } + + /// Minimize this DFA in place using Hopcroft's algorithm. + pub(crate) fn minimize(&mut self) { + Minimizer::new(self).run(); + } + + /// Updates the match state pattern ID map to use the one provided. + /// + /// This is useful when it's convenient to manipulate matching states + /// (and their corresponding pattern IDs) as a map. In particular, the + /// representation used by a DFA for this map is not amenable to mutation, + /// so if things need to be changed (like when shuffling states), it's + /// often easier to work with the map form. + pub(crate) fn set_pattern_map( + &mut self, + map: &BTreeMap<StateID, Vec<PatternID>>, + ) -> Result<(), Error> { + self.ms = self.ms.new_with_map(map)?; + Ok(()) + } + + /// Find states that have a small number of non-loop transitions and mark + /// them as candidates for acceleration during search. + pub(crate) fn accelerate(&mut self) { + // Dead and quit states can never be accelerated. + if self.state_count() <= 2 { + return; + } + + // Go through every state and record its accelerator, if possible. + let mut accels = BTreeMap::new(); + // Count the number of accelerated match, start and non-match/start + // states. + let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0); + for state in self.states() { + if let Some(accel) = state.accelerate(self.byte_classes()) { + accels.insert(state.id(), accel); + if self.is_match_state(state.id()) { + cmatch += 1; + } else if self.is_start_state(state.id()) { + cstart += 1; + } else { + assert!(!self.is_dead_state(state.id())); + assert!(!self.is_quit_state(state.id())); + cnormal += 1; + } + } + } + // If no states were able to be accelerated, then we're done. + if accels.is_empty() { + return; + } + let original_accels_len = accels.len(); + + // A remapper keeps track of state ID changes. Once we're done + // shuffling, the remapper is used to rewrite all transitions in the + // DFA based on the new positions of states. 
+ let mut remapper = Remapper::from_dfa(self); + + // As we swap states, if they are match states, we need to swap their + // pattern ID lists too (for multi-regexes). We do this by converting + // the lists to an easily swappable map, and then convert back to + // MatchStates once we're done. + let mut new_matches = self.ms.to_map(self); + + // There is at least one state that gets accelerated, so these are + // guaranteed to get set to sensible values below. + self.special.min_accel = StateID::MAX; + self.special.max_accel = StateID::ZERO; + let update_special_accel = + |special: &mut Special, accel_id: StateID| { + special.min_accel = cmp::min(special.min_accel, accel_id); + special.max_accel = cmp::max(special.max_accel, accel_id); + }; + + // Start by shuffling match states. Any match states that are + // accelerated get moved to the end of the match state range. + if cmatch > 0 && self.special.matches() { + // N.B. special.{min,max}_match do not need updating, since the + // range/number of match states does not change. Only the ordering + // of match states may change. + let mut next_id = self.special.max_match; + let mut cur_id = next_id; + while cur_id >= self.special.min_match { + if let Some(accel) = accels.remove(&cur_id) { + accels.insert(next_id, accel); + update_special_accel(&mut self.special, next_id); + + // No need to do any actual swapping for equivalent IDs. + if cur_id != next_id { + remapper.swap(self, cur_id, next_id); + + // Swap pattern IDs for match states. + let cur_pids = new_matches.remove(&cur_id).unwrap(); + let next_pids = new_matches.remove(&next_id).unwrap(); + new_matches.insert(cur_id, next_pids); + new_matches.insert(next_id, cur_pids); + } + next_id = self.tt.prev_state_id(next_id); + } + cur_id = self.tt.prev_state_id(cur_id); + } + } + + // This is where it gets tricky. Without acceleration, start states + // normally come right after match states. But we want accelerated + // states to be a single contiguous range (to make it very fast + // to determine whether a state *is* accelerated), while also keeping + // match and starting states as contiguous ranges for the same reason. + // So what we do here is shuffle states such that it looks like this: + // + // DQMMMMAAAAASSSSSSNNNNNNN + // | | + // |---------| + // accelerated states + // + // Where: + // D - dead state + // Q - quit state + // M - match state (may be accelerated) + // A - normal state that is accelerated + // S - start state (may be accelerated) + // N - normal state that is NOT accelerated + // + // We implement this by shuffling states, which is done by a sequence + // of pairwise swaps. We start by looking at all normal states to be + // accelerated. When we find one, we swap it with the earliest starting + // state, and then swap that with the earliest normal state. This + // preserves the contiguous property. + // + // Once we're done looking for accelerated normal states, now we look + // for accelerated starting states by moving them to the beginning + // of the starting state range (just like we moved accelerated match + // states to the end of the matching state range). + // + // For a more detailed/different perspective on this, see the docs + // in dfa/special.rs. + if cnormal > 0 { + // our next available starting and normal states for swapping. + let mut next_start_id = self.special.min_start; + let mut cur_id = self.from_index(self.state_count() - 1); + // This is guaranteed to exist since cnormal > 0. 
+ let mut next_norm_id = + self.tt.next_state_id(self.special.max_start); + while cur_id >= next_norm_id { + if let Some(accel) = accels.remove(&cur_id) { + remapper.swap(self, next_start_id, cur_id); + remapper.swap(self, next_norm_id, cur_id); + // Keep our accelerator map updated with new IDs if the + // states we swapped were also accelerated. + if let Some(accel2) = accels.remove(&next_norm_id) { + accels.insert(cur_id, accel2); + } + if let Some(accel2) = accels.remove(&next_start_id) { + accels.insert(next_norm_id, accel2); + } + accels.insert(next_start_id, accel); + update_special_accel(&mut self.special, next_start_id); + // Our start range shifts one to the right now. + self.special.min_start = + self.tt.next_state_id(self.special.min_start); + self.special.max_start = + self.tt.next_state_id(self.special.max_start); + next_start_id = self.tt.next_state_id(next_start_id); + next_norm_id = self.tt.next_state_id(next_norm_id); + } + // This is pretty tricky, but if our 'next_norm_id' state also + // happened to be accelerated, then the result is that it is + // now in the position of cur_id, so we need to consider it + // again. This loop is still guaranteed to terminate though, + // because when accels contains cur_id, we're guaranteed to + // increment next_norm_id even if cur_id remains unchanged. + if !accels.contains_key(&cur_id) { + cur_id = self.tt.prev_state_id(cur_id); + } + } + } + // Just like we did for match states, but we want to move accelerated + // start states to the beginning of the range instead of the end. + if cstart > 0 { + // N.B. special.{min,max}_start do not need updating, since the + // range/number of start states does not change at this point. Only + // the ordering of start states may change. + let mut next_id = self.special.min_start; + let mut cur_id = next_id; + while cur_id <= self.special.max_start { + if let Some(accel) = accels.remove(&cur_id) { + remapper.swap(self, cur_id, next_id); + accels.insert(next_id, accel); + update_special_accel(&mut self.special, next_id); + next_id = self.tt.next_state_id(next_id); + } + cur_id = self.tt.next_state_id(cur_id); + } + } + + // Remap all transitions in our DFA and assert some things. + remapper.remap(self); + // This unwrap is OK because acceleration never changes the number of + // match states or patterns in those match states. Since acceleration + // runs after the pattern map has been set at least once, we know that + // our match states cannot error. + self.set_pattern_map(&new_matches).unwrap(); + self.special.set_max(); + self.special.validate().expect("special state ranges should validate"); + self.special + .validate_state_count(self.state_count(), self.stride2()) + .expect( + "special state ranges should be consistent with state count", + ); + assert_eq!( + self.special.accel_len(self.stride()), + // We record the number of accelerated states initially detected + // since the accels map is itself mutated in the process above. + // If mutated incorrectly, its size may change, and thus can't be + // trusted as a source of truth of how many accelerated states we + // expected there to be. + original_accels_len, + "mismatch with expected number of accelerated states", + ); + + // And finally record our accelerators. We kept our accels map updated + // as we shuffled states above, so the accelerators should now + // correspond to a contiguous range in the state ID space. (Which we + // assert.) 
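+ // 'prev' tracks the previously recorded accelerated state so that the + // assert below can verify that the accelerated states do indeed occupy + // a contiguous range of state IDs. 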
+ let mut prev: Option<StateID> = None; + for (id, accel) in accels { + assert!(prev.map_or(true, |p| self.tt.next_state_id(p) == id)); + prev = Some(id); + self.accels.add(accel); + } + } + + /// Shuffle the states in this DFA so that starting states, match + /// states and accelerated states are all contiguous. + /// + /// See dfa/special.rs for more details. + pub(crate) fn shuffle( + &mut self, + mut matches: BTreeMap<StateID, Vec<PatternID>>, + ) -> Result<(), Error> { + // The determinizer always adds a quit state and it is always second. + self.special.quit_id = self.from_index(1); + // If all we have are the dead and quit states, then we're done and + // the DFA will never produce a match. + if self.state_count() <= 2 { + self.special.set_max(); + return Ok(()); + } + + // Collect all our start states into a convenient set and confirm there + // is no overlap with match states. In the classical DFA construction, + // start states can be match states. But because of look-around, we + // delay all matches by a byte, which prevents start states from being + // match states. + let mut is_start: BTreeSet<StateID> = BTreeSet::new(); + for (start_id, _, _) in self.starts() { + // While there's nothing theoretically wrong with setting a start + // state to a dead ID (indeed, it could be an optimization!), the + // shuffling code below assumes that start states aren't dead. If + // this assumption is violated, the dead state could be shuffled + // to a new location, which must never happen. So if we do want + // to allow start states to be dead, then this assert should be + // removed and the code below fixed. + // + // N.B. Minimization can cause start states to be dead, but that + // happens after states are shuffled, so it's OK. Also, start + // states are dead for the DFA that never matches anything, but + // in that case, there are no states to shuffle. + assert_ne!(start_id, DEAD, "start state cannot be dead"); + assert!( + !matches.contains_key(&start_id), + "{:?} is both a start and a match state, which is not allowed", + start_id, + ); + is_start.insert(start_id); + } + + // We implement shuffling by a sequence of pairwise swaps of states. + // Since we have a number of things referencing states via their + // IDs and swapping them changes their IDs, we need to record every + // swap we make so that we can remap IDs. The remapper handles this + // book-keeping for us. + let mut remapper = Remapper::from_dfa(self); + + // Shuffle matching states. + if matches.is_empty() { + self.special.min_match = DEAD; + self.special.max_match = DEAD; + } else { + // The determinizer guarantees that the first two states are the + // dead and quit states, respectively. We want our match states to + // come right after quit. + let mut next_id = self.from_index(2); + let mut new_matches = BTreeMap::new(); + self.special.min_match = next_id; + for (id, pids) in matches { + remapper.swap(self, next_id, id); + new_matches.insert(next_id, pids); + // If we swapped a start state, then update our set. + if is_start.contains(&next_id) { + is_start.remove(&next_id); + is_start.insert(id); + } + next_id = self.tt.next_state_id(next_id); + } + matches = new_matches; + self.special.max_match = cmp::max( + self.special.min_match, + self.tt.prev_state_id(next_id), + ); + } + + // Shuffle starting states. 
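+ // (Whether accelerated or not, start states land in a contiguous + // block immediately after the match states, or right after the quit + // state when there are no match states, per the layout described in + // dfa/special.rs.) 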
+ { + let mut next_id = self.from_index(2); + if self.special.matches() { + next_id = self.tt.next_state_id(self.special.max_match); + } + self.special.min_start = next_id; + for id in is_start { + remapper.swap(self, next_id, id); + next_id = self.tt.next_state_id(next_id); + } + self.special.max_start = cmp::max( + self.special.min_start, + self.tt.prev_state_id(next_id), + ); + } + + // Finally remap all transitions in our DFA. + remapper.remap(self); + self.set_pattern_map(&matches)?; + self.special.set_max(); + self.special.validate().expect("special state ranges should validate"); + self.special + .validate_state_count(self.state_count(), self.stride2()) + .expect( + "special state ranges should be consistent with state count", + ); + Ok(()) + } +} + +/// A variety of generic internal methods for accessing DFA internals. +impl<T: AsRef<[u32]>> DFA<T> { + /// Return the byte classes used by this DFA. + pub(crate) fn byte_classes(&self) -> &ByteClasses { + &self.tt.classes + } + + /// Return the info about special states. + pub(crate) fn special(&self) -> &Special { + &self.special + } + + /// Return the info about special states as a mutable borrow. + #[cfg(feature = "alloc")] + pub(crate) fn special_mut(&mut self) -> &mut Special { + &mut self.special + } + + /// Returns an iterator over all states in this DFA. + /// + /// This iterator yields a tuple for each state. The first element of the + /// tuple corresponds to a state's identifier, and the second element + /// corresponds to the state itself (comprised of its transitions). + pub(crate) fn states(&self) -> StateIter<'_, T> { + self.tt.states() + } + + /// Return the total number of states in this DFA. Every DFA has at least + /// 1 state, even the empty DFA. + pub(crate) fn state_count(&self) -> usize { + self.tt.count() + } + + /// Return an iterator over all pattern IDs for the given match state. + /// + /// If the given state is not a match state, then this panics. + #[cfg(feature = "alloc")] + pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] { + assert!(self.is_match_state(id)); + self.ms.pattern_id_slice(self.match_state_index(id)) + } + + /// Return the total number of pattern IDs for the given match state. + /// + /// If the given state is not a match state, then this panics. + pub(crate) fn match_pattern_len(&self, id: StateID) -> usize { + assert!(self.is_match_state(id)); + self.ms.pattern_len(self.match_state_index(id)) + } + + /// Returns the total number of patterns matched by this DFA. + pub(crate) fn pattern_count(&self) -> usize { + self.ms.patterns + } + + /// Returns a map from match state ID to a list of pattern IDs that match + /// in that state. + #[cfg(feature = "alloc")] + pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> { + self.ms.to_map(self) + } + + /// Returns the ID of the quit state for this DFA. + #[cfg(feature = "alloc")] + pub(crate) fn quit_id(&self) -> StateID { + self.from_index(1) + } + + /// Convert the given state identifier to the state's index. The state's + /// index corresponds to the position in which it appears in the transition + /// table. When a DFA is NOT premultiplied, then a state's identifier is + /// also its index. When a DFA is premultiplied, then a state's identifier + /// is equal to `index * stride`. This routine reverses that. + pub(crate) fn to_index(&self, id: StateID) -> usize { + self.tt.to_index(id) + } + + /// Convert the index of a state (in the range 0..self.state_count()) to + /// an actual state identifier. 
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ #[cfg(feature = "alloc")]
+ pub(crate) fn from_index(&self, index: usize) -> StateID {
+ self.tt.from_index(index)
+ }
+
+ /// Return the table of state IDs for this DFA's start states.
+ pub(crate) fn starts(&self) -> StartStateIter<'_> {
+ self.st.iter()
+ }
+
+ /// Returns the index of the match state for the given ID. If the
+ /// given ID does not correspond to a match state, then this may
+ /// panic or produce an incorrect result.
+ fn match_state_index(&self, id: StateID) -> usize {
+ debug_assert!(self.is_match_state(id));
+ // This is one of the places where we rely on the fact that match
+ // states are contiguous in the transition table. Namely, that the
+ // first match state ID always corresponds to dfa.special.min_match.
+ // From there, since we know the stride, we can compute the overall
+ // index of any match state given the match state's ID.
+ let min = self.special().min_match.as_usize();
+ // CORRECTNESS: We're allowed to produce an incorrect result or panic,
+ // so both the subtraction and the unchecked StateID construction is
+ // OK.
+ self.to_index(StateID::new_unchecked(id.as_usize() - min))
+ }
+
+ /// Returns the index of the accelerator state for the given ID. If the
+ /// given ID does not correspond to an accelerator state, then this may
+ /// panic or produce an incorrect result.
+ fn accelerator_index(&self, id: StateID) -> usize {
+ let min = self.special().min_accel.as_usize();
+ // CORRECTNESS: We're allowed to produce an incorrect result or panic,
+ // so both the subtraction and the unchecked StateID construction is
+ // OK.
+ self.to_index(StateID::new_unchecked(id.as_usize() - min))
+ }
+
+ /// Return the accelerators for this DFA.
+ fn accels(&self) -> Accels<&[u32]> {
+ self.accels.as_ref()
+ }
+
+ /// Return this DFA's transition table as a slice.
+ fn trans(&self) -> &[StateID] {
+ self.tt.table()
+ }
+}
+
+impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "dense::DFA(")?;
+ for state in self.states() {
+ fmt_state_indicator(f, self, state.id())?;
+ let id = if f.alternate() {
+ state.id().as_usize()
+ } else {
+ self.to_index(state.id())
+ };
+ write!(f, "{:06?}: ", id)?;
+ state.fmt(f)?;
+ write!(f, "\n")?;
+ }
+ writeln!(f, "")?;
+ for (i, (start_id, sty, pid)) in self.starts().enumerate() {
+ let id = if f.alternate() {
+ start_id.as_usize()
+ } else {
+ self.to_index(start_id)
+ };
+ if i % self.st.stride == 0 {
+ match pid {
+ None => writeln!(f, "START-GROUP(ALL)")?,
+ Some(pid) => {
+ writeln!(f, "START-GROUP(pattern: {:?})", pid)?
+ } + } + } + writeln!(f, " {:?} => {:06?}", sty, id)?; + } + if self.pattern_count() > 1 { + writeln!(f, "")?; + for i in 0..self.ms.count() { + let id = self.ms.match_state_id(self, i); + let id = if f.alternate() { + id.as_usize() + } else { + self.to_index(id) + }; + write!(f, "MATCH({:06?}): ", id)?; + for (i, &pid) in self.ms.pattern_id_slice(i).iter().enumerate() + { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?}", pid)?; + } + writeln!(f, "")?; + } + } + writeln!(f, "state count: {:?}", self.state_count())?; + writeln!(f, "pattern count: {:?}", self.pattern_count())?; + writeln!(f, ")")?; + Ok(()) + } +} + +unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> { + #[inline] + fn is_special_state(&self, id: StateID) -> bool { + self.special.is_special_state(id) + } + + #[inline] + fn is_dead_state(&self, id: StateID) -> bool { + self.special.is_dead_state(id) + } + + #[inline] + fn is_quit_state(&self, id: StateID) -> bool { + self.special.is_quit_state(id) + } + + #[inline] + fn is_match_state(&self, id: StateID) -> bool { + self.special.is_match_state(id) + } + + #[inline] + fn is_start_state(&self, id: StateID) -> bool { + self.special.is_start_state(id) + } + + #[inline] + fn is_accel_state(&self, id: StateID) -> bool { + self.special.is_accel_state(id) + } + + #[inline] + fn next_state(&self, current: StateID, input: u8) -> StateID { + let input = self.byte_classes().get(input); + let o = current.as_usize() + usize::from(input); + self.trans()[o] + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID { + let input = self.byte_classes().get_unchecked(input); + let o = current.as_usize() + usize::from(input); + *self.trans().get_unchecked(o) + } + + #[inline] + fn next_eoi_state(&self, current: StateID) -> StateID { + let eoi = self.byte_classes().eoi().as_usize(); + let o = current.as_usize() + eoi; + self.trans()[o] + } + + #[inline] + fn pattern_count(&self) -> usize { + self.ms.patterns + } + + #[inline] + fn match_count(&self, id: StateID) -> usize { + self.match_pattern_len(id) + } + + #[inline] + fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { + // This is an optimization for the very common case of a DFA with a + // single pattern. This conditional avoids a somewhat more costly path + // that finds the pattern ID from the state machine, which requires + // a bit of slicing/pointer-chasing. This optimization tends to only + // matter when matches are frequent. + if self.ms.patterns == 1 { + return PatternID::ZERO; + } + let state_index = self.match_state_index(id); + self.ms.pattern_id(state_index, match_index) + } + + #[inline] + fn start_state_forward( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + let index = Start::from_position_fwd(bytes, start, end); + self.st.start(index, pattern_id) + } + + #[inline] + fn start_state_reverse( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + let index = Start::from_position_rev(bytes, start, end); + self.st.start(index, pattern_id) + } + + #[inline(always)] + fn accelerator(&self, id: StateID) -> &[u8] { + if !self.is_accel_state(id) { + return &[]; + } + self.accels.needles(self.accelerator_index(id)) + } +} + +/// The transition table portion of a dense DFA. +/// +/// The transition table is the core part of the DFA in that it describes how +/// to move from one state to another based on the input sequence observed. 
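+///
+/// As an illustrative sketch (not an API of this module), a single search
+/// step over this representation looks roughly like:
+///
+///   let class = classes.get(input_byte);
+///   let next_id = table[current_id + class as usize];
+///
+/// where `current_id` is a premultiplied state ID, as described below.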
+#[derive(Clone)] +pub(crate) struct TransitionTable<T> { + /// A contiguous region of memory representing the transition table in + /// row-major order. The representation is dense. That is, every state + /// has precisely the same number of transitions. The maximum number of + /// transitions per state is 257 (256 for each possible byte value, plus 1 + /// for the special EOI transition). If a DFA has been instructed to use + /// byte classes (the default), then the number of transitions is usually + /// substantially fewer. + /// + /// In practice, T is either `Vec<u32>` or `&[u32]`. + table: T, + /// A set of equivalence classes, where a single equivalence class + /// represents a set of bytes that never discriminate between a match + /// and a non-match in the DFA. Each equivalence class corresponds to a + /// single character in this DFA's alphabet, where the maximum number of + /// characters is 257 (each possible value of a byte plus the special + /// EOI transition). Consequently, the number of equivalence classes + /// corresponds to the number of transitions for each DFA state. Note + /// though that the *space* used by each DFA state in the transition table + /// may be larger. The total space used by each DFA state is known as the + /// stride. + /// + /// The only time the number of equivalence classes is fewer than 257 is if + /// the DFA's kind uses byte classes (which is the default). Equivalence + /// classes should generally only be disabled when debugging, so that + /// the transitions themselves aren't obscured. Disabling them has no + /// other benefit, since the equivalence class map is always used while + /// searching. In the vast majority of cases, the number of equivalence + /// classes is substantially smaller than 257, particularly when large + /// Unicode classes aren't used. + classes: ByteClasses, + /// The stride of each DFA state, expressed as a power-of-two exponent. + /// + /// The stride of a DFA corresponds to the total amount of space used by + /// each DFA state in the transition table. This may be bigger than the + /// size of a DFA's alphabet, since the stride is always the smallest + /// power of two greater than or equal to the alphabet size. + /// + /// While this wastes space, this avoids the need for integer division + /// to convert between premultiplied state IDs and their corresponding + /// indices. Instead, we can use simple bit-shifts. + /// + /// See the docs for the `stride2` method for more details. + /// + /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) + /// while the maximum `stride2` value is `9` (corresponding to a stride of + /// `512`). The maximum is not `8` since the maximum alphabet size is `257` + /// when accounting for the special EOI transition. However, an alphabet + /// length of that size is exceptionally rare since the alphabet is shrunk + /// into equivalence classes. + stride2: usize, +} + +impl<'a> TransitionTable<&'a [u32]> { + /// Deserialize a transition table starting at the beginning of `slice`. + /// Upon success, return the total number of bytes read along with the + /// transition table. + /// + /// If there was a problem deserializing any part of the transition table, + /// then this returns an error. Notably, if the given slice does not have + /// the same alignment as `StateID`, then this will return an error (among + /// other possible errors). + /// + /// This is guaranteed to execute in constant time. 
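+ ///
+ /// For reference, the layout read here is, in order: the state count (a
+ /// u32), the stride2 value (a u32), the byte class map, and then
+ /// `count << stride2` transitions, each serialized as a u32 state ID.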
+ ///
+ /// # Safety
+ ///
+ /// This routine is not safe because it does not check the validity of the
+ /// transition table itself. In particular, the transition table can be
+ /// quite large, so checking its validity can be somewhat expensive. An
+ /// invalid transition table is not safe because other code may rely on the
+ /// transition table being correct (such as explicit bounds check elision).
+ /// Therefore, an invalid transition table can lead to undefined behavior.
+ ///
+ /// Callers that use this function must either pass on the safety invariant
+ /// or guarantee that the bytes given contain a valid transition table.
+ /// This guarantee is upheld by the bytes written by `write_to`.
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ let (count, nr) = bytes::try_read_u32_as_usize(slice, "state count")?;
+ slice = &slice[nr..];
+
+ let (stride2, nr) = bytes::try_read_u32_as_usize(slice, "stride2")?;
+ slice = &slice[nr..];
+
+ let (classes, nr) = ByteClasses::from_bytes(slice)?;
+ slice = &slice[nr..];
+
+ // The alphabet length (determined by the byte class map) cannot be
+ // bigger than the stride (total space used by each DFA state).
+ if stride2 > 9 {
+ return Err(DeserializeError::generic(
+ "dense DFA has invalid stride2 (too big)",
+ ));
+ }
+ // It also cannot be zero, since even a DFA that never matches anything
+ // has a non-zero number of states with at least two equivalence
+ // classes: one for all 256 byte values and another for the EOI
+ // sentinel.
+ if stride2 < 1 {
+ return Err(DeserializeError::generic(
+ "dense DFA has invalid stride2 (too small)",
+ ));
+ }
+ // This is OK since 1 <= stride2 <= 9.
+ let stride =
+ 1usize.checked_shl(u32::try_from(stride2).unwrap()).unwrap();
+ if classes.alphabet_len() > stride {
+ return Err(DeserializeError::generic(
+ "alphabet size cannot be bigger than transition table stride",
+ ));
+ }
+
+ let trans_count =
+ bytes::shl(count, stride2, "dense table transition count")?;
+ let table_bytes_len = bytes::mul(
+ trans_count,
+ StateID::SIZE,
+ "dense table state byte count",
+ )?;
+ bytes::check_slice_len(slice, table_bytes_len, "transition table")?;
+ bytes::check_alignment::<StateID>(slice)?;
+ let table_bytes = &slice[..table_bytes_len];
+ slice = &slice[table_bytes_len..];
+ // SAFETY: Since StateID is always representable as a u32, all we need
+ // to do is ensure that we have the proper length and alignment. We've
+ // checked both above, so the cast below is safe.
+ //
+ // N.B. This is the only not-safe code in this function, so we mark
+ // it explicitly to call it out, even though it is technically
+ // superfluous.
+ #[allow(unused_unsafe)]
+ let table = unsafe {
+ core::slice::from_raw_parts(
+ table_bytes.as_ptr() as *const u32,
+ trans_count,
+ )
+ };
+ let tt = TransitionTable { table, classes, stride2 };
+ Ok((tt, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl TransitionTable<Vec<u32>> {
+ /// Create a minimal transition table with just two states: a dead state
+ /// and a quit state. The alphabet length and stride of the transition
+ /// table are determined by the given set of equivalence classes.
+ fn minimal(classes: ByteClasses) -> TransitionTable<Vec<u32>> {
+ let mut tt = TransitionTable {
+ table: vec![],
+ classes,
+ stride2: classes.stride2(),
+ };
+ // Two states, regardless of alphabet size, can always fit into u32.
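+ // Illustrative sketch (hypothetical): with stride2 == 2 (a stride of
+ // 4), the two calls below leave `table` as eight zeros, with the dead
+ // state occupying table[0..4] (ID 0) and the quit state occupying
+ // table[4..8] (ID 4). Every transition points at the dead state.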
+ tt.add_empty_state().unwrap(); // dead state
+ tt.add_empty_state().unwrap(); // quit state
+ tt
+ }
+
+ /// Set a transition in this table. Both the `from` and `to` states must
+ /// already exist, otherwise this panics. `unit` should correspond to the
+ /// transition out of `from` that is set to `to`.
+ fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) {
+ assert!(self.is_valid(from), "invalid 'from' state");
+ assert!(self.is_valid(to), "invalid 'to' state");
+ self.table[from.as_usize() + self.classes.get_by_unit(unit)] =
+ to.as_u32();
+ }
+
+ /// Add an empty state (a state where all transitions lead to a dead state)
+ /// and return its identifier. The identifier returned is guaranteed to
+ /// not point to any other existing state.
+ ///
+ /// If adding a state would exhaust the state identifier space, then this
+ /// returns an error.
+ fn add_empty_state(&mut self) -> Result<StateID, Error> {
+ // Normally, to get a fresh state identifier, we would just
+ // take the index of the next state added to the transition
+ // table. However, we actually perform an optimization here
+ // that premultiplies state IDs by the stride, such that they
+ // point immediately at the beginning of their transitions in
+ // the transition table. This avoids an extra multiplication
+ // instruction for state lookup at search time.
+ //
+ // Using premultiplied identifiers means that instead of your matching
+ // loop looking something like this:
+ //
+ // state = dfa.start
+ // for byte in haystack:
+ // next = dfa.transitions[state * stride + byte]
+ // if dfa.is_match(next):
+ // return true
+ // return false
+ //
+ // it can instead look like this:
+ //
+ // state = dfa.start
+ // for byte in haystack:
+ // next = dfa.transitions[state + byte]
+ // if dfa.is_match(next):
+ // return true
+ // return false
+ //
+ // In other words, we save a multiplication instruction in the
+ // critical path. This turns out to be a decent performance win.
+ // The cost of using premultiplied state ids is that they can
+ // require a bigger state id representation. (And they also make
+ // the code a bit more complex, especially during minimization and
+ // when reshuffling states, as one needs to convert back and forth
+ // between state IDs and state indices.)
+ //
+ // To do this, we simply take the index of the state into the
+ // entire transition table, rather than the index of the state
+ // itself. e.g., If the stride is 64, then the ID of the 3rd state
+ // is 128, not 2.
+ let next = self.table.len();
+ let id = StateID::new(next).map_err(|_| Error::too_many_states())?;
+ self.table.extend(iter::repeat(0).take(self.stride()));
+ Ok(id)
+ }
+
+ /// Swap the two states given in this transition table.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// swap. Callers must ensure that other states pointing to id1 and id2 are
+ /// updated appropriately.
+ ///
+ /// Both id1 and id2 must point to valid states, otherwise this panics.
+ fn swap(&mut self, id1: StateID, id2: StateID) {
+ assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1);
+ assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2);
+ // We only need to swap the parts of the state that are used. So if the
+ // stride is 64, but the alphabet length is only 33, then we save a lot
+ // of work.
+ for b in 0..self.classes.alphabet_len() {
+ self.table.swap(id1.as_usize() + b, id2.as_usize() + b);
+ }
+ }
+
+ /// Truncate the states in this transition table to the given count.
+ /// + /// This routine does not do anything to check the correctness of this + /// truncation. Callers must ensure that other states pointing to truncated + /// states are updated appropriately. + fn truncate(&mut self, count: usize) { + self.table.truncate(count << self.stride2); + } + + /// Return a mutable representation of the state corresponding to the given + /// id. This is useful for implementing routines that manipulate DFA states + /// (e.g., swapping states). + fn state_mut(&mut self, id: StateID) -> StateMut<'_> { + let alphabet_len = self.alphabet_len(); + let i = id.as_usize(); + StateMut { + id, + stride2: self.stride2, + transitions: &mut self.table_mut()[i..i + alphabet_len], + } + } +} + +impl<T: AsRef<[u32]>> TransitionTable<T> { + /// Writes a serialized form of this transition table to the buffer given. + /// If the buffer is too small, then an error is returned. To determine + /// how big the buffer must be, use `write_to_len`. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("transition table")); + } + dst = &mut dst[..nwrite]; + + // write state count + // Unwrap is OK since number of states is guaranteed to fit in a u32. + E::write_u32(u32::try_from(self.count()).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write state stride (as power of 2) + // Unwrap is OK since stride2 is guaranteed to be <= 9. + E::write_u32(u32::try_from(self.stride2).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + + // write byte class map + let n = self.classes.write_to(dst)?; + dst = &mut dst[n..]; + + // write actual transitions + for &sid in self.table() { + let n = bytes::write_state_id::<E>(sid, &mut dst); + dst = &mut dst[n..]; + } + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. + fn write_to_len(&self) -> usize { + size_of::<u32>() // state count + + size_of::<u32>() // stride2 + + self.classes.write_to_len() + + (self.table().len() * StateID::SIZE) + } + + /// Validates that every state ID in this transition table is valid. + /// + /// That is, every state ID can be used to correctly index a state in this + /// table. + fn validate(&self) -> Result<(), DeserializeError> { + for state in self.states() { + for (_, to) in state.transitions() { + if !self.is_valid(to) { + return Err(DeserializeError::generic( + "found invalid state ID in transition table", + )); + } + } + } + Ok(()) + } + + /// Converts this transition table to a borrowed value. + fn as_ref(&self) -> TransitionTable<&'_ [u32]> { + TransitionTable { + table: self.table.as_ref(), + classes: self.classes.clone(), + stride2: self.stride2, + } + } + + /// Converts this transition table to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> TransitionTable<Vec<u32>> { + TransitionTable { + table: self.table.as_ref().to_vec(), + classes: self.classes.clone(), + stride2: self.stride2, + } + } + + /// Return the state for the given ID. If the given ID is not valid, then + /// this panics. + fn state(&self, id: StateID) -> State<'_> { + assert!(self.is_valid(id)); + + let i = id.as_usize(); + State { + id, + stride2: self.stride2, + transitions: &self.table()[i..i + self.alphabet_len()], + } + } + + /// Returns an iterator over all states in this transition table. + /// + /// This iterator yields a tuple for each state. 
The first element of the
+ /// tuple corresponds to a state's identifier, and the second element
+ /// corresponds to the state itself (comprised of its transitions).
+ fn states(&self) -> StateIter<'_, T> {
+ StateIter {
+ tt: self,
+ it: self.table().chunks(self.stride()).enumerate(),
+ }
+ }
+
+ /// Convert a state identifier to an index to a state (in the range
+ /// 0..self.count()).
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ ///
+ /// If the given ID is not valid, then this may panic or produce an
+ /// incorrect index.
+ fn to_index(&self, id: StateID) -> usize {
+ id.as_usize() >> self.stride2
+ }
+
+ /// Convert an index to a state (in the range 0..self.count()) to an actual
+ /// state identifier.
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ ///
+ /// If the given index is not in the specified range, then this may panic
+ /// or produce an incorrect state ID.
+ fn from_index(&self, index: usize) -> StateID {
+ // CORRECTNESS: If the given index is not valid, then it is not
+ // required for this to panic or return a valid state ID.
+ StateID::new_unchecked(index << self.stride2)
+ }
+
+ /// Returns the state ID for the state immediately following the one given.
+ ///
+ /// This does not check whether the state ID returned is invalid. In fact,
+ /// if the state ID given is the last state in this DFA, then the state ID
+ /// returned is guaranteed to be invalid.
+ #[cfg(feature = "alloc")]
+ fn next_state_id(&self, id: StateID) -> StateID {
+ self.from_index(self.to_index(id).checked_add(1).unwrap())
+ }
+
+ /// Returns the state ID for the state immediately preceding the one given.
+ ///
+ /// If the dead state ID (which is zero) is given, then this panics.
+ #[cfg(feature = "alloc")]
+ fn prev_state_id(&self, id: StateID) -> StateID {
+ self.from_index(self.to_index(id).checked_sub(1).unwrap())
+ }
+
+ /// Returns the table as a slice of state IDs.
+ fn table(&self) -> &[StateID] {
+ let integers = self.table.as_ref();
+ // SAFETY: This is safe because StateID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts(
+ integers.as_ptr() as *const StateID,
+ integers.len(),
+ )
+ }
+ }
+
+ /// Returns the total number of states in this transition table.
+ ///
+ /// Note that a DFA always has at least two states: the dead and quit
+ /// states. In particular, the dead state always has ID 0 and is
+ /// correspondingly always the first state. The dead state is never a match
+ /// state.
+ fn count(&self) -> usize {
+ self.table().len() >> self.stride2
+ }
+
+ /// Returns the total stride for every state in this DFA. This corresponds
+ /// to the total number of transitions used by each state in this DFA's
+ /// transition table.
+ fn stride(&self) -> usize {
+ 1 << self.stride2
+ }
+
+ /// Returns the total number of elements in the alphabet for this
+ /// transition table. This is always less than or equal to `self.stride()`.
+ /// It is only equal when the alphabet length is a power of 2. Otherwise,
+ /// it is always strictly less.
+ fn alphabet_len(&self) -> usize {
+ self.classes.alphabet_len()
+ }
+
+ /// Returns true if and only if the given state ID is valid for this
+ /// transition table. Validity in this context means that the given ID can
+ /// be used as a valid offset with `self.stride()` to index this transition
+ /// table.
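+ ///
+ /// For example (hypothetical values): with a stride of 4 and three
+ /// states, the valid IDs are exactly 0, 4 and 8.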
+ fn is_valid(&self, id: StateID) -> bool {
+ let id = id.as_usize();
+ id < self.table().len() && id % self.stride() == 0
+ }
+
+ /// Return the memory usage, in bytes, of this transition table.
+ ///
+ /// This does not include the size of a `TransitionTable` value itself.
+ fn memory_usage(&self) -> usize {
+ self.table().len() * StateID::SIZE
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u32]>> TransitionTable<T> {
+ /// Returns the table as a mutable slice of state IDs.
+ fn table_mut(&mut self) -> &mut [StateID] {
+ let integers = self.table.as_mut();
+ // SAFETY: This is safe because StateID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts_mut(
+ integers.as_mut_ptr() as *mut StateID,
+ integers.len(),
+ )
+ }
+ }
+}
+
+/// The set of all possible starting states in a DFA.
+///
+/// The set of starting states corresponds to the possible choices one can make
+/// in terms of starting a DFA. That is, before following the first transition,
+/// you first need to select the state that you start in.
+///
+/// Normally, a DFA converted from an NFA that has a single starting state
+/// would itself just have one starting state. However, our support for
+/// look-around generally requires more starting states. The correct starting
+/// state is chosen based on certain properties of the position at which we
+/// begin our search.
+///
+/// Before listing those properties, we first must define two terms:
+///
+/// * `haystack` - The bytes over which the search is executed. The search
+/// always starts at the beginning of `haystack` and ends before or at the
+/// end of `haystack`.
+/// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack`
+/// must be contained within `context` such that `context` is at least as big
+/// as `haystack`.
+///
+/// This split is crucial for dealing with look-around. For example, consider
+/// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This
+/// regex should _not_ match the haystack since `bar` does not appear at the
+/// beginning of the input. Similarly, the regex `\Bbar\B` should match the
+/// haystack because `bar` is not surrounded by word boundaries. But a search
+/// that does not take context into account would not permit `\B` to match
+/// since the beginning of a string, when followed by a word character, is
+/// itself a word boundary. Similarly, a search that does not take context
+/// into account when searching `^bar$` in the haystack `bar` would produce a
+/// match when it shouldn't.
+///
+/// Thus, it follows that the starting state is chosen based on the following
+/// criteria, derived from the position at which the search starts in the
+/// `context` (corresponding to the start of `haystack`):
+///
+/// 1. If the search starts at the beginning of `context`, then the `Text`
+/// start state is used. (Since `^` corresponds to
+/// `hir::Anchor::StartText`.)
+/// 2. If the search starts at a position immediately following a line
+/// terminator, then the `Line` start state is used. (Since `(?m:^)`
+/// corresponds to `hir::Anchor::StartLine`.)
+/// 3. If the search starts at a position immediately following a byte
+/// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte`
+/// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.)
+/// 4. Otherwise, if the search starts at a position immediately following
+/// a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`),
+/// then the `NonWordByte` start state is used.
(Since `(?-u:\B)` +/// corresponds to a not-word-boundary.) +/// +/// (N.B. Unicode word boundaries are not supported by the DFA because they +/// require multi-byte look-around and this is difficult to support in a DFA.) +/// +/// To further complicate things, we also support constructing individual +/// anchored start states for each pattern in the DFA. (Which is required to +/// implement overlapping regexes correctly, but is also generally useful.) +/// Thus, when individual start states for each pattern are enabled, then the +/// total number of start states represented is `4 + (4 * #patterns)`, where +/// the 4 comes from each of the 4 possibilities above. The first 4 represents +/// the starting states for the entire DFA, which support searching for +/// multiple patterns simultaneously (possibly unanchored). +/// +/// If individual start states are disabled, then this will only store 4 +/// start states. Typically, individual start states are only enabled when +/// constructing the reverse DFA for regex matching. But they are also useful +/// for building DFAs that can search for a specific pattern or even to support +/// both anchored and unanchored searches with the same DFA. +/// +/// Note though that while the start table always has either `4` or +/// `4 + (4 * #patterns)` starting state *ids*, the total number of states +/// might be considerably smaller. That is, many of the IDs may be duplicative. +/// (For example, if a regex doesn't have a `\b` sub-pattern, then there's no +/// reason to generate a unique starting state for handling word boundaries. +/// Similarly for start/end anchors.) +#[derive(Clone)] +pub(crate) struct StartTable<T> { + /// The initial start state IDs. + /// + /// In practice, T is either `Vec<u32>` or `&[u32]`. + /// + /// The first `stride` (currently always 4) entries always correspond to + /// the start states for the entire DFA. After that, there are + /// `stride * patterns` state IDs, where `patterns` may be zero in the + /// case of a DFA with no patterns or in the case where the DFA was built + /// without enabling starting states for each pattern. + table: T, + /// The number of starting state IDs per pattern. + stride: usize, + /// The total number of patterns for which starting states are encoded. + /// This may be zero for non-empty DFAs when the DFA was built without + /// start states for each pattern. Thus, one cannot use this field to + /// say how many patterns are in the DFA in all cases. It is specific to + /// how many patterns are represented in this start table. + patterns: usize, +} + +#[cfg(feature = "alloc")] +impl StartTable<Vec<u32>> { + /// Create a valid set of start states all pointing to the dead state. + /// + /// When the corresponding DFA is constructed with start states for each + /// pattern, then `patterns` should be the number of patterns. Otherwise, + /// it should be zero. + /// + /// If the total table size could exceed the allocatable limit, then this + /// returns an error. In practice, this is unlikely to be able to occur, + /// since it's likely that allocation would have failed long before it got + /// to this point. 
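+ ///
+ /// As an illustrative sketch (hypothetical numbers): with
+ /// `Start::count() == 4` and `patterns == 2`, this builds a table of 12
+ /// IDs, all DEAD: 4 for the DFA as a whole followed by 4 for each of the
+ /// two patterns.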
+ fn dead(patterns: usize) -> Result<StartTable<Vec<u32>>, Error> {
+ assert!(patterns <= PatternID::LIMIT);
+ let stride = Start::count();
+ let pattern_starts_len = match stride.checked_mul(patterns) {
+ Some(x) => x,
+ None => return Err(Error::too_many_start_states()),
+ };
+ let table_len = match stride.checked_add(pattern_starts_len) {
+ Some(x) => x,
+ None => return Err(Error::too_many_start_states()),
+ };
+ if table_len > core::isize::MAX as usize {
+ return Err(Error::too_many_start_states());
+ }
+ let table = vec![DEAD.as_u32(); table_len];
+ Ok(StartTable { table, stride, patterns })
+ }
+}
+
+impl<'a> StartTable<&'a [u32]> {
+ /// Deserialize a table of start state IDs starting at the beginning of
+ /// `slice`. Upon success, return the total number of bytes read along with
+ /// the table of starting state IDs.
+ ///
+ /// If there was a problem deserializing any part of the starting IDs,
+ /// then this returns an error. Notably, if the given slice does not have
+ /// the same alignment as `StateID`, then this will return an error (among
+ /// other possible errors).
+ ///
+ /// This is guaranteed to execute in constant time.
+ ///
+ /// # Safety
+ ///
+ /// This routine is not safe because it does not check the validity of the
+ /// starting state IDs themselves. In particular, the number of starting
+ /// IDs can be of variable length, so it's possible that checking their
+ /// validity cannot be done in constant time. An invalid starting state
+ /// ID is not safe because other code may rely on the starting IDs being
+ /// correct (such as explicit bounds check elision). Therefore, an invalid
+ /// start ID can lead to undefined behavior.
+ ///
+ /// Callers that use this function must either pass on the safety invariant
+ /// or guarantee that the bytes given contain valid starting state IDs.
+ /// This guarantee is upheld by the bytes written by `write_to`.
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ let (stride, nr) =
+ bytes::try_read_u32_as_usize(slice, "start table stride")?;
+ slice = &slice[nr..];
+
+ let (patterns, nr) =
+ bytes::try_read_u32_as_usize(slice, "start table patterns")?;
+ slice = &slice[nr..];
+
+ if stride != Start::count() {
+ return Err(DeserializeError::generic(
+ "invalid starting table stride",
+ ));
+ }
+ if patterns > PatternID::LIMIT {
+ return Err(DeserializeError::generic(
+ "invalid number of patterns",
+ ));
+ }
+ let pattern_table_size =
+ bytes::mul(stride, patterns, "invalid pattern count")?;
+ // Our start states always start with a single stride of start states
+ // for the entire automaton which permit it to match any pattern. What
+ // follows it are an optional set of start states for each pattern.
+ let start_state_count = bytes::add(
+ stride,
+ pattern_table_size,
+ "invalid 'any' pattern starts size",
+ )?;
+ let table_bytes_len = bytes::mul(
+ start_state_count,
+ StateID::SIZE,
+ "pattern table bytes length",
+ )?;
+ bytes::check_slice_len(slice, table_bytes_len, "start ID table")?;
+ bytes::check_alignment::<StateID>(slice)?;
+ let table_bytes = &slice[..table_bytes_len];
+ slice = &slice[table_bytes_len..];
+ // SAFETY: Since StateID is always representable as a u32, all we need
+ // to do is ensure that we have the proper length and alignment. We've
+ // checked both above, so the cast below is safe.
+ //
+ // N.B.
This is the only not-safe code in this function, so we mark + // it explicitly to call it out, even though it is technically + // superfluous. + #[allow(unused_unsafe)] + let table = unsafe { + core::slice::from_raw_parts( + table_bytes.as_ptr() as *const u32, + start_state_count, + ) + }; + let st = StartTable { table, stride, patterns }; + Ok((st, slice.as_ptr() as usize - slice_start)) + } +} + +impl<T: AsRef<[u32]>> StartTable<T> { + /// Writes a serialized form of this start table to the buffer given. If + /// the buffer is too small, then an error is returned. To determine how + /// big the buffer must be, use `write_to_len`. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "starting table ids", + )); + } + dst = &mut dst[..nwrite]; + + // write stride + // Unwrap is OK since the stride is always 4 (currently). + E::write_u32(u32::try_from(self.stride).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + // write pattern count + // Unwrap is OK since number of patterns is guaranteed to fit in a u32. + E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + // write start IDs + for &sid in self.table() { + let n = bytes::write_state_id::<E>(sid, &mut dst); + dst = &mut dst[n..]; + } + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this start ID table + /// will use. + fn write_to_len(&self) -> usize { + size_of::<u32>() // stride + + size_of::<u32>() // # patterns + + (self.table().len() * StateID::SIZE) + } + + /// Validates that every state ID in this start table is valid by checking + /// it against the given transition table (which must be for the same DFA). + /// + /// That is, every state ID can be used to correctly index a state. + fn validate( + &self, + tt: &TransitionTable<T>, + ) -> Result<(), DeserializeError> { + for &id in self.table() { + if !tt.is_valid(id) { + return Err(DeserializeError::generic( + "found invalid starting state ID", + )); + } + } + Ok(()) + } + + /// Converts this start list to a borrowed value. + fn as_ref(&self) -> StartTable<&'_ [u32]> { + StartTable { + table: self.table.as_ref(), + stride: self.stride, + patterns: self.patterns, + } + } + + /// Converts this start list to an owned value. + #[cfg(feature = "alloc")] + fn to_owned(&self) -> StartTable<Vec<u32>> { + StartTable { + table: self.table.as_ref().to_vec(), + stride: self.stride, + patterns: self.patterns, + } + } + + /// Return the start state for the given start index and pattern ID. If the + /// pattern ID is None, then the corresponding start state for the entire + /// DFA is returned. If the pattern ID is not None, then the corresponding + /// starting state for the given pattern is returned. If this start table + /// does not have individual starting states for each pattern, then this + /// panics. + fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID { + let start_index = index.as_usize(); + let index = match pattern_id { + None => start_index, + Some(pid) => { + let pid = pid.as_usize(); + assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); + self.stride + (self.stride * pid) + start_index + } + }; + self.table()[index] + } + + /// Returns an iterator over all start state IDs in this table. + /// + /// Each item is a triple of: start state ID, the start state type and the + /// pattern ID (if any). 
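+ ///
+ /// For example (hypothetical sizes): with a stride of 4 and 2 patterns,
+ /// indices 0..4 yield the DFA-wide start states (pattern ID `None`),
+ /// indices 4..8 yield the start states for pattern 0 and indices 8..12
+ /// yield those for pattern 1.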
+ fn iter(&self) -> StartStateIter<'_> {
+ StartStateIter { st: self.as_ref(), i: 0 }
+ }
+
+ /// Returns the table as a slice of state IDs.
+ fn table(&self) -> &[StateID] {
+ let integers = self.table.as_ref();
+ // SAFETY: This is safe because StateID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts(
+ integers.as_ptr() as *const StateID,
+ integers.len(),
+ )
+ }
+ }
+
+ /// Return the memory usage, in bytes, of this start list.
+ ///
+ /// This does not include the size of a `StartTable` value itself.
+ fn memory_usage(&self) -> usize {
+ self.table().len() * StateID::SIZE
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u32]>> StartTable<T> {
+ /// Set the start state for the given index and pattern.
+ ///
+ /// If the pattern ID or state ID are not valid, then this will panic.
+ fn set_start(
+ &mut self,
+ index: Start,
+ pattern_id: Option<PatternID>,
+ id: StateID,
+ ) {
+ let start_index = index.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => self
+ .stride
+ .checked_mul(pid.as_usize())
+ .unwrap()
+ .checked_add(self.stride)
+ .unwrap()
+ .checked_add(start_index)
+ .unwrap(),
+ };
+ self.table_mut()[index] = id;
+ }
+
+ /// Returns the table as a mutable slice of state IDs.
+ fn table_mut(&mut self) -> &mut [StateID] {
+ let integers = self.table.as_mut();
+ // SAFETY: This is safe because StateID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts_mut(
+ integers.as_mut_ptr() as *mut StateID,
+ integers.len(),
+ )
+ }
+ }
+}
+
+/// An iterator over start state IDs.
+///
+/// This iterator yields a triple of start state ID, the start state type
+/// and the pattern ID (if any). The pattern ID is None for start states
+/// corresponding to the entire DFA and non-None for start states corresponding
+/// to a specific pattern. The latter only occurs when the DFA is compiled with
+/// start states for each pattern.
+pub(crate) struct StartStateIter<'a> {
+ st: StartTable<&'a [u32]>,
+ i: usize,
+}
+
+impl<'a> Iterator for StartStateIter<'a> {
+ type Item = (StateID, Start, Option<PatternID>);
+
+ fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
+ let i = self.i;
+ let table = self.st.table();
+ if i >= table.len() {
+ return None;
+ }
+ self.i += 1;
+
+ // This unwrap is okay since the stride of the starting state table
+ // must always match the number of start state types.
+ let start_type = Start::from_usize(i % self.st.stride).unwrap();
+ let pid = if i < self.st.stride {
+ None
+ } else {
+ Some(
+ PatternID::new((i - self.st.stride) / self.st.stride).unwrap(),
+ )
+ };
+ Some((table[i], start_type, pid))
+ }
+}
+
+/// This type represents the patterns that should be reported whenever a DFA
+/// enters a match state. This structure exists to support DFAs that search for
+/// matches for multiple regexes.
+///
+/// This structure relies on the fact that all match states in a DFA occur
+/// contiguously in the DFA's transition table. (See dfa/special.rs for a more
+/// detailed breakdown of the representation.) Namely, when a match occurs, we
+/// know its state ID. Since we know the start and end of the contiguous region
+/// of match states, we can use that to compute the position at which the match
+/// state occurs. That in turn is used as an offset into this structure.
+#[derive(Clone, Debug)]
+struct MatchStates<T> {
+ /// Slices is a flattened sequence of pairs, where each pair points to a
+ /// sub-slice of pattern_ids.
The first element of the pair is an offset + /// into pattern_ids and the second element of the pair is the number + /// of 32-bit pattern IDs starting at that position. That is, each pair + /// corresponds to a single DFA match state and its corresponding match + /// IDs. The number of pairs always corresponds to the number of distinct + /// DFA match states. + /// + /// In practice, T is either Vec<u32> or &[u32]. + slices: T, + /// A flattened sequence of pattern IDs for each DFA match state. The only + /// way to correctly read this sequence is indirectly via `slices`. + /// + /// In practice, T is either Vec<u32> or &[u32]. + pattern_ids: T, + /// The total number of unique patterns represented by these match states. + patterns: usize, +} + +impl<'a> MatchStates<&'a [u32]> { + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + // Read the total number of match states. + let (count, nr) = + bytes::try_read_u32_as_usize(slice, "match state count")?; + slice = &slice[nr..]; + + // Read the slice start/length pairs. + let pair_count = bytes::mul(2, count, "match state offset pairs")?; + let slices_bytes_len = bytes::mul( + pair_count, + PatternID::SIZE, + "match state slice offset byte length", + )?; + bytes::check_slice_len(slice, slices_bytes_len, "match state slices")?; + bytes::check_alignment::<PatternID>(slice)?; + let slices_bytes = &slice[..slices_bytes_len]; + slice = &slice[slices_bytes_len..]; + // SAFETY: Since PatternID is always representable as a u32, all we + // need to do is ensure that we have the proper length and alignment. + // We've checked both above, so the cast below is safe. + // + // N.B. This is one of the few not-safe snippets in this function, so + // we mark it explicitly to call it out, even though it is technically + // superfluous. + #[allow(unused_unsafe)] + let slices = unsafe { + core::slice::from_raw_parts( + slices_bytes.as_ptr() as *const u32, + pair_count, + ) + }; + + // Read the total number of unique pattern IDs (which is always 1 more + // than the maximum pattern ID in this automaton, since pattern IDs are + // handed out contiguously starting at 0). + let (patterns, nr) = + bytes::try_read_u32_as_usize(slice, "pattern count")?; + slice = &slice[nr..]; + + // Now read the pattern ID count. We don't need to store this + // explicitly, but we need it to know how many pattern IDs to read. + let (idcount, nr) = + bytes::try_read_u32_as_usize(slice, "pattern ID count")?; + slice = &slice[nr..]; + + // Read the actual pattern IDs. + let pattern_ids_len = + bytes::mul(idcount, PatternID::SIZE, "pattern ID byte length")?; + bytes::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?; + bytes::check_alignment::<PatternID>(slice)?; + let pattern_ids_bytes = &slice[..pattern_ids_len]; + slice = &slice[pattern_ids_len..]; + // SAFETY: Since PatternID is always representable as a u32, all we + // need to do is ensure that we have the proper length and alignment. + // We've checked both above, so the cast below is safe. + // + // N.B. This is one of the few not-safe snippets in this function, so + // we mark it explicitly to call it out, even though it is technically + // superfluous. 
+ #[allow(unused_unsafe)]
+ let pattern_ids = unsafe {
+ core::slice::from_raw_parts(
+ pattern_ids_bytes.as_ptr() as *const u32,
+ idcount,
+ )
+ };
+
+ let ms = MatchStates { slices, pattern_ids, patterns };
+ Ok((ms, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl MatchStates<Vec<u32>> {
+ fn empty(pattern_count: usize) -> MatchStates<Vec<u32>> {
+ assert!(pattern_count <= PatternID::LIMIT);
+ MatchStates {
+ slices: vec![],
+ pattern_ids: vec![],
+ patterns: pattern_count,
+ }
+ }
+
+ fn new(
+ matches: &BTreeMap<StateID, Vec<PatternID>>,
+ pattern_count: usize,
+ ) -> Result<MatchStates<Vec<u32>>, Error> {
+ let mut m = MatchStates::empty(pattern_count);
+ for (_, pids) in matches.iter() {
+ let start = PatternID::new(m.pattern_ids.len())
+ .map_err(|_| Error::too_many_match_pattern_ids())?;
+ m.slices.push(start.as_u32());
+ // This is always correct since the number of patterns in a single
+ // match state can never exceed the maximum number of allowable
+ // patterns. Why? Because a pattern can only appear once in a
+ // particular match state, by construction. (And since our pattern
+ // ID limit is one less than u32::MAX, we're guaranteed that the
+ // length fits in a u32.)
+ m.slices.push(u32::try_from(pids.len()).unwrap());
+ for &pid in pids {
+ m.pattern_ids.push(pid.as_u32());
+ }
+ }
+ m.patterns = pattern_count;
+ Ok(m)
+ }
+
+ fn new_with_map(
+ &self,
+ matches: &BTreeMap<StateID, Vec<PatternID>>,
+ ) -> Result<MatchStates<Vec<u32>>, Error> {
+ MatchStates::new(matches, self.patterns)
+ }
+}
+
+impl<T: AsRef<[u32]>> MatchStates<T> {
+ /// Writes a serialized form of these match states to the buffer given. If
+ /// the buffer is too small, then an error is returned. To determine how
+ /// big the buffer must be, use `write_to_len`.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("match states"));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write state ID count
+ // Unwrap is OK since number of states is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.count()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write slice offset pairs
+ for &pid in self.slices() {
+ let n = bytes::write_pattern_id::<E>(pid, &mut dst);
+ dst = &mut dst[n..];
+ }
+
+ // write unique pattern ID count
+ // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write pattern ID count
+ // Unwrap is OK since we check at construction (and deserialization)
+ // that the number of patterns is representable as a u32.
+ E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write pattern IDs
+ for &pid in self.pattern_ids() {
+ let n = bytes::write_pattern_id::<E>(pid, &mut dst);
+ dst = &mut dst[n..];
+ }
+
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of these match states
+ /// will use.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // match state count
+ + (self.slices().len() * PatternID::SIZE)
+ + size_of::<u32>() // unique pattern ID count
+ + size_of::<u32>() // pattern ID count
+ + (self.pattern_ids().len() * PatternID::SIZE)
+ }
+
+ /// Validates that the match state info is itself internally consistent and
+ /// consistent with the recorded match state region in the given DFA.
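+ ///
+ /// For example (hypothetical encoding): `slices = [0, 2, 2, 1]` with
+ /// `pattern_ids = [0, 2, 1]` describes two match states, where the first
+ /// matches patterns 0 and 2 and the second matches only pattern 1.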
+ fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
+ if self.count() != dfa.special.match_len(dfa.stride()) {
+ return Err(DeserializeError::generic(
+ "match state count mismatch",
+ ));
+ }
+ for si in 0..self.count() {
+ let start = self.slices()[si * 2].as_usize();
+ let len = self.slices()[si * 2 + 1].as_usize();
+ if start >= self.pattern_ids().len() {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID start offset",
+ ));
+ }
+ if start + len > self.pattern_ids().len() {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID length",
+ ));
+ }
+ for mi in 0..len {
+ let pid = self.pattern_id(si, mi);
+ if pid.as_usize() >= self.patterns {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID",
+ ));
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Converts these match states back into their map form. This is useful
+ /// when shuffling states, as the normal MatchStates representation is not
+ /// amenable to easy state swapping. But with this map, to swap id1 and
+ /// id2, all you need to do is:
+ ///
+ /// if let Some(pids) = map.remove(&id1) {
+ /// map.insert(id2, pids);
+ /// }
+ ///
+ /// Once shuffling is done, use MatchStates::new to convert back.
+ #[cfg(feature = "alloc")]
+ fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> {
+ let mut map = BTreeMap::new();
+ for i in 0..self.count() {
+ let mut pids = vec![];
+ for j in 0..self.pattern_len(i) {
+ pids.push(self.pattern_id(i, j));
+ }
+ map.insert(self.match_state_id(dfa, i), pids);
+ }
+ map
+ }
+
+ /// Converts these match states to a borrowed value.
+ fn as_ref(&self) -> MatchStates<&'_ [u32]> {
+ MatchStates {
+ slices: self.slices.as_ref(),
+ pattern_ids: self.pattern_ids.as_ref(),
+ patterns: self.patterns,
+ }
+ }
+
+ /// Converts these match states to an owned value.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> MatchStates<Vec<u32>> {
+ MatchStates {
+ slices: self.slices.as_ref().to_vec(),
+ pattern_ids: self.pattern_ids.as_ref().to_vec(),
+ patterns: self.patterns,
+ }
+ }
+
+ /// Returns the match state ID given the match state index. (Where the
+ /// first match state corresponds to index 0.)
+ ///
+ /// This panics if there is no match state at the given index.
+ fn match_state_id(&self, dfa: &DFA<T>, index: usize) -> StateID {
+ assert!(dfa.special.matches(), "no match states to index");
+ // This is one of the places where we rely on the fact that match
+ // states are contiguous in the transition table. Namely, that the
+ // first match state ID always corresponds to dfa.special.min_match.
+ // From there, since we know the stride, we can compute the ID of any
+ // match state given its index.
+ let stride2 = u32::try_from(dfa.stride2()).unwrap();
+ let offset = index.checked_shl(stride2).unwrap();
+ let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap();
+ let sid = StateID::new(id).unwrap();
+ assert!(dfa.is_match_state(sid));
+ sid
+ }
+
+ /// Returns the pattern ID at the given match index for the given match
+ /// state.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA.
+ ///
+ /// The match index is the index of the pattern ID for the given state.
+ /// The index must be less than `self.pattern_len(state_index)`.
+ fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID {
+ self.pattern_id_slice(state_index)[match_index]
+ }
+
+ /// Returns the number of patterns in the given match state.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA.
+ fn pattern_len(&self, state_index: usize) -> usize {
+ self.slices()[state_index * 2 + 1].as_usize()
+ }
+
+ /// Returns all of the pattern IDs for the given match state index.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA.
+ fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] {
+ let start = self.slices()[state_index * 2].as_usize();
+ let len = self.pattern_len(state_index);
+ &self.pattern_ids()[start..start + len]
+ }
+
+ /// Returns the pattern ID offset slice of u32 as a slice of PatternID.
+ fn slices(&self) -> &[PatternID] {
+ let integers = self.slices.as_ref();
+ // SAFETY: This is safe because PatternID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts(
+ integers.as_ptr() as *const PatternID,
+ integers.len(),
+ )
+ }
+ }
+
+ /// Returns the total number of match states.
+ fn count(&self) -> usize {
+ assert_eq!(0, self.slices().len() % 2);
+ self.slices().len() / 2
+ }
+
+ /// Returns the pattern ID slice of u32 as a slice of PatternID.
+ fn pattern_ids(&self) -> &[PatternID] {
+ let integers = self.pattern_ids.as_ref();
+ // SAFETY: This is safe because PatternID is guaranteed to be
+ // representable as a u32.
+ unsafe {
+ core::slice::from_raw_parts(
+ integers.as_ptr() as *const PatternID,
+ integers.len(),
+ )
+ }
+ }
+
+ /// Return the memory usage, in bytes, of these match pairs.
+ fn memory_usage(&self) -> usize {
+ (self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE
+ }
+}
+
+/// An iterator over all states in a DFA.
+///
+/// This iterator yields a tuple for each state. The first element of the
+/// tuple corresponds to a state's identifier, and the second element
+/// corresponds to the state itself (comprised of its transitions).
+///
+/// `'a` corresponds to the lifetime of the original DFA and `T` corresponds
+/// to the type of the transition table itself.
+pub(crate) struct StateIter<'a, T> {
+ tt: &'a TransitionTable<T>,
+ it: iter::Enumerate<slice::Chunks<'a, StateID>>,
+}
+
+impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> {
+ type Item = State<'a>;
+
+ fn next(&mut self) -> Option<State<'a>> {
+ self.it.next().map(|(index, _)| {
+ let id = self.tt.from_index(index);
+ self.tt.state(id)
+ })
+ }
+}
+
+/// An immutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table.
+pub(crate) struct State<'a> {
+ id: StateID,
+ stride2: usize,
+ transitions: &'a [StateID],
+}
+
+impl<'a> State<'a> {
+ /// Return an iterator over all transitions in this state. This yields
+ /// a number of transitions equivalent to the alphabet length of the
+ /// corresponding DFA.
+ ///
+ /// Each transition is represented by a tuple. The first element is
+ /// the input byte for that transition and the second element is the
+ /// transition itself.
+ pub(crate) fn transitions(&self) -> StateTransitionIter<'_> {
+ StateTransitionIter {
+ len: self.transitions.len(),
+ it: self.transitions.iter().enumerate(),
+ }
+ }
+
+ /// Return an iterator over a sparse representation of the transitions in
+ /// this state. Only non-dead transitions are returned.
+ ///
+ /// The "sparse" representation in this case corresponds to a sequence of
+ /// triples.
The first two elements of the triple comprise an inclusive
+ /// byte range while the last element corresponds to the transition taken
+ /// for all bytes in the range.
+ ///
+ /// This is somewhat more condensed than the classical sparse
+ /// representation (where you have an element for every non-dead
+ /// transition), but in practice, checking if a byte is in a range is very
+ /// cheap and using ranges tends to conserve quite a bit more space.
+ pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> {
+ StateSparseTransitionIter { dense: self.transitions(), cur: None }
+ }
+
+ /// Returns the identifier for this state.
+ pub(crate) fn id(&self) -> StateID {
+ self.id
+ }
+
+ /// Analyzes this state to determine whether it can be accelerated. If so,
+ /// it returns an accelerator that contains at least one byte.
+ #[cfg(feature = "alloc")]
+ fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> {
+ // We just try to add bytes to our accelerator. Once adding fails
+ // (because we've added too many bytes), then give up.
+ let mut accel = Accel::new();
+ for (class, id) in self.transitions() {
+ if id == self.id() {
+ continue;
+ }
+ for unit in classes.elements(class) {
+ if let Some(byte) = unit.as_u8() {
+ if !accel.add(byte) {
+ return None;
+ }
+ }
+ }
+ }
+ if accel.is_empty() {
+ None
+ } else {
+ Some(accel)
+ }
+ }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ for (i, (start, end, id)) in self.sparse_transitions().enumerate() {
+ let index = if f.alternate() {
+ id.as_usize()
+ } else {
+ id.as_usize() >> self.stride2
+ };
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end {
+ write!(f, "{:?} => {:?}", start, index)?;
+ } else {
+ write!(f, "{:?}-{:?} => {:?}", start, end, index)?;
+ }
+ }
+ Ok(())
+ }
+}
+
+/// A mutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table.
+#[cfg(feature = "alloc")]
+pub(crate) struct StateMut<'a> {
+ id: StateID,
+ stride2: usize,
+ transitions: &'a mut [StateID],
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> StateMut<'a> {
+ /// Return an iterator over all transitions in this state. This yields
+ /// a number of transitions equivalent to the alphabet length of the
+ /// corresponding DFA.
+ ///
+ /// Each transition is represented by a tuple. The first element is the
+ /// input byte for that transition and the second element is a mutable
+ /// reference to the transition itself.
+ pub(crate) fn iter_mut(&mut self) -> StateTransitionIterMut<'_> {
+ StateTransitionIterMut {
+ len: self.transitions.len(),
+ it: self.transitions.iter_mut().enumerate(),
+ }
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> fmt::Debug for StateMut<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ fmt::Debug::fmt(
+ &State {
+ id: self.id,
+ stride2: self.stride2,
+ transitions: self.transitions,
+ },
+ f,
+ )
+ }
+}
+
+/// An iterator over all transitions in a single DFA state. This yields
+/// a number of transitions equivalent to the alphabet length of the
+/// corresponding DFA.
+///
+/// Each transition is represented by a tuple. The first element is the input
+/// byte for that transition and the second element is the transition itself.
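+///
+/// The final tuple yielded always carries the special EOI unit; every tuple
+/// before it carries a concrete byte value. (Illustrative: for an alphabet
+/// length of 5, this yields four byte units followed by one EOI unit.)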
+#[derive(Debug)] +pub(crate) struct StateTransitionIter<'a> { + len: usize, + it: iter::Enumerate<slice::Iter<'a, StateID>>, +} + +impl<'a> Iterator for StateTransitionIter<'a> { + type Item = (alphabet::Unit, StateID); + + fn next(&mut self) -> Option<(alphabet::Unit, StateID)> { + self.it.next().map(|(i, &id)| { + let unit = if i + 1 == self.len { + alphabet::Unit::eoi(i) + } else { + let b = u8::try_from(i) + .expect("raw byte alphabet is never exceeded"); + alphabet::Unit::u8(b) + }; + (unit, id) + }) + } +} + +/// A mutable iterator over all transitions in a DFA state. +/// +/// Each transition is represented by a tuple. The first element is the +/// input byte for that transition and the second element is a mutable +/// reference to the transition itself. +#[cfg(feature = "alloc")] +#[derive(Debug)] +pub(crate) struct StateTransitionIterMut<'a> { + len: usize, + it: iter::Enumerate<slice::IterMut<'a, StateID>>, +} + +#[cfg(feature = "alloc")] +impl<'a> Iterator for StateTransitionIterMut<'a> { + type Item = (alphabet::Unit, &'a mut StateID); + + fn next(&mut self) -> Option<(alphabet::Unit, &'a mut StateID)> { + self.it.next().map(|(i, id)| { + let unit = if i + 1 == self.len { + alphabet::Unit::eoi(i) + } else { + let b = u8::try_from(i) + .expect("raw byte alphabet is never exceeded"); + alphabet::Unit::u8(b) + }; + (unit, id) + }) + } +} + +/// An iterator over all non-DEAD transitions in a single DFA state using a +/// sparse representation. +/// +/// Each transition is represented by a triple. The first two elements of the +/// triple comprise an inclusive byte range while the last element corresponds +/// to the transition taken for all bytes in the range. +/// +/// As a convenience, this always returns `alphabet::Unit` values of the same +/// type. That is, you'll never get a (byte, EOI) or an (EOI, byte). Only (byte, +/// byte) and (EOI, EOI) values are yielded. +#[derive(Debug)] +pub(crate) struct StateSparseTransitionIter<'a> { + dense: StateTransitionIter<'a>, + cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>, +} + +impl<'a> Iterator for StateSparseTransitionIter<'a> { + type Item = (alphabet::Unit, alphabet::Unit, StateID); + + fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> { + while let Some((unit, next)) = self.dense.next() { + let (prev_start, prev_end, prev_next) = match self.cur { + Some(t) => t, + None => { + self.cur = Some((unit, unit, next)); + continue; + } + }; + if prev_next == next && !unit.is_eoi() { + self.cur = Some((prev_start, unit, prev_next)); + } else { + self.cur = Some((unit, unit, next)); + if prev_next != DEAD { + return Some((prev_start, prev_end, prev_next)); + } + } + } + if let Some((start, end, next)) = self.cur.take() { + if next != DEAD { + return Some((start, end, next)); + } + } + None + } +} + +/// An iterator over pattern IDs for a single match state. +#[derive(Debug)] +pub(crate) struct PatternIDIter<'a>(slice::Iter<'a, PatternID>); + +impl<'a> Iterator for PatternIDIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option<PatternID> { + self.0.next().copied() + } +} + +/// Remapper is an abstraction that manages the remapping of state IDs in a +/// dense DFA. This is useful when one wants to shuffle states into different +/// positions in the DFA. +/// +/// One of the key complexities this manages is the ability to correctly move +/// one state multiple times. +/// +/// Once shuffling is complete, `remap` should be called, which will rewrite +/// all pertinent transitions to updated state IDs.
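+///
+/// A sketch of the intended calling convention, with hypothetical state IDs:
+///
+/// ```ignore
+/// let mut remapper = Remapper::from_dfa(&dfa);
+/// // Each swap is mirrored in the remapper's internal map, so a state can
+/// // be moved more than once.
+/// remapper.swap(&mut dfa, id_a, id_c);
+/// remapper.swap(&mut dfa, id_c, id_g);
+/// // Rewrite all transitions (and start states) to the post-shuffle IDs.
+/// remapper.remap(&mut dfa);
+/// ```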
+#[cfg(feature = "alloc")] +#[derive(Debug)] +struct Remapper { + /// A map from the index of a state to its pre-multiplied identifier. + /// + /// When a state is swapped with another, then their corresponding + /// locations in this map are also swapped. Thus, its new position will + /// still point to its old pre-multiplied StateID. + /// + /// While there is a bit more to it, this then allows us to rewrite the + /// state IDs in a DFA's transition table in a single pass. This is done + /// by iterating over every ID in this map, then iterating over each + /// transition for the state at that ID and re-mapping the transition from + /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position + /// in this map where `old_id` *started*, and set it to where it ended up + /// after all swaps have been completed. + map: Vec<StateID>, +} + +#[cfg(feature = "alloc")] +impl Remapper { + fn from_dfa(dfa: &OwnedDFA) -> Remapper { + Remapper { + map: (0..dfa.state_count()).map(|i| dfa.from_index(i)).collect(), + } + } + + fn swap(&mut self, dfa: &mut OwnedDFA, id1: StateID, id2: StateID) { + dfa.swap_states(id1, id2); + self.map.swap(dfa.to_index(id1), dfa.to_index(id2)); + } + + fn remap(mut self, dfa: &mut OwnedDFA) { + // Update the map to account for states that have been swapped + // multiple times. For example, if (A, C) and (C, G) are swapped, then + // transitions previously pointing to A should now point to G. But if + // we don't update our map, they will erroneously be set to C. All we + // do is follow the swaps in our map until we see our original state + // ID. + let oldmap = self.map.clone(); + for i in 0..dfa.state_count() { + let cur_id = dfa.from_index(i); + let mut new = oldmap[i]; + if cur_id == new { + continue; + } + loop { + let id = oldmap[dfa.to_index(new)]; + if cur_id == id { + self.map[i] = new; + break; + } + new = id; + } + } + + // To work around the borrow checker for converting state IDs to + // indices. We cannot borrow self while mutably iterating over a + // state's transitions. Otherwise, we'd just use dfa.to_index(..). + let stride2 = dfa.stride2(); + let to_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; + + // Now that we've finished shuffling, we need to remap all of our + // transitions. We don't need to handle re-mapping accelerated states + // since `accels` is only populated after shuffling. 
+ for &id in self.map.iter() { + for (_, next_id) in dfa.state_mut(id).iter_mut() { + *next_id = self.map[to_index(*next_id)]; + } + } + for start_id in dfa.st.table_mut().iter_mut() { + *start_id = self.map[to_index(*start_id)]; + } + } +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::*; + + #[test] + fn errors_with_unicode_word_boundary() { + let pattern = r"\b"; + assert!(Builder::new().build(pattern).is_err()); + } + + #[test] + fn roundtrip_never_match() { + let dfa = DFA::never_match().unwrap(); + let (buf, _) = dfa.to_bytes_native_endian(); + let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; + + assert_eq!(None, dfa.find_leftmost_fwd(b"foo12345").unwrap()); + } + + #[test] + fn roundtrip_always_match() { + use crate::HalfMatch; + + let dfa = DFA::always_match().unwrap(); + let (buf, _) = dfa.to_bytes_native_endian(); + let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; + + assert_eq!( + Some(HalfMatch::must(0, 0)), + dfa.find_leftmost_fwd(b"foo12345").unwrap() + ); + } +} diff --git a/vendor/regex-automata-0.2.0/src/dfa/determinize.rs b/vendor/regex-automata-0.2.0/src/dfa/determinize.rs new file mode 100644 index 000000000..61603481b --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/determinize.rs @@ -0,0 +1,547 @@ +use alloc::{ + collections::BTreeMap, + vec::{self, Vec}, +}; + +use crate::{ + dfa::{dense, Error, DEAD}, + nfa::thompson, + util::{ + self, + alphabet::{self, ByteSet}, + determinize::{State, StateBuilderEmpty, StateBuilderNFA}, + id::{PatternID, StateID}, + matchtypes::MatchKind, + sparse_set::{SparseSet, SparseSets}, + start::Start, + }, +}; + +/// A builder for configuring and running a DFA determinizer. +#[derive(Clone, Debug)] +pub(crate) struct Config { + anchored: bool, + match_kind: MatchKind, + quit: ByteSet, + dfa_size_limit: Option<usize>, + determinize_size_limit: Option<usize>, +} + +impl Config { + /// Create a new default config for a determinizer. The determinizer may be + /// configured before calling `run`. + pub fn new() -> Config { + Config { + anchored: false, + match_kind: MatchKind::LeftmostFirst, + quit: ByteSet::empty(), + dfa_size_limit: None, + determinize_size_limit: None, + } + } + + /// Run determinization on the given NFA and write the resulting DFA into + /// the one given. The DFA given should be initialized but otherwise empty. + /// "Initialized" means that it is set up to handle the NFA's byte classes, + /// number of patterns and whether to build start states for each pattern. + pub fn run( + &self, + nfa: &thompson::NFA, + dfa: &mut dense::OwnedDFA, + ) -> Result<(), Error> { + let dead = State::dead(); + let quit = State::dead(); + let mut cache = StateMap::default(); + // We only insert the dead state here since its representation is + // identical to the quit state. And we never want anything pointing + // to the quit state other than specific transitions derived from the + // determinizer's configured "quit" bytes. + // + // We do put the quit state into 'builder_states' below. This ensures + // that a proper DFA state ID is allocated for it, and that no other + // DFA state uses the "location after the DEAD state." That is, it + // is assumed that the quit state is always the state immediately + // following the DEAD state.
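+ //
+ // In other words, the intended layout of 'builder_states' is, as a
+ // sketch:
+ //
+ //   index 0  => the DEAD state
+ //   index 1  => the quit state
+ //   index 2+ => every other DFA state, in order of creation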
+ cache.insert(dead.clone(), DEAD); + + let runner = Runner { + config: self.clone(), + nfa, + dfa, + builder_states: alloc::vec![dead, quit], + cache, + memory_usage_state: 0, + sparses: SparseSets::new(nfa.len()), + stack: alloc::vec![], + scratch_state_builder: StateBuilderEmpty::new(), + }; + runner.run() + } + + /// Whether to build an anchored DFA or not. When disabled (the default), + /// the unanchored prefix from the NFA is used to start the DFA. Otherwise, + /// the anchored start state of the NFA is used to start the DFA. + pub fn anchored(&mut self, yes: bool) -> &mut Config { + self.anchored = yes; + self + } + + /// The match semantics to use for determinization. + /// + /// MatchKind::All corresponds to the standard textbook construction. + /// All possible match states are represented in the DFA. + /// MatchKind::LeftmostFirst permits greediness and otherwise tries to + /// simulate the match semantics of backtracking regex engines. Namely, + /// only a subset of match states are built, and dead states are used to + /// stop searches with an unanchored prefix. + /// + /// The default is MatchKind::LeftmostFirst. + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { + self.match_kind = kind; + self + } + + /// The set of bytes to use that will cause the DFA to enter a quit state, + /// stop searching and return an error. By default, this is empty. + pub fn quit(&mut self, set: ByteSet) -> &mut Config { + self.quit = set; + self + } + + /// The limit, in bytes of the heap, that the DFA is permitted to use. This + /// does not include the auxiliary heap storage used by determinization. + pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config { + self.dfa_size_limit = bytes; + self + } + + /// The limit, in bytes of the heap, that determinization itself is allowed + /// to use. This does not include the size of the DFA being built. + pub fn determinize_size_limit( + &mut self, + bytes: Option<usize>, + ) -> &mut Config { + self.determinize_size_limit = bytes; + self + } +} + +/// The actual implementation of determinization that converts an NFA to a DFA +/// through powerset construction. +/// +/// This determinizer roughly follows the typical powerset construction, where +/// each DFA state is comprised of one or more NFA states. In the worst case, +/// there is one DFA state for every possible combination of NFA states. In +/// practice, this only happens in certain conditions, typically when there are +/// bounded repetitions. +/// +/// The main differences between this implementation and typical determinization +/// are that this implementation delays matches by one state and hackily makes +/// look-around work. Comments below attempt to explain this. +/// +/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA, +/// whichever is shorter. +#[derive(Debug)] +struct Runner<'a> { + /// The configuration used to initialize determinization. + config: Config, + /// The NFA we're converting into a DFA. + nfa: &'a thompson::NFA, + /// The DFA we're building. + dfa: &'a mut dense::OwnedDFA, + /// Each DFA state being built is defined as an *ordered* set of NFA + /// states, along with some meta facts about the ordered set of NFA states. + /// + /// This is never empty. The first state is always a dummy state such that + /// a state id == 0 corresponds to a dead state. The second state is always + /// the quit state. + /// + /// Why do we have states in both a `Vec` and in a cache map below?
+ /// Well, they serve two different roles based on access patterns. + /// `builder_states` is the canonical home of each state, and provides + /// constant random access by a DFA state's ID. The cache map below, on + /// the other hand, provides a quick way of searching for identical DFA + /// states by using the DFA state as a key in the map. Of course, we use + /// reference counting to avoid actually duplicating the state's data + /// itself. (Although this has never been benchmarked.) Note that the cache + /// map does not give us full minimization; it just lets us avoid some very + /// obvious redundant states. + /// + /// Note that the index into this Vec isn't quite the DFA's state ID. + /// Rather, it's just an index. To get the state ID, you have to multiply + /// it by the DFA's stride. That's done by self.dfa.from_index. And the + /// inverse is self.dfa.to_index. + /// + /// Moreover, DFA states don't usually retain the IDs assigned to them + /// by their position in this Vec. After determinization completes, + /// states are shuffled around to support other optimizations. See the + /// sibling 'special' module for more details on that. (The reason for + /// mentioning this is that if you print out the DFA for debugging during + /// determinization, and then print out the final DFA after it is fully + /// built, then the state IDs likely won't match up.) + builder_states: Vec<State>, + /// A cache of DFA states that already exist and can be easily looked up + /// via ordered sets of NFA states. + /// + /// See `builder_states` docs for why we store states in two different + /// ways. + cache: StateMap, + /// The memory usage, in bytes, used by builder_states and cache. We track + /// this as new states are added since states use a variable amount of + /// heap. Tracking this as we add states makes it possible to compute the + /// total amount of memory used by the determinizer in constant time. + memory_usage_state: usize, + /// A pair of sparse sets for tracking ordered sets of NFA state IDs. + /// These are reused throughout determinization. A bounded sparse set + /// gives us constant time insertion, membership testing and clearing. + sparses: SparseSets, + /// Scratch space for a stack of NFA states to visit, for depth first + /// visiting without recursion. + stack: Vec<StateID>, + /// Scratch space for storing an ordered sequence of NFA states, for + /// amortizing allocation. This is principally useful for when we avoid + /// adding a new DFA state since it already exists. In order to detect this + /// case though, we still need an ordered set of NFA state IDs. So we use + /// this space to stage that ordered set before we know whether we need to + /// create a new DFA state or not. + scratch_state_builder: StateBuilderEmpty, +} + +/// A map from states to state identifiers. When using std, we use a standard +/// hashmap, since it's a bit faster for this use case. (Other maps, like +/// ones based on FNV, have not yet been benchmarked.) +/// +/// The main purpose of this map is to reuse states where possible. This won't +/// fully minimize the DFA, but it works well in a lot of cases. +#[cfg(feature = "std")] +type StateMap = std::collections::HashMap<State, StateID>; +#[cfg(not(feature = "std"))] +type StateMap = BTreeMap<State, StateID>; + +impl<'a> Runner<'a> { + /// Build the DFA. If there was a problem constructing the DFA (e.g., if + /// the chosen state identifier representation is too small), then an error + /// is returned.
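+ ///
+ /// In pseudocode, the heart of the routine below is classical powerset
+ /// construction driven by a worklist (a sketch only; the names mirror
+ /// this type's fields and methods):
+ ///
+ /// ```ignore
+ /// let mut uncompiled = all_start_states();
+ /// while let Some(dfa_id) = uncompiled.pop() {
+ ///     for unit in representative_bytes() {
+ ///         let (next_id, is_new) = self.cached_state(dfa_id, unit)?;
+ ///         self.dfa.set_transition(dfa_id, unit, next_id);
+ ///         if is_new {
+ ///             uncompiled.push(next_id);
+ ///         }
+ ///     }
+ /// }
+ /// ```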
+ fn run(mut self) -> Result<(), Error> { + if self.nfa.has_word_boundary_unicode() + && !self.config.quit.contains_range(0x80, 0xFF) + { + return Err(Error::unsupported_dfa_word_boundary_unicode()); + } + + // A sequence of "representative" bytes drawn from each equivalence + // class. These representative bytes are fed to the NFA to compute + // state transitions. This allows us to avoid re-computing state + // transitions for bytes that are guaranteed to produce identical + // results. + let representatives: Vec<alphabet::Unit> = + self.dfa.byte_classes().representatives().collect(); + // The set of all DFA state IDs that still need to have their + // transitions set. We start by seeding this with all starting states. + let mut uncompiled = alloc::vec![]; + self.add_all_starts(&mut uncompiled)?; + while let Some(dfa_id) = uncompiled.pop() { + for &unit in &representatives { + if unit.as_u8().map_or(false, |b| self.config.quit.contains(b)) + { + continue; + } + // In many cases, the state we transition to has already been + // computed. 'cached_state' will do the minimal amount of work + // to check this, and if it exists, immediately return an + // already existing state ID. + let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?; + self.dfa.set_transition(dfa_id, unit, next_dfa_id); + // If the state ID we got back is newly created, then we need + // to compile it, so add it to our uncompiled frontier. + if is_new { + uncompiled.push(next_dfa_id); + } + } + } + trace!( + "determinization complete, memory usage: {}, dense DFA size: {}", + self.memory_usage(), + self.dfa.memory_usage(), + ); + + // A map from DFA state ID to one or more NFA match IDs. Each NFA match + // ID corresponds to a distinct regex pattern that matches in the state + // corresponding to the key. + let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new(); + self.cache.clear(); + #[allow(unused_variables)] + let mut total_pat_count = 0; + for (i, state) in self.builder_states.into_iter().enumerate() { + if let Some(pat_ids) = state.match_pattern_ids() { + let id = self.dfa.from_index(i); + total_pat_count += pat_ids.len(); + matches.insert(id, pat_ids); + } + } + log! { + use core::mem::size_of; + let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>(); + let pats = total_pat_count * size_of::<PatternID>(); + let mem = (matches.len() * per_elem) + pats; + log::trace!("matches map built, memory usage: {}", mem); + } + // At this point, we shuffle the "special" states in the final DFA. + // This permits a DFA's match loop to detect a match condition (among + // other things) by merely inspecting the current state's identifier, + // and avoids the need for any additional auxiliary storage. + self.dfa.shuffle(matches)?; + Ok(()) + } + + /// Return the identifier for the next DFA state given an existing DFA + /// state and an input byte. If the next DFA state already exists, then + /// return its identifier from the cache. Otherwise, build the state, cache + /// it and return its identifier. + /// + /// This routine returns a boolean indicating whether a new state was + /// built. If a new state is built, then the caller needs to add it to its + /// frontier of uncompiled DFA states to compute transitions for. + fn cached_state( + &mut self, + dfa_id: StateID, + unit: alphabet::Unit, + ) -> Result<(StateID, bool), Error> { + // Compute the set of all reachable NFA states, including epsilons. 
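+ // That is, this is one step of the subset construction: take the set
+ // of NFA states behind 'dfa_id', follow every transition on 'unit' and
+ // then take the epsilon closure of the result.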
+ let empty_builder = self.get_state_builder(); + let builder = util::determinize::next( + self.nfa, + self.config.match_kind, + &mut self.sparses, + &mut self.stack, + &self.builder_states[self.dfa.to_index(dfa_id)], + unit, + empty_builder, + ); + self.maybe_add_state(builder) + } + + /// Compute the set of DFA start states and add their identifiers in + /// 'dfa_state_ids' (no duplicates are added). + fn add_all_starts( + &mut self, + dfa_state_ids: &mut Vec<StateID>, + ) -> Result<(), Error> { + // Always add the (possibly unanchored) start states for matching any + // of the patterns in this DFA. + self.add_start_group(None, dfa_state_ids)?; + // We only need to compute anchored start states for each pattern if it + // was requested to do so. + if self.dfa.has_starts_for_each_pattern() { + for pid in PatternID::iter(self.dfa.pattern_count()) { + self.add_start_group(Some(pid), dfa_state_ids)?; + } + } + Ok(()) + } + + /// Add a group of start states for the given match pattern ID. Any new + /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are + /// pushed.) + /// + /// When pattern_id is None, then this will compile a group of unanchored + /// start states (if the DFA is unanchored). When the pattern_id is + /// present, then this will compile a group of anchored start states that + /// only match the given pattern. + fn add_start_group( + &mut self, + pattern_id: Option<PatternID>, + dfa_state_ids: &mut Vec<StateID>, + ) -> Result<(), Error> { + let nfa_start = match pattern_id { + Some(pid) => self.nfa.start_pattern(pid), + None if self.config.anchored => self.nfa.start_anchored(), + None => self.nfa.start_unanchored(), + }; + + // When compiling start states, we're careful not to build additional + // states that aren't necessary. For example, if the NFA has no word + // boundary assertion, then there's no reason to have distinct start + // states for 'NonWordByte' and 'WordByte' starting configurations. + // Instead, the 'WordByte' starting configuration can just point + // directly to the start state for the 'NonWordByte' config. + + let (id, is_new) = + self.add_one_start(nfa_start, Start::NonWordByte)?; + self.dfa.set_start_state(Start::NonWordByte, pattern_id, id); + if is_new { + dfa_state_ids.push(id); + } + + if !self.nfa.has_word_boundary() { + self.dfa.set_start_state(Start::WordByte, pattern_id, id); + } else { + let (id, is_new) = + self.add_one_start(nfa_start, Start::WordByte)?; + self.dfa.set_start_state(Start::WordByte, pattern_id, id); + if is_new { + dfa_state_ids.push(id); + } + } + if !self.nfa.has_any_anchor() { + self.dfa.set_start_state(Start::Text, pattern_id, id); + self.dfa.set_start_state(Start::Line, pattern_id, id); + } else { + let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?; + self.dfa.set_start_state(Start::Text, pattern_id, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?; + self.dfa.set_start_state(Start::Line, pattern_id, id); + if is_new { + dfa_state_ids.push(id); + } + } + + Ok(()) + } + + /// Add a new DFA start state corresponding to the given starting NFA + /// state, and the starting search configuration. (The starting search + /// configuration essentially tells us which look-behind assertions are + /// true for this particular state.) + /// + /// The boolean returned indicates whether the state ID returned is a newly + /// created state, or a previously cached state. 
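+ ///
+ /// For reference, the 'Start' configurations passed to this routine
+ /// encode what precedes the search position (a sketch of their intended
+ /// meaning; the precise semantics live with the 'Start' type):
+ ///
+ /// ```ignore
+ /// Start::Text        // the beginning of the haystack
+ /// Start::Line        // the previous byte is \n
+ /// Start::WordByte    // the previous byte is an ASCII word byte
+ /// Start::NonWordByte // the previous byte is anything else
+ /// ```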
+ fn add_one_start( + &mut self, + nfa_start: StateID, + start: Start, + ) -> Result<(StateID, bool), Error> { + // Compute the look-behind assertions that are true in this starting + // configuration, and then determine the epsilon closure. While + // computing the epsilon closure, we only follow conditional epsilon + // transitions that satisfy the computed look-behind assertions. + let mut builder_matches = self.get_state_builder().into_matches(); + util::determinize::set_lookbehind_from_start( + &start, + &mut builder_matches, + ); + self.sparses.set1.clear(); + util::determinize::epsilon_closure( + self.nfa, + nfa_start, + *builder_matches.look_have(), + &mut self.stack, + &mut self.sparses.set1, + ); + let mut builder = builder_matches.into_nfa(); + util::determinize::add_nfa_states( + &self.nfa, + &self.sparses.set1, + &mut builder, + ); + self.maybe_add_state(builder) + } + + /// Adds the given state to the DFA being built depending on whether it + /// already exists in this determinizer's cache. + /// + /// If it does exist, then the memory used by 'state' is put back into the + /// determinizer and the previously created state's ID is returned. (Along + /// with 'false', indicating that no new state was added.) + /// + /// If it does not exist, then the state is added to the DFA being built + /// and a fresh ID is allocated (if ID allocation fails, then an error is + /// returned) and returned. (Along with 'true', indicating that a new state + /// was added.) + fn maybe_add_state( + &mut self, + builder: StateBuilderNFA, + ) -> Result<(StateID, bool), Error> { + if let Some(&cached_id) = self.cache.get(builder.as_bytes()) { + // Since we have a cached state, put the constructed state's + // memory back into our scratch space, so that it can be reused. + self.put_state_builder(builder); + return Ok((cached_id, false)); + } + self.add_state(builder).map(|sid| (sid, true)) + } + + /// Add the given state to the DFA and make it available in the cache. + /// + /// The state initially has no transitions. That is, it transitions to the + /// dead state for all possible inputs, and transitions to the quit state + /// for all quit bytes. + /// + /// If adding the state would exceed the maximum value for StateID, then an + /// error is returned. + fn add_state( + &mut self, + builder: StateBuilderNFA, + ) -> Result<StateID, Error> { + let id = self.dfa.add_empty_state()?; + if !self.config.quit.is_empty() { + for b in self.config.quit.iter() { + self.dfa.set_transition( + id, + alphabet::Unit::u8(b), + self.dfa.quit_id(), + ); + } + } + let state = builder.to_state(); + // States use reference counting internally, so we only need to count + // their memory usage once. + self.memory_usage_state += state.memory_usage(); + self.builder_states.push(state.clone()); + self.cache.insert(state, id); + self.put_state_builder(builder); + if let Some(limit) = self.config.dfa_size_limit { + if self.dfa.memory_usage() > limit { + return Err(Error::dfa_exceeded_size_limit(limit)); + } + } + if let Some(limit) = self.config.determinize_size_limit { + if self.memory_usage() > limit { + return Err(Error::determinize_exceeded_size_limit(limit)); + } + } + Ok(id) + } + + /// Returns a state builder from this determinizer that might have existing + /// capacity. This helps avoid allocs in cases where a state is built that + /// turns out to already be cached. + /// + /// Callers must put the state builder back with 'put_state_builder', + /// otherwise the allocation reuse won't work.
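+ ///
+ /// A sketch of the intended pairing, as used by 'maybe_add_state':
+ ///
+ /// ```ignore
+ /// let builder = self.get_state_builder();
+ /// // ... fill the builder with a candidate DFA state ...
+ /// // Whether the state turns out to be cached or freshly added, the
+ /// // builder's allocation is eventually handed back:
+ /// self.put_state_builder(builder);
+ /// ```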
+ fn get_state_builder(&mut self) -> StateBuilderEmpty { + core::mem::replace( + &mut self.scratch_state_builder, + StateBuilderEmpty::new(), + ) + } + + /// Puts the given state builder back into this determinizer for reuse. + /// + /// Note that building a 'State' from a builder always creates a new + /// alloc, so callers should always put the builder back. + fn put_state_builder(&mut self, builder: StateBuilderNFA) { + let _ = core::mem::replace( + &mut self.scratch_state_builder, + builder.clear(), + ); + } + + /// Return the memory usage, in bytes, of this determinizer at the current + /// point in time. This does not include memory used by the NFA or the + /// dense DFA itself. + fn memory_usage(&self) -> usize { + use core::mem::size_of; + + self.builder_states.len() * size_of::<State>() + // Maps likely use more memory than this, but it's probably close. + + self.cache.len() * (size_of::<State>() + size_of::<StateID>()) + + self.memory_usage_state + + self.stack.capacity() * size_of::<StateID>() + + self.scratch_state_builder.capacity() + } +} diff --git a/vendor/regex-automata-0.2.0/src/dfa/error.rs b/vendor/regex-automata-0.2.0/src/dfa/error.rs new file mode 100644 index 000000000..6497a4cff --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/error.rs @@ -0,0 +1,162 @@ +use crate::{ + nfa, + util::{ + id::{PatternID, StateID}, + start::Start, + }, +}; + +/// An error that occurred during the construction of a DFA. +/// +/// This error does not provide many introspection capabilities. There are +/// generally only two things you can do with it: +/// +/// * Obtain a human readable message via its `std::fmt::Display` impl. +/// * Access an underlying [`nfa::thompson::Error`] type from its `source` +/// method via the `std::error::Error` trait. This error only occurs when using +/// convenience routines for building a DFA directly from a pattern string. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +/// The kind of error that occurred during the construction of a DFA. +/// +/// Note that this error is non-exhaustive. Adding new variants is not +/// considered a breaking change. +#[derive(Clone, Debug)] +enum ErrorKind { + /// An error that occurred while constructing an NFA as a precursor step + /// before a DFA is compiled. + NFA(nfa::thompson::Error), + /// An error that occurred because an unsupported regex feature was used. + /// The message string describes which unsupported feature was used. + /// + /// The primary regex feature that is unsupported by DFAs is the Unicode + /// word boundary look-around assertion (`\b`). This can be worked around + /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling the + /// [`dense::Builder::allow_unicode_word_boundary`](dense/struct.Builder.html#method.allow_unicode_word_boundary) + /// option when building a DFA. + Unsupported(&'static str), + /// An error that occurs if too many states are produced while building a + /// DFA. + TooManyStates, + /// An error that occurs if too many start states are needed while building + /// a DFA. + /// + /// This is a kind of oddball error that occurs when building a DFA with + /// start states enabled for each pattern and enough patterns to cause + /// the table of start states to overflow `usize`. + TooManyStartStates, + /// This is another oddball error that can occur if there are too many + /// patterns spread out across too many match states. 
+ TooManyMatchPatternIDs, + /// An error that occurs if the DFA got too big during determinization. + DFAExceededSizeLimit { limit: usize }, + /// An error that occurs if auxiliary storage (not the DFA) used during + /// determinization got too big. + DeterminizeExceededSizeLimit { limit: usize }, +} + +impl Error { + /// Return the kind of this error. + fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub(crate) fn nfa(err: nfa::thompson::Error) -> Error { + Error { kind: ErrorKind::NFA(err) } + } + + pub(crate) fn unsupported_dfa_word_boundary_unicode() -> Error { + let msg = "cannot build DFAs for regexes with Unicode word \ + boundaries; switch to ASCII word boundaries, or \ + heuristically enable Unicode word boundaries or use a \ + different regex engine"; + Error { kind: ErrorKind::Unsupported(msg) } + } + + pub(crate) fn too_many_states() -> Error { + Error { kind: ErrorKind::TooManyStates } + } + + pub(crate) fn too_many_start_states() -> Error { + Error { kind: ErrorKind::TooManyStartStates } + } + + pub(crate) fn too_many_match_pattern_ids() -> Error { + Error { kind: ErrorKind::TooManyMatchPatternIDs } + } + + pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> Error { + Error { kind: ErrorKind::DFAExceededSizeLimit { limit } } + } + + pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> Error { + Error { kind: ErrorKind::DeterminizeExceededSizeLimit { limit } } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind() { + ErrorKind::NFA(ref err) => Some(err), + ErrorKind::Unsupported(_) => None, + ErrorKind::TooManyStates => None, + ErrorKind::TooManyStartStates => None, + ErrorKind::TooManyMatchPatternIDs => None, + ErrorKind::DFAExceededSizeLimit { .. } => None, + ErrorKind::DeterminizeExceededSizeLimit { .. } => None, + } + } +} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind() { + ErrorKind::NFA(_) => write!(f, "error building NFA"), + ErrorKind::Unsupported(ref msg) => { + write!(f, "unsupported regex feature for DFAs: {}", msg) + } + ErrorKind::TooManyStates => write!( + f, + "number of DFA states exceeds limit of {}", + StateID::LIMIT, + ), + ErrorKind::TooManyStartStates => { + let stride = Start::count(); + // The start table has `stride` entries for starting states for + // the entire DFA, and then `stride` entries for each pattern + // if start states for each pattern are enabled (which is the + // only way this error can occur). Thus, the total number of + // patterns that can fit in the table is `stride` less than + // what we can allocate. 
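+ //
+ // As a concrete sketch: Start::count() is the number of starting
+ // configurations (four variants appear in this crate), so the limit
+ // below works out to (isize::MAX - 4) / 4 patterns.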
+ let limit = ((core::isize::MAX as usize) - stride) / stride; + write!( + f, + "compiling DFA with start states exceeds pattern \ + limit of {}", + limit, + ) + } + ErrorKind::TooManyMatchPatternIDs => write!( + f, + "compiling DFA with total patterns in all match states \ + exceeds limit of {}", + PatternID::LIMIT, + ), + ErrorKind::DFAExceededSizeLimit { limit } => write!( + f, + "DFA exceeded size limit of {:?} during determinization", + limit, + ), + ErrorKind::DeterminizeExceededSizeLimit { limit } => { + write!(f, "determinization exceeded size limit of {:?}", limit) + } + } + } +} diff --git a/vendor/regex-automata-0.2.0/src/dfa/minimize.rs b/vendor/regex-automata-0.2.0/src/dfa/minimize.rs new file mode 100644 index 000000000..80e2f4e73 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/minimize.rs @@ -0,0 +1,461 @@ +use core::{cell::RefCell, fmt, mem}; + +use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; + +use crate::{ + dfa::{automaton::Automaton, dense, DEAD}, + util::{ + alphabet, + id::{PatternID, StateID}, + }, +}; + +/// An implementation of Hopcroft's algorithm for minimizing DFAs. +/// +/// The algorithm implemented here is mostly taken from Wikipedia: +/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm +/// +/// This code has had some light optimization attention paid to it, +/// particularly in the form of reducing allocation as much as possible. +/// However, it is still generally slow. Future optimization work should +/// probably focus on the bigger picture rather than micro-optimizations. For +/// example: +/// +/// 1. Figure out how to more intelligently create initial partitions. That is, +/// Hopcroft's algorithm starts by creating two partitions of DFA states +/// that are known to NOT be equivalent: match states and non-match states. +/// The algorithm proceeds by progressively refining these partitions into +/// smaller partitions. If we could start with more partitions, then we +/// could reduce the amount of work that Hopcroft's algorithm needs to do. +/// 2. For every partition that we visit, we find all incoming transitions to +/// every state in the partition for *every* element in the alphabet. (This +/// is why using byte classes can significantly decrease minimization times, +/// since byte classes shrink the alphabet.) This is quite costly and there +/// is perhaps some redundant work being performed depending on the specific +/// states in the set. For example, we might be able to only visit some +/// elements of the alphabet based on the transitions. +/// 3. Move parts of minimization into determinization. If minimization has +/// fewer states to deal with, then it should run faster. A prime example +/// of this might be large Unicode classes, which are generated in a way that +/// can create a lot of redundant states. (Some work has been done on this +/// point during NFA compilation via the algorithm described in the +/// "Incremental Construction of Minimal Acyclic Finite-State Automata" +/// paper.) +pub(crate) struct Minimizer<'a> { + dfa: &'a mut dense::OwnedDFA, + in_transitions: Vec<Vec<Vec<StateID>>>, + partitions: Vec<StateSet>, + waiting: Vec<StateSet>, +} + +impl<'a> fmt::Debug for Minimizer<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Minimizer") + .field("dfa", &self.dfa) + .field("in_transitions", &self.in_transitions) + .field("partitions", &self.partitions) + .field("waiting", &self.waiting) + .finish() + } +} + +/// A set of states.
A state set makes up a single partition in Hopcroft's +/// algorithm. +/// +/// It is represented by an ordered set of state identifiers. We use shared +/// ownership so that a single state set can be in both the set of partitions +/// and in the set of waiting sets simultaneously without an additional +/// allocation. Generally, once a state set is built, it becomes immutable. +/// +/// We use this representation because it avoids the overhead of more +/// traditional set data structures (HashSet/BTreeSet), and also because +/// computing intersection/subtraction on this representation is especially +/// fast. +#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +struct StateSet { + ids: Rc<RefCell<Vec<StateID>>>, +} + +impl<'a> Minimizer<'a> { + pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { + let in_transitions = Minimizer::incoming_transitions(dfa); + let partitions = Minimizer::initial_partitions(dfa); + let waiting = partitions.clone(); + Minimizer { dfa, in_transitions, partitions, waiting } + } + + pub fn run(mut self) { + let stride2 = self.dfa.stride2(); + let as_state_id = |index: usize| -> StateID { + StateID::new(index << stride2).unwrap() + }; + let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; + + let mut incoming = StateSet::empty(); + let mut scratch1 = StateSet::empty(); + let mut scratch2 = StateSet::empty(); + let mut newparts = vec![]; + + // This loop is basically Hopcroft's algorithm. Everything else is just + // shuffling data around to fit our representation. + while let Some(set) = self.waiting.pop() { + for b in self.dfa.byte_classes().iter() { + self.find_incoming_to(b, &set, &mut incoming); + // If incoming is empty, then the intersection with any other + // set must also be empty. So 'newparts' just ends up being + // 'self.partitions'. So there's no need to go through the loop + // below. + // + // This actually turns out to be a rather large optimization. On + // the order of making minimization 4-5x faster. It's likely + // that the vast majority of all states have very few incoming + // transitions. + if incoming.is_empty() { + continue; + } + + for p in 0..self.partitions.len() { + self.partitions[p].intersection(&incoming, &mut scratch1); + if scratch1.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + self.partitions[p].subtract(&incoming, &mut scratch2); + if scratch2.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + let (x, y) = + (scratch1.deep_clone(), scratch2.deep_clone()); + newparts.push(x.clone()); + newparts.push(y.clone()); + match self.find_waiting(&self.partitions[p]) { + Some(i) => { + self.waiting[i] = x; + self.waiting.push(y); + } + None => { + if x.len() <= y.len() { + self.waiting.push(x); + } else { + self.waiting.push(y); + } + } + } + } + newparts = mem::replace(&mut self.partitions, newparts); + newparts.clear(); + } + } + + // At this point, we now have a minimal partitioning of states, where + // each partition is an equivalence class of DFA states. Now we need to + // use this partitioning to update the DFA to only contain one state for + // each partition. + + // Create a map from DFA state ID to the representative ID of the + // equivalence class to which it belongs. The representative ID of an + // equivalence class of states is the minimum ID in that class.
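+ //
+ // For example (a sketch): if states {3, 7, 9} form one equivalence
+ // class, then each of 3, 7 and 9 is mapped to the representative 3.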
+ let mut state_to_part = vec![DEAD; self.dfa.state_count()]; + for p in &self.partitions { + p.iter(|id| state_to_part[as_index(id)] = p.min()); + } + + // Generate a new contiguous sequence of IDs for minimal states, and + // create a map from equivalence IDs to the new IDs. Thus, the new + // minimal ID of *any* state in the unminimized DFA can be obtained + // with minimal_ids[state_to_part[old_id]]. + let mut minimal_ids = vec![DEAD; self.dfa.state_count()]; + let mut new_index = 0; + for state in self.dfa.states() { + if state_to_part[as_index(state.id())] == state.id() { + minimal_ids[as_index(state.id())] = as_state_id(new_index); + new_index += 1; + } + } + // The total number of states in the minimal DFA. + let minimal_count = new_index; + // Convenience function for remapping state IDs. This takes an old ID, + // looks up its Hopcroft partition and then maps that to the new ID + // range. + let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])]; + + // Re-map this DFA in place such that the only states remaining + // correspond to the representative states of every equivalence class. + for id in (0..self.dfa.state_count()).map(as_state_id) { + // If this state isn't a representative for an equivalence class, + // then we skip it since it won't appear in the minimal DFA. + if state_to_part[as_index(id)] != id { + continue; + } + for (_, next) in self.dfa.state_mut(id).iter_mut() { + *next = remap(*next); + } + self.dfa.swap_states(id, minimal_ids[as_index(id)]); + } + // Trim off all unused states from the pre-minimized DFA. This + // represents all states that were merged into a non-singleton + // equivalence class of states, and appeared after the first state + // in each such class. (Because the state with the smallest ID in each + // equivalence class is its representative ID.) + self.dfa.truncate_states(minimal_count); + + // Update the new start states, which is now just the minimal ID of + // whatever state the old start state was collapsed into. Also, we + // collect everything beforehand to work around the borrow checker. + // We're already allocating so much that this is probably fine. If this + // turns out to be costly, then I guess add a `starts_mut` iterator. + let starts: Vec<_> = self.dfa.starts().collect(); + for (old_start_id, start_type, pid) in starts { + self.dfa.set_start_state(start_type, pid, remap(old_start_id)); + } + + // Update the match state pattern ID list for multi-regexes. All we + // need to do is remap the match state IDs. The pattern ID lists are + // always the same as they were since match states with distinct + // pattern ID lists are always considered distinct states. + let mut pmap = BTreeMap::new(); + for (match_id, pattern_ids) in self.dfa.pattern_map() { + let new_id = remap(match_id); + pmap.insert(new_id, pattern_ids); + } + // This unwrap is OK because minimization never increases the number of + // match states or patterns in those match states. Since minimization + // runs after the pattern map has already been set at least once, we + // know that our match states cannot error. + self.dfa.set_pattern_map(&pmap).unwrap(); + + // In order to update the ID of the maximum match state, we need to + // find the maximum ID among all of the match states in the minimized + // DFA. This is not necessarily the new ID of the unminimized maximum + // match state, since that could have been collapsed with a much + // earlier match state.
Therefore, to find the new max match state, + // we iterate over all previous match states, find their corresponding + // new minimal ID, and take the maximum of those. + let old = self.dfa.special().clone(); + let new = self.dfa.special_mut(); + // ... but only remap if we had match states. + if old.matches() { + new.min_match = StateID::MAX; + new.max_match = StateID::ZERO; + for i in as_index(old.min_match)..=as_index(old.max_match) { + let new_id = remap(as_state_id(i)); + if new_id < new.min_match { + new.min_match = new_id; + } + if new_id > new.max_match { + new.max_match = new_id; + } + } + } + // ... same, but for start states. + if old.starts() { + new.min_start = StateID::MAX; + new.max_start = StateID::ZERO; + for i in as_index(old.min_start)..=as_index(old.max_start) { + let new_id = remap(as_state_id(i)); + if new_id == DEAD { + continue; + } + if new_id < new.min_start { + new.min_start = new_id; + } + if new_id > new.max_start { + new.max_start = new_id; + } + } + if new.max_start == DEAD { + new.min_start = DEAD; + } + } + new.quit_id = remap(new.quit_id); + new.set_max(); + } + + fn find_waiting(&self, set: &StateSet) -> Option<usize> { + self.waiting.iter().position(|s| s == set) + } + + fn find_incoming_to( + &self, + b: alphabet::Unit, + set: &StateSet, + incoming: &mut StateSet, + ) { + incoming.clear(); + set.iter(|id| { + for &inid in + &self.in_transitions[self.dfa.to_index(id)][b.as_usize()] + { + incoming.add(inid); + } + }); + incoming.canonicalize(); + } + + fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> { + // For match states, we know that two match states with different + // pattern ID lists will *always* be distinct, so we can partition them + // initially based on that. + let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new(); + let mut is_quit = StateSet::empty(); + let mut no_match = StateSet::empty(); + for state in dfa.states() { + if dfa.is_match_state(state.id()) { + let mut pids = vec![]; + for i in 0..dfa.match_count(state.id()) { + pids.push(dfa.match_pattern(state.id(), i)); + } + matching + .entry(pids) + .or_insert(StateSet::empty()) + .add(state.id()); + } else if dfa.is_quit_state(state.id()) { + is_quit.add(state.id()); + } else { + no_match.add(state.id()); + } + } + + let mut sets: Vec<StateSet> = + matching.into_iter().map(|(_, set)| set).collect(); + sets.push(no_match); + sets.push(is_quit); + sets + } + + fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> { + let mut incoming = vec![]; + for _ in dfa.states() { + incoming.push(vec![vec![]; dfa.alphabet_len()]); + } + for state in dfa.states() { + for (b, next) in state.transitions() { + incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); + } + } + incoming + } +} + +impl StateSet { + fn empty() -> StateSet { + StateSet { ids: Rc::new(RefCell::new(vec![])) } + } + + fn add(&mut self, id: StateID) { + self.ids.borrow_mut().push(id); + } + + fn min(&self) -> StateID { + self.ids.borrow()[0] + } + + fn canonicalize(&mut self) { + self.ids.borrow_mut().sort(); + self.ids.borrow_mut().dedup(); + } + + fn clear(&mut self) { + self.ids.borrow_mut().clear(); + } + + fn len(&self) -> usize { + self.ids.borrow().len() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn deep_clone(&self) -> StateSet { + let ids = self.ids.borrow().iter().cloned().collect(); + StateSet { ids: Rc::new(RefCell::new(ids)) } + } + + fn iter<F: FnMut(StateID)>(&self, mut f: F) { + for &id in self.ids.borrow().iter() { + f(id); + } + } + + fn 
intersection(&self, other: &StateSet, dest: &mut StateSet) { + dest.clear(); + if self.is_empty() || other.is_empty() { + return; + } + + let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => break, + Some(b) => b, + }; + } else if a < b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => break, + Some(b) => b, + }; + } + } + } + + fn subtract(&self, other: &StateSet, dest: &mut StateSet) { + dest.clear(); + if self.is_empty() || other.is_empty() { + self.iter(|s| dest.add(s)); + return; + } + + let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } else if a < b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } + } + for a in ita { + dest.add(a); + } + } +} diff --git a/vendor/regex-automata-0.2.0/src/dfa/mod.rs b/vendor/regex-automata-0.2.0/src/dfa/mod.rs new file mode 100644 index 000000000..6f9fe605e --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/mod.rs @@ -0,0 +1,363 @@ +/*! +A module for building and searching with deterministic finite automata (DFAs). + +Like other modules in this crate, DFAs support a rich regex syntax with Unicode +features. DFAs also have extensive options for configuring the best space vs +time trade off for your use case and provide support for cheap deserialization +of automata for use in `no_std` environments. + +If you're looking for lazy DFAs that build themselves incrementally during +search, then please see the top-level [`hybrid` module](crate::hybrid). + +# Overview + +This section gives a brief overview of the primary types in this module: + +* A [`regex::Regex`] provides a way to search for matches of a regular +expression using DFAs. This includes iterating over matches with both the start +and end positions of each match. +* A [`dense::DFA`] provides low level access to a DFA that uses a dense +representation (uses lots of space, but fast searching). +* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse +representation (uses less space, but slower searching). +* An [`Automaton`] trait that defines an interface that both dense and sparse +DFAs implement. (A `regex::Regex` is generic over this trait.) +* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g., +[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g., +[`dense::DFA::from_bytes`]).
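+
+For example, since both DFA types implement the [`Automaton`] trait, a search
+routine can be written once and reused with either representation. Here is a
+brief sketch (`has_match` is a hypothetical helper, not part of this crate):
+
+```
+use regex_automata::dfa::{dense, Automaton};
+
+fn has_match<A: Automaton>(dfa: &A, haystack: &[u8]) -> bool {
+    // find_leftmost_fwd reports the end offset of the leftmost match, if any.
+    dfa.find_leftmost_fwd(haystack).unwrap().is_some()
+}
+
+let dfa = dense::DFA::new(r"[0-9]{4}")?;
+assert!(has_match(&dfa, b"year: 2018"));
+# Ok::<(), Box<dyn std::error::Error>>(())
+```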
+ +# Example: basic regex searching + +This example shows how to compile a regex using the default configuration +and then use it to find matches in a byte string: + +``` +use regex_automata::{MultiMatch, dfa::regex::Regex}; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Example: searching with regex sets + +The DFAs in this module all fully support searching with multiple regexes +simultaneously. You can use this support with standard leftmost-first style +searching to find non-overlapping matches: + +``` +use regex_automata::{MultiMatch, dfa::regex::Regex}; + +let re = Regex::new_many(&[r"\w+", r"\S+"])?; +let text = b"@foo bar"; +let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(1, 0, 4), + MultiMatch::must(0, 5, 8), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +Or use overlapping style searches to find all possible occurrences: + +``` +use regex_automata::{MatchKind, MultiMatch, dfa::{dense, regex::Regex}}; + +// N.B. For overlapping searches, we need the underlying DFA to report all +// possible matches. +let re = Regex::builder() + .dense(dense::Config::new().match_kind(MatchKind::All)) + .build_many(&[r"\w{3}", r"\S{3}"])?; +let text = b"@foo bar"; +let matches: Vec<MultiMatch> = re.find_overlapping_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(1, 0, 3), + MultiMatch::must(0, 1, 4), + MultiMatch::must(1, 1, 4), + MultiMatch::must(0, 5, 8), + MultiMatch::must(1, 5, 8), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Example: use sparse DFAs + +By default, compiling a regex will use dense DFAs internally. This uses more +memory, but executes searches more quickly. If you can abide slower searches +(somewhere around 3-5x), then sparse DFAs might make more sense since they can +use significantly less space. + +Using sparse DFAs is as easy as using `Regex::new_sparse` instead of +`Regex::new`: + +``` +use regex_automata::{MultiMatch, dfa::regex::Regex}; + +let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +If you already have dense DFAs for some reason, they can be converted to sparse +DFAs and used to build a new `Regex`. For example: + +``` +use regex_automata::{MultiMatch, dfa::regex::Regex}; + +let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let sparse_re = Regex::builder().build_from_dfas( + dense_re.forward().to_sparse()?, + dense_re.reverse().to_sparse()?, +); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<MultiMatch> = sparse_re.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Example: deserialize a DFA + +This shows how to first serialize a DFA into raw bytes, and then deserialize +those raw bytes back into a DFA. While this particular example is a +bit contrived, this same technique can be used in your program to +deserialize a DFA at start up time or by memory mapping a file. 
+ +``` +use regex_automata::{MultiMatch, dfa::{dense, regex::Regex}}; + +let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +// serialize both the forward and reverse DFAs, see note below +let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian(); +let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian(); +// now deserialize both---we need to specify the correct type! +let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0; +let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0; +// finally, reconstruct our regex +let re2 = Regex::builder().build_from_dfas(fwd, rev); + +// we can use it like normal +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +There are a few points worth noting here: + +* We need to extract the raw DFAs used by the regex and serialize those. You +can build the DFAs manually yourself using [`dense::Builder`], but using +the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In +particular, a `Regex` constructs a reverse DFA for finding the starting +location of matches.) +* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method. +In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`] +or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're +deserializing your DFA from. If you intend to deserialize on either platform, +then you'll need to serialize both and deserialize the right one depending on +your target's endianness. +* Safely deserializing a DFA requires verifying the raw bytes, particularly if +they are untrusted, since an invalid DFA could cause logical errors, panics +or even undefined behavior. This verification step requires visiting all of +the transitions in the DFA, which can be costly. If cheaper verification is +desired, then [`dense::DFA::from_bytes_unchecked`] is available; it only does +verification that can be performed in constant time. However, one can only use +this routine if the caller can guarantee that the bytes provided encode a +valid DFA. + +The same process can be achieved with sparse DFAs as well: + +``` +use regex_automata::{MultiMatch, dfa::{sparse, regex::Regex}}; + +let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +// serialize both +let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian(); +let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian(); +// now deserialize both---we need to specify the correct type! +let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0; +let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0; +// finally, reconstruct our regex +let re2 = Regex::builder().build_from_dfas(fwd, rev); + +// we can use it like normal +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +Note that unlike dense DFAs, sparse DFAs have no alignment requirements. +Conversely, dense DFAs must be aligned to the same alignment as a +[`StateID`](crate::util::id::StateID). + +# Support for `no_std` and `alloc`-only + +This crate comes with `alloc` and `std` features that are enabled by default.
+When the `alloc` or `std` features are enabled, the API of this module will +include the facilities necessary for compiling, serializing, deserializing +and searching with DFAs. When only the `alloc` feature is enabled, then +implementations of the `std::error::Error` trait are dropped, but everything +else generally remains the same. When both the `alloc` and `std` features are +disabled, the API of this module will shrink such that it only includes the +facilities necessary for deserializing and searching with DFAs. + +The intended workflow for `no_std` environments is thus as follows: + +* Write a program with the `alloc` or `std` features that compiles and +serializes a regular expression. You may need to serialize both little and big +endian versions of each DFA. (So that's 4 DFAs in total for each regex.) +* In your `no_std` environment, follow the examples above for deserializing +your previously serialized DFAs into regexes. You can then search with them as +you would any regex. + +Deserialization can happen anywhere. For example, with bytes embedded into a +binary or with a file memory mapped at runtime. + +TODO: Include link to `regex-cli` here pointing out how to generate Rust code +for deserializing DFAs. + +# Syntax + +This module supports the same syntax as the `regex` crate, since they share the +same parser. You can find an exhaustive list of supported syntax in the +[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax). + +There are two things that are not supported by the DFAs in this module: + +* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top +of them) can only find the offsets of an entire match, but cannot resolve +the offsets of each capturing group. This is because DFAs do not have the +expressive power necessary. +* Unicode word boundaries. These present particularly difficult challenges for +DFA construction and would result in an explosion in the number of states. +One can enable [`dense::Config::unicode_word_boundary`] though, which provides +heuristic support for Unicode word boundaries that only works on ASCII text. +Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work +on any input. + +There are no plans to lift either of these limitations. + +Note that these restrictions are identical to the restrictions on lazy DFAs. + +# Differences with general purpose regexes + +The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a +general purpose regular expression engine. It aims to automatically balance low +compile times, fast search times and low memory usage, while also providing +a convenient API for users. In contrast, this module provides a lower level +regular expression interface based exclusively on DFAs that is a bit less +convenient while providing more explicit control over memory usage and search +times. + +Here are some specific negative differences: + +* **Compilation can take an exponential amount of time and space** in the size +of the regex pattern. While most patterns do not exhibit worst case exponential +time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA +with approximately `2^(N+2)` states. For this reason, untrusted patterns should +not be compiled with this module. (In the future, the API may expose an option +to return an error if the DFA gets too big.) +* This module does not support sub-match extraction via capturing groups, which +can be achieved with the regex crate's "captures" API. 
+* While the regex crate doesn't necessarily sport fast compilation times,
+the regexes in this module are almost universally slow to compile, especially
+when they contain large Unicode character classes. For example, on my system,
+compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
+a sparse regex takes about the same time but only uses about 1.2MB of
+memory.) Conversely, compiling the same regex without Unicode support, e.g.,
+`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
+reason, you should only use Unicode character classes if you absolutely need
+them! (They are enabled by default though.)
+* This module does not support Unicode word boundaries. ASCII word boundaries
+may be used though by disabling Unicode or selectively doing so in the syntax,
+e.g., `(?-u:\b)`. There is also an option to
+[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary),
+where the corresponding DFA will give up if any non-ASCII byte is seen.
+* As a lower level API, this module does not do literal optimizations
+automatically, although it does provide hooks in its API to make use of the
+[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal
+optimizations means that searches may run much slower than what you're
+accustomed to, although it does provide more predictable and consistent
+performance.
+* There is no `&str` API like in the regex crate. In this module, all APIs
+operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8
+boundaries, unless any of [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8),
+[`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) or
+[`regex::Config::utf8`] are disabled.
+
+With some of the downsides out of the way, here are some positive differences:
+
+* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
+deserialized. Deserialization can be done in constant time with the unchecked
+APIs, since searching can be performed directly on the raw serialized bytes of
+a DFA.
+* This module was specifically designed so that the searching phase of a
+DFA has minimal runtime requirements, and can therefore be used in `no_std`
+environments. While `no_std` environments cannot compile regexes, they can
+deserialize pre-compiled regexes.
+* Since this module builds DFAs ahead of time, it will generally out-perform
+the `regex` crate on equivalent tasks. The performance difference is likely
+not large. However, because of a complex set of optimizations in the regex
+crate (like literal optimizations), an accurate performance comparison may be
+difficult to do.
+* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
+performance a small amount, but uses much less storage space, potentially even
+less than what the regex crate uses.
+* This module exposes DFAs directly, such as [`dense::DFA`] and
+[`sparse::DFA`], which enables one to do less work in some cases. For example,
+if you only need the end of a match and not the start of a match, then you can
+use a DFA directly without building a `Regex`, which always requires a second
+DFA to find the start of a match.
+* This module provides more control over memory usage. Aside from choosing
+between dense and sparse DFAs, one can also choose a smaller state identifier
+representation to use less space. Also, one can enable DFA minimization
+via [`dense::Config::minimize`], but it can increase compilation times
+dramatically. 
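+
+# Example: choosing an endianness when deserializing
+
+As described in the `no_std` workflow above, one generally serializes both
+little and big endian versions of each DFA and picks the appropriate one at
+deserialization time. The following is a minimal sketch of how that choice
+might be made using Rust's `cfg!(target_endian = ...)` macro. The pattern used
+here is arbitrary, and for brevity this works with a single forward DFA
+directly instead of a full `Regex` with both forward and reverse DFAs:
+
+```
+use regex_automata::dfa::{dense, Automaton};
+
+// In a real program, the bytes for both endiannesses would be produced ahead
+// of time and embedded into or shipped alongside the final binary.
+let dfa = dense::DFA::new(r"[0-9]+")?;
+let (le_bytes, le_pad) = dfa.to_bytes_little_endian();
+let (be_bytes, be_pad) = dfa.to_bytes_big_endian();
+
+// At deserialization time, pick the encoding that matches the target.
+let dfa: dense::DFA<&[u32]> = if cfg!(target_endian = "little") {
+ dense::DFA::from_bytes(&le_bytes[le_pad..])?.0
+} else {
+ dense::DFA::from_bytes(&be_bytes[be_pad..])?.0
+};
+
+// A forward DFA alone reports the end offset of the leftmost match.
+let m = dfa.find_leftmost_fwd(b"abc123")?.unwrap();
+assert_eq!(6, m.offset());
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+The same selection applies to the reverse DFA (and to sparse DFAs) when
+reconstructing a full `Regex` as shown in the deserialization examples above. 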
+*/
+
+pub use crate::dfa::automaton::{Automaton, OverlappingState};
+#[cfg(feature = "alloc")]
+pub use crate::dfa::error::Error;
+
+/// This is an alias for a state ID of zero. It has special significance
+/// because it always corresponds to the first state in a DFA, and the first
+/// state in a DFA is always "dead." That is, the dead state always has all
+/// of its transitions set to itself. Moreover, the dead state is used as a
+/// sentinel for various things. For example, in search, reaching a dead
+/// state means that the search must stop.
+const DEAD: crate::util::id::StateID = crate::util::id::StateID::ZERO;
+
+mod accel;
+mod automaton;
+pub mod dense;
+#[cfg(feature = "alloc")]
+mod determinize;
+#[cfg(feature = "alloc")]
+pub(crate) mod error;
+#[cfg(feature = "alloc")]
+mod minimize;
+pub mod regex;
+mod search;
+pub mod sparse;
+mod special;
+#[cfg(feature = "transducer")]
+mod transducer;
diff --git a/vendor/regex-automata-0.2.0/src/dfa/regex.rs b/vendor/regex-automata-0.2.0/src/dfa/regex.rs
new file mode 100644
index 000000000..d0917e17d
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/dfa/regex.rs
@@ -0,0 +1,2146 @@
+/*!
+A DFA-backed `Regex`.
+
+This module provides [`Regex`], which is defined generically over the
+[`Automaton`] trait. A `Regex` implements convenience routines you might have
+come to expect, such as finding the start/end of a match and iterating over
+all non-overlapping matches. This `Regex` type is limited in its capabilities
+to what a DFA can provide. Therefore, APIs involving capturing groups, for
+example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, whereas the other is a "reverse" DFA that
+finds the start offset of a match.
+
+See the [parent module](crate::dfa) for examples.
+*/
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+use crate::{
+ dfa::automaton::{Automaton, OverlappingState},
+ util::prefilter::{self, Prefilter},
+ MatchError, MultiMatch,
+};
+#[cfg(feature = "alloc")]
+use crate::{
+ dfa::{dense, error::Error, sparse},
+ nfa::thompson,
+ util::matchtypes::MatchKind,
+};
+
+// When the alloc feature is enabled, the regex type sets its A type parameter
+// to default to an owned dense DFA. But without alloc, we set no default. This
+// makes things a lot more convenient in the common case, since writing out the
+// DFA types is pretty annoying.
+//
+// Since we have two different definitions but only want to write one doc
+// string, we use a macro to capture the doc and other attributes once and then
+// repeat them for each definition.
+macro_rules! define_regex_type {
+ ($(#[$doc:meta])*) => {
+ #[cfg(feature = "alloc")]
+ $(#[$doc])*
+ pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> {
+ prefilter: Option<P>,
+ forward: A,
+ reverse: A,
+ utf8: bool,
+ }
+
+ #[cfg(not(feature = "alloc"))]
+ $(#[$doc])*
+ pub struct Regex<A, P = prefilter::None> {
+ prefilter: Option<P>,
+ forward: A,
+ reverse: A,
+ utf8: bool,
+ }
+ };
+}
+
+define_regex_type!(
+ /// A regular expression that uses deterministic finite automata for fast
+ /// searching.
+ ///
+ /// A regular expression is composed of two DFAs, a "forward" DFA and a
+ /// "reverse" DFA. The forward DFA is responsible for detecting the end of
+ /// a match while the reverse DFA is responsible for detecting the start
+ /// of a match. Thus, in order to find the bounds of any given match, a
+ /// forward search must first be run followed by a reverse search. 
A match
+ /// found by the forward DFA guarantees that the reverse DFA will also find
+ /// a match.
+ ///
+ /// The type of the DFA used by a `Regex` corresponds to the `A` type
+ /// parameter, which must satisfy the [`Automaton`] trait. Typically,
+ /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
+ /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
+ /// memory but search faster, while sparse DFAs use less memory but search
+ /// more slowly.
+ ///
+ /// By default, a regex's automaton type parameter is set to
+ /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
+ /// in-memory workloads, this is the most convenient type that gives the
+ /// best search performance. When the `alloc` feature is disabled, no
+ /// default type is used.
+ ///
+ /// A `Regex` also has a `P` type parameter, which is used to select the
+ /// prefilter used during search. By default, no prefilter is enabled,
+ /// which corresponds to the default [`prefilter::None`] type. A prefilter
+ /// can be attached by using the [`Regex::with_prefilter`] method.
+ ///
+ /// # When should I use this?
+ ///
+ /// Generally speaking, if you can afford the overhead of building a full
+ /// DFA for your regex, and you don't need things like capturing groups,
+ /// then this is a good choice if you're looking to optimize for matching
+ /// speed. Note however that its speed may be worse than that of a general
+ /// purpose regex engine if you don't select a good [prefilter].
+ ///
+ /// # Earliest vs Leftmost vs Overlapping
+ ///
+ /// The search routines exposed on a `Regex` reflect three different ways
+ /// of searching:
+ ///
+ /// * "earliest" means to stop as soon as a match has been detected.
+ /// * "leftmost" means to continue matching until the underlying
+ /// automaton cannot advance. This reflects "standard" searching you
+ /// might be used to in other regex engines. For example, this permits
+ /// non-greedy and greedy searching to work as you would expect.
+ /// * "overlapping" means to find all possible matches, even if they
+ /// overlap.
+ ///
+ /// Generally speaking, when doing an overlapping search, you'll want to
+ /// build your regex DFAs with [`MatchKind::All`] semantics. Using
+ /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
+ /// likely to lead to odd behavior since `LeftmostFirst` specifically omits
+ /// some matches that can never be reported due to its semantics.
+ ///
+ /// The following example shows the differences between how these different
+ /// types of searches impact looking for matches of `[a-z]+` in the
+ /// haystack `abc`.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch};
+ ///
+ /// let pattern = r"[a-z]+";
+ /// let haystack = "abc".as_bytes();
+ ///
+ /// // With leftmost-first semantics, we test "earliest" and "leftmost". 
+ /// let re = dfa::regex::Builder::new() + /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) + /// .build(pattern)?; + /// + /// // "earliest" searching isn't impacted by greediness + /// let mut it = re.find_earliest_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// // "leftmost" searching supports greediness (and non-greediness) + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// // For overlapping, we want "all" match kind semantics. + /// let re = dfa::regex::Builder::new() + /// .dense(dense::Config::new().match_kind(MatchKind::All)) + /// .build(pattern)?; + /// + /// // In the overlapping search, we find all three possible matches + /// // starting at the beginning of the haystack. + /// let mut it = re.find_overlapping_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Sparse DFAs + /// + /// Since a `Regex` is generic over the [`Automaton`] trait, it can be + /// used with any kind of DFA. While this crate constructs dense DFAs by + /// default, it is easy enough to build corresponding sparse DFAs, and then + /// build a regex from them: + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// // First, build a regex that uses dense DFAs. + /// let dense_re = Regex::new("foo[0-9]+")?; + /// + /// // Second, build sparse DFAs from the forward and reverse dense DFAs. + /// let fwd = dense_re.forward().to_sparse()?; + /// let rev = dense_re.reverse().to_sparse()?; + /// + /// // Third, build a new regex from the constituent sparse DFAs. + /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); + /// + /// // A regex that uses sparse DFAs can be used just like with dense DFAs. + /// assert_eq!(true, sparse_re.is_match(b"foo123")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Alternatively, one can use a [`Builder`] to construct a sparse DFA + /// more succinctly. (Note though that dense DFAs are still constructed + /// first internally, and then converted to sparse DFAs, as in the example + /// above.) + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; + /// // A regex that uses sparse DFAs can be used just like with dense DFAs. + /// assert!(sparse_re.is_match(b"foo123")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Fallibility + /// + /// In non-default configurations, the DFAs generated in this module may + /// return an error during a search. (Currently, the only way this happens + /// is if quit bytes are added or Unicode word boundaries are heuristically + /// enabled, both of which are turned off by default.) For convenience, the + /// main search routines, like [`find_leftmost`](Regex::find_leftmost), + /// will panic if an error occurs. However, if you need to use DFAs + /// which may produce an error at search time, then there are fallible + /// equivalents of all search routines. 
For example, for `find_leftmost`,
+ /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost).
+ /// The routines prefixed with `try_` return `Result<Option<MultiMatch>,
+ /// MatchError>`, whereas the infallible routines simply return
+ /// `Option<MultiMatch>`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to cause a search to terminate if it sees a
+ /// `\n` byte, and handle the error returned. This could be useful if, for
+ /// example, you wanted to prevent a user supplied pattern from matching
+ /// across a line boundary.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{self, regex::Regex}, MatchError};
+ ///
+ /// let re = Regex::builder()
+ /// .dense(dfa::dense::Config::new().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ ///
+ /// let haystack = "foo\nbar".as_bytes();
+ /// // Normally this would produce a match, since \p{any} contains '\n'.
+ /// // But since we instructed the automaton to enter a quit state if a
+ /// // '\n' is observed, this produces a match error instead.
+ /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+ /// let got = re.try_find_leftmost(haystack).unwrap_err();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[derive(Clone, Debug)]
+);
+
+#[cfg(feature = "alloc")]
+impl Regex {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 3, 14)),
+ /// re.find_leftmost(b"zzzfoo12345barzzz"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<Regex, Error> {
+ Builder::new().build(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "regex set."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
+ ///
+ /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
+ /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> {
+ Builder::new().build_many(patterns)
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl Regex<sparse::DFA<Vec<u8>>> {
+ /// Parse the given regular expression using the default configuration,
+ /// except using sparse DFAs, and return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_sparse("foo[0-9]+bar")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 14)), + /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_sparse( + pattern: &str, + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { + Builder::new().build_sparse(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set" + /// using sparse DFAs. This otherwise similarly uses the default regex + /// configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; + /// + /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many_sparse<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { + Builder::new().build_many_sparse(patterns) + } +} + +/// Convenience routines for regex construction. +#[cfg(feature = "alloc")] +impl Regex { + /// Return a default configuration for a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a regex. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode for `Regex` iteration. + /// When UTF-8 mode is disabled, the position immediately following an + /// empty match is where the next search begins, instead of the next + /// position of a UTF-8 encoded codepoint. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. 
+ /// + /// ``` + /// use regex_automata::{ + /// dfa::regex::Regex, + /// nfa::thompson, + /// MultiMatch, SyntaxConfig, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .syntax(SyntaxConfig::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(MultiMatch::must(0, 1, 9)); + /// let got = re.find_leftmost(haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } +} + +/// Standard search routines for finding and iterating over matches. +impl<A: Automaton, P: Prefilter> Regex<A, P> { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_is_match`](Regex::try_is_match). + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!(true, re.is_match(b"foo12345bar")); + /// assert_eq!(false, re.is_match(b"foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn is_match(&self, haystack: &[u8]) -> bool { + self.is_match_at(haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_earliest`](Regex::try_find_earliest). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// // Normally, the leftmost first match would greedily consume as many + /// // decimal digits as it could. But a match is detected as soon as one + /// // digit is seen. + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 4)), + /// re.find_earliest(b"foo12345"), + /// ); + /// + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the "earliest" match semantics detect a match earlier. + /// let re = Regex::new("abc|a")?; + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> { + self.find_earliest_at(haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. 
If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_leftmost`](Regex::try_find_leftmost).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+ ///
+ /// // Greediness is applied appropriately when compared to find_earliest.
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 3, 11)),
+ /// re.find_leftmost(b"zzzfoo12345zzz"),
+ /// );
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the default leftmost-first match semantics demand that we find the
+ /// // earliest match that prefers earlier parts of the pattern over later
+ /// // parts.
+ /// let re = Regex::new("abc|a")?;
+ /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> {
+ self.find_leftmost_at(haystack, 0, haystack.len())
+ }
+
+ /// Search for the first overlapping match in `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_overlapping`](Regex::try_find_overlapping).
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run an overlapping search with multiple
+ /// regexes.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
+ ///
+ /// let re = Regex::builder()
+ /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = dfa::OverlappingState::start();
+ ///
+ /// let expected = Some(MultiMatch::must(1, 0, 4));
+ /// let got = re.find_overlapping(haystack, &mut state);
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches ending at the same position, so
+ /// // re-running the search will yield another match. Notice also that the
+ /// // first pattern is returned after the second. This is because the
+ /// // second pattern begins its match before the first, is therefore an
+ /// // earlier match and is thus reported first.
+ /// let expected = Some(MultiMatch::must(0, 1, 4));
+ /// let got = re.find_overlapping(haystack, &mut state);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn find_overlapping(
+ &self,
+ haystack: &[u8],
+ state: &mut OverlappingState,
+ ) -> Option<MultiMatch> {
+ self.find_overlapping_at(haystack, 0, haystack.len(), state)
+ }
+
+ /// Returns an iterator over all non-overlapping "earliest" matches. 
+ /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). + /// + /// # Example + /// + /// This example shows how to run an "earliest" iterator. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::new("[0-9]+")?; + /// let haystack = "123".as_bytes(); + /// + /// // Normally, a standard leftmost iterator would return a single + /// // match, but since "earliest" detects matches earlier, we get + /// // three matches. + /// let mut it = re.find_earliest_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_earliest_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindEarliestMatches<'r, 't, A, P> { + FindEarliestMatches::new(self, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let text = b"foo1 foo12 foo123"; + /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); + /// assert_eq!(matches, vec![ + /// MultiMatch::must(0, 0, 4), + /// MultiMatch::must(0, 5, 10), + /// MultiMatch::must(0, 11, 17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_leftmost_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindLeftmostMatches<'r, 't, A, P> { + FindLeftmostMatches::new(self, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. 
+ /// + /// ``` + /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; + /// + /// let re = Regex::builder() + /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let haystack = "@foo".as_bytes(); + /// + /// let mut it = re.find_overlapping_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_overlapping_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindOverlappingMatches<'r, 't, A, P> { + FindOverlappingMatches::new(self, haystack) + } +} + +/// Lower level infallible search routines that permit controlling where +/// the search starts and ends in a particular sequence. This is useful for +/// executing searches that need to take surrounding context into account. This +/// is required for correctly implementing iteration because of look-around +/// operators (`^`, `$`, `\b`). +impl<A: Automaton, P: Prefilter> Regex<A, P> { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_is_match_at`](Regex::try_is_match_at). + pub fn is_match_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> bool { + self.try_is_match_at(haystack, start, end).unwrap() + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. 
This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
+ pub fn find_earliest_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Option<MultiMatch> {
+ self.try_find_earliest_at(haystack, start, end).unwrap()
+ }
+
+ /// Returns the same as `find_leftmost`, but starts the search at the given
+ /// offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&input[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches within the
+ /// same haystack, which cannot be done correctly by simply providing a
+ /// subslice of `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
+ pub fn find_leftmost_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Option<MultiMatch> {
+ self.try_find_leftmost_at(haystack, start, end).unwrap()
+ }
+
+ /// Search for the first overlapping match within a given range of
+ /// `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&input[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Panics
+ ///
+ /// If the underlying DFAs return an error, then this routine panics. This
+ /// only occurs in non-default configurations where quit bytes are used or
+ /// Unicode word boundaries are heuristically enabled.
+ ///
+ /// The fallible version of this routine is
+ /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). 
+ pub fn find_overlapping_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Option<MultiMatch> { + self.try_find_overlapping_at(haystack, start, end, state).unwrap() + } +} + +/// Fallible search routines. These may return an error when the underlying +/// DFAs have been configured in a way that permits them to fail during a +/// search. +/// +/// Errors during search only occur when the DFA has been explicitly +/// configured to do so, usually by specifying one or more "quit" bytes or by +/// heuristically enabling Unicode word boundaries. +/// +/// Errors will never be returned using the default configuration. So these +/// fallible routines are only needed for particular configurations. +impl<A: Automaton, P: Prefilter> Regex<A, P> { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match`](Regex::is_match). + pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> { + self.try_is_match_at(haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest`](Regex::find_earliest). + pub fn try_find_earliest( + &self, + haystack: &[u8], + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_earliest_at(haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost`](Regex::find_leftmost). 
+ pub fn try_find_leftmost( + &self, + haystack: &[u8], + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_leftmost_at(haystack, 0, haystack.len()) + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping`](Regex::find_overlapping). + pub fn try_find_overlapping( + &self, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_overlapping_at(haystack, 0, haystack.len(), state) + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_iter`](Regex::find_earliest_iter). + pub fn try_find_earliest_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> TryFindEarliestMatches<'r, 't, A, P> { + TryFindEarliestMatches::new(self, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_iter`](Regex::find_leftmost_iter). + pub fn try_find_leftmost_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 't, A, P> { + TryFindLeftmostMatches::new(self, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. 
For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
+ pub fn try_find_overlapping_iter<'r, 't>(
+ &'r self,
+ haystack: &'t [u8],
+ ) -> TryFindOverlappingMatches<'r, 't, A, P> {
+ TryFindOverlappingMatches::new(self, haystack)
+ }
+}
+
+/// Lower level fallible search routines that permit controlling where the
+/// search starts and ends in a particular sequence.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&input[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used or Unicode word boundaries are heuristically
+ /// enabled.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`is_match_at`](Regex::is_match_at).
+ pub fn try_is_match_at(
+ &self,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<bool, MatchError> {
+ self.forward()
+ .find_earliest_fwd_at(
+ self.scanner().as_mut(),
+ None,
+ haystack,
+ start,
+ end,
+ )
+ .map(|x| x.is_some())
+ }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&input[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. 
For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_at`](Regex::find_earliest_at). + pub fn try_find_earliest_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_earliest_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + ) + } + + /// The implementation of "earliest" searching, where a prefilter scanner + /// may be given. + fn try_find_earliest_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. + // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + let end = match (&fwd) + .find_earliest_fwd_at(pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. + let start = (&rev) + .find_earliest_rev_at(None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern" + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_at`](Regex::find_leftmost_at). 
+ pub fn try_find_leftmost_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_leftmost_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + ) + } + + /// The implementation of leftmost searching, where a prefilter scanner + /// may be given. + fn try_find_leftmost_at_imp( + &self, + scanner: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. + // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + let end = match (&fwd) + .find_leftmost_fwd_at(scanner, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = (&rev) + .find_leftmost_rev_at(None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_at`](Regex::find_overlapping_at). 
+ pub fn try_find_overlapping_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_overlapping_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + state, + ) + } + + /// The implementation of overlapping search at a given range in + /// `haystack`, where `scanner` is a prefilter (if active) and `state` is + /// the current state of the search. + fn try_find_overlapping_at_imp( + &self, + scanner: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. + // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + // TODO: Decide whether it's worth making this assert work. It doesn't + // work currently because 'has_starts_for_each_pattern' isn't on the + // Automaton trait. Without this assert, we still get a panic, but it's + // a bit more inscrutable. + // assert!( + // rev.has_starts_for_each_pattern(), + // "overlapping searches require that the reverse DFA is \ + // compiled with the 'starts_for_each_pattern' option", + // ); + let end = match (&fwd).find_overlapping_fwd_at( + scanner, None, haystack, start, end, state, + )? { + None => return Ok(None), + Some(end) => end, + }; + // Unlike the leftmost cases, the reverse overlapping search may match + // a different pattern than the forward search. See test failures when + // using `None` instead of `Some(end.pattern())` below. Thus, we must + // run our reverse search using the pattern that matched in the forward + // direction. + let start = (&rev) + .find_leftmost_rev_at( + Some(end.pattern()), + haystack, + 0, + end.offset(), + )? + .expect("reverse search must match if forward search does"); + assert!(start.offset() <= end.offset()); + assert_eq!(start.pattern(), end.pattern()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl<A: Automaton, P: Prefilter> Regex<A, P> { + /// Attach the given prefilter to this regex. + pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> { + Regex { + prefilter: Some(prefilter), + forward: self.forward, + reverse: self.reverse, + utf8: self.utf8, + } + } + + /// Remove any prefilter from this regex. + pub fn without_prefilter(self) -> Regex<A> { + Regex { + prefilter: None, + forward: self.forward, + reverse: self.reverse, + utf8: self.utf8, + } + } + + /// Return the underlying DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying DFA and converting it to + /// some other format or size. See the [`Builder::build_from_dfas`] docs + /// for an example of where this might be useful. + pub fn forward(&self) -> &A { + &self.forward + } + + /// Return the underlying DFA responsible for reverse matching. + /// + /// This is useful for accessing the underlying DFA and converting it to + /// some other format or size. See the [`Builder::build_from_dfas`] docs + /// for an example of where this might be useful. + pub fn reverse(&self) -> &A { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_count()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_count(&self) -> usize { + assert_eq!( + self.forward().pattern_count(), + self.reverse().pattern_count() + ); + self.forward().pattern_count() + } + + /// Convenience function for returning this regex's prefilter as a trait + /// object. + /// + /// If this regex doesn't have a prefilter, then `None` is returned. + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + match self.prefilter { + None => None, + Some(ref x) => Some(&*x), + } + } + + /// Convenience function for returning a prefilter scanner. + fn scanner(&self) -> Option<prefilter::Scanner> { + self.prefilter().map(prefilter::Scanner::new) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindEarliestMatches<'r, 't, A, P>( + TryFindEarliestMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { + fn new( + re: &'r Regex<A, P>, + text: &'t [u8], + ) -> FindEarliestMatches<'r, 't, A, P> { + FindEarliestMatches(TryFindEarliestMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindEarliestMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindLeftmostMatches<'r, 't, A, P>( + TryFindLeftmostMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { + fn new( + re: &'r Regex<A, P>, + text: &'t [u8], + ) -> FindLeftmostMatches<'r, 't, A, P> { + FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindLeftmostMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all overlapping matches for a particular infallible +/// search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. 
The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( + TryFindOverlappingMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { + fn new( + re: &'r Regex<A, P>, + text: &'t [u8], + ) -> FindOverlappingMatches<'r, 't, A, P> { + FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindOverlappingMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct TryFindEarliestMatches<'r, 't, A, P> { + re: &'r Regex<A, P>, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { + fn new( + re: &'r Regex<A, P>, + text: &'t [u8], + ) -> TryFindEarliestMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindEarliestMatches { + re, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindEarliestMatches<'r, 't, A, P> +{ + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_earliest_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. 
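+///
+/// # Example
+///
+/// A sketch of typical use. It assumes this iterator is obtained from a
+/// `try_find_leftmost_iter` method on [`Regex`], the fallible counterpart to
+/// `find_leftmost_iter`:
+///
+/// ```
+/// use regex_automata::{MultiMatch, dfa::regex::Regex};
+///
+/// let re = Regex::new("[0-9]+")?;
+/// // Each item is a `Result<MultiMatch, MatchError>`, so the matches can
+/// // be collected into a `Result` of a `Vec`.
+/// let matches: Vec<MultiMatch> = re
+///     .try_find_leftmost_iter(b"a1b22c333")
+///     .collect::<Result<_, _>>()?;
+/// assert_eq!(matches, vec![
+///     MultiMatch::must(0, 1, 2),
+///     MultiMatch::must(0, 3, 5),
+///     MultiMatch::must(0, 6, 9),
+/// ]);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```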
+#[derive(Clone, Debug)] +pub struct TryFindLeftmostMatches<'r, 't, A, P> { + re: &'r Regex<A, P>, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { + fn new( + re: &'r Regex<A, P>, + text: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindLeftmostMatches { + re, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindLeftmostMatches<'r, 't, A, P> +{ + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_leftmost_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all overlapping matches for a particular fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { + re: &'r Regex<A, P>, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + state: OverlappingState, +} + +impl<'r, 't, A: Automaton, P: Prefilter> + TryFindOverlappingMatches<'r, 't, A, P> +{ + fn new( + re: &'r Regex<A, P>, + text: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindOverlappingMatches { + re, + scanner, + text, + last_end: 0, + state: OverlappingState::start(), + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindOverlappingMatches<'r, 't, A, P> +{ + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_overlapping_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + &mut self.state, + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + // Unlike the non-overlapping case, we're OK with empty matches at this + // level. In particular, the overlapping search algorithm is itself + // responsible for ensuring that progress is always made. + self.last_end = m.end(); + Some(Ok(m)) + } +} + +/// The configuration used for compiling a DFA-backed regex. 
+/// +/// A regex configuration is a simple data object that is typically used with +/// [`Builder::configure`]. +#[cfg(feature = "alloc")] +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + utf8: Option<bool>, +} + +#[cfg(feature = "alloc")] +impl Config { + /// Return a new default regex compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Whether to enable UTF-8 mode or not. + /// + /// When UTF-8 mode is enabled (the default) and an empty match is seen, + /// the iterators on [`Regex`] will always start the next search at the + /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 + /// mode is disabled, such searches are begun at the next byte offset. + /// + /// If this mode is enabled and invalid UTF-8 is given to search, then + /// behavior is unspecified. + /// + /// Generally speaking, one should enable this when + /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) + /// and + /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) + /// are enabled, and disable it otherwise. + /// + /// # Example + /// + /// This example demonstrates the differences between when this option is + /// enabled and disabled. The differences only arise when the regex can + /// return matches of length zero. + /// + /// In this first snippet, we show the results when UTF-8 mode is disabled. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And in this snippet, we execute the same search on the same haystack, + /// but with UTF-8 mode enabled. Notice that byte offsets that would + /// otherwise split the encoding of `☃` are not returned. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(true)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + /// Returns true if and only if this configuration has UTF-8 mode enabled. + /// + /// When UTF-8 mode is enabled and an empty match is seen, the iterators on + /// [`Regex`] will always start the next search at the next UTF-8 encoded + /// codepoint. When UTF-8 mode is disabled, such searches are begun at the + /// next byte offset. + pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. 
If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { utf8: o.utf8.or(self.utf8) } + } +} + +/// A builder for a regex based on deterministic finite automatons. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder in +/// that it permits fine grain configuration of the construction process. The +/// trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are three +/// different UTF-8 modes: +/// +/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the +/// pattern itself can contain sub-expressions that match invalid UTF-8. +/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) +/// controls whether the implicit unanchored prefix added to the NFA can +/// match through invalid UTF-8 or not. +/// * [`Config::utf8`] controls how the regex iterators themselves advance +/// the starting position of the next search when a match with zero length is +/// found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// Internally, building a regex requires building two DFAs, where one is +/// responsible for finding the end of a match and the other is responsible +/// for finding the start of a match. If you only need to detect whether +/// something matched, or only the end of a match, then you should use a +/// [`dense::Builder`] to construct a single DFA, which is cheaper than +/// building two DFAs. +/// +/// # Build methods +/// +/// This builder has a few "build" methods. In general, it's the result of +/// combining the following parameters: +/// +/// * Building one or many regexes. +/// * Building a regex with dense or sparse DFAs. +/// +/// The simplest "build" method is [`Builder::build`]. It accepts a single +/// pattern and builds a dense DFA using `usize` for the state identifier +/// representation. +/// +/// The most general "build" method is [`Builder::build_many`], which permits +/// building a regex that searches for multiple patterns simultaneously while +/// using a specific state identifier representation. +/// +/// The most flexible "build" method, but hardest to use, is +/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is +/// just a pair of DFAs, and this method allows you to specify those DFAs +/// exactly. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax, the NFA and +/// the regex itself. This is generally what you want for matching on +/// arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// }; +/// +/// let re = Regex::builder() +/// .configure(Regex::config().utf8(false)) +/// .syntax(SyntaxConfig::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(MultiMatch::must(0, 1, 9)); +/// let got = re.find_leftmost(haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! 
Disabling UTF-8
+/// // on the syntax permits this. Notice also that the
+/// // search was unanchored and skipped over invalid UTF-8.
+/// // Disabling UTF-8 on the Thompson NFA permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on Config, since that
+/// // only impacts regexes that can produce matches of
+/// // length 0.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct Builder {
+    config: Config,
+    dfa: dense::Builder,
+}
+
+#[cfg(feature = "alloc")]
+impl Builder {
+    /// Create a new regex builder with the default configuration.
+    pub fn new() -> Builder {
+        Builder { config: Config::default(), dfa: dense::Builder::new() }
+    }
+
+    /// Build a regex from the given pattern.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build(&self, pattern: &str) -> Result<Regex, Error> {
+        self.build_many(&[pattern])
+    }
+
+    /// Build a regex from the given pattern using sparse DFAs.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build_sparse(
+        &self,
+        pattern: &str,
+    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+        self.build_many_sparse(&[pattern])
+    }
+
+    /// Build a regex from the given patterns.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<Regex, Error> {
+        let forward = self.dfa.build_many(patterns)?;
+        let reverse = self
+            .dfa
+            .clone()
+            .configure(
+                dense::Config::new()
+                    .anchored(true)
+                    .match_kind(MatchKind::All)
+                    .starts_for_each_pattern(true),
+            )
+            .thompson(thompson::Config::new().reverse(true))
+            .build_many(patterns)?;
+        Ok(self.build_from_dfas(forward, reverse))
+    }
+
+    /// Build a sparse regex from the given patterns.
+    pub fn build_many_sparse<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+        let re = self.build_many(patterns)?;
+        let forward = re.forward().to_sparse()?;
+        let reverse = re.reverse().to_sparse()?;
+        Ok(self.build_from_dfas(forward, reverse))
+    }
+
+    /// Build a regex from its component forward and reverse DFAs.
+    ///
+    /// This is useful when deserializing a regex from some arbitrary
+    /// memory region. This is also useful for building regexes from other
+    /// types of DFAs.
+    ///
+    /// If you're building the DFAs from scratch instead of building new DFAs
+    /// from other DFAs, then you'll need to make sure that the reverse DFA is
+    /// configured correctly to match the intended semantics. Namely:
+    ///
+    /// * It should be anchored.
+    /// * It should use [`MatchKind::All`] semantics.
+    /// * It should match in reverse.
+    /// * It should have anchored start states compiled for each pattern.
+    /// * Otherwise, its configuration should match the forward DFA.
+    ///
+    /// If these conditions aren't satisfied, then the behavior of searches is
+    /// unspecified.
+    ///
+    /// Note that when using this constructor, only the configuration from
+    /// [`Config`] is applied. The other configuration settings on this
+    /// builder only apply when the builder owns the construction of the DFAs
+    /// themselves.
+    ///
+    /// # Example
+    ///
+    /// This example is a bit contrived. The usual use of these methods
+    /// would involve serializing `initial_re` somewhere and then
+    /// deserializing it later to build a regex. But in this case, we do
+    /// everything in memory.
+ /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// This example shows how to build a `Regex` that uses sparse DFAs instead + /// of dense DFAs without using one of the convenience `build_sparse` + /// routines: + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let fwd = initial_re.forward().to_sparse()?; + /// let rev = initial_re.reverse().to_sparse()?; + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_dfas<A: Automaton>( + &self, + forward: A, + reverse: A, + ) -> Regex<A> { + let utf8 = self.config.get_utf8(); + Regex { prefilter: None, forward, reverse, utf8 } + } + + /// Apply the given regex configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the dense DFA compilation configuration for this builder using + /// [`dense::Config`](dense::Config). + /// + /// This permits setting things like whether the underlying DFAs should + /// be minimized. 
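+    ///
+    /// # Example
+    ///
+    /// A sketch that turns on DFA minimization (assuming the `minimize`
+    /// knob on `dense::Config`), trading extra build time for smaller DFAs:
+    ///
+    /// ```
+    /// use regex_automata::dfa::{dense, regex::Regex};
+    ///
+    /// let re = Regex::builder()
+    ///     .dense(dense::Config::new().minimize(true))
+    ///     .build(r"foo[0-9]+")?;
+    /// assert!(re.is_match(b"foo123"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```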
+ pub fn dense(&mut self, config: dense::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +#[cfg(feature = "alloc")] +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +#[inline(always)] +fn next_unwrap( + item: Option<Result<MultiMatch, MatchError>>, +) -> Option<MultiMatch> { + match item { + None => None, + Some(Ok(m)) => Some(m), + Some(Err(err)) => panic!( + "unexpected regex search error: {}\n\ + to handle search errors, use try_ methods", + err, + ), + } +} diff --git a/vendor/regex-automata-0.2.0/src/dfa/search.rs b/vendor/regex-automata-0.2.0/src/dfa/search.rs new file mode 100644 index 000000000..492414981 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/search.rs @@ -0,0 +1,493 @@ +use crate::{ + dfa::{ + accel, + automaton::{Automaton, OverlappingState, StateMatch}, + }, + util::{ + id::{PatternID, StateID}, + matchtypes::HalfMatch, + prefilter, MATCH_OFFSET, + }, + MatchError, +}; + +#[inline(never)] +pub fn find_earliest_fwd<A: Automaton + ?Sized>( + pre: Option<&mut prefilter::Scanner>, + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. + if pre.is_some() && pattern_id.is_none() { + find_fwd(pre, true, dfa, pattern_id, bytes, start, end) + } else { + find_fwd(None, true, dfa, pattern_id, bytes, start, end) + } +} + +#[inline(never)] +pub fn find_leftmost_fwd<A: Automaton + ?Sized>( + pre: Option<&mut prefilter::Scanner>, + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. + if pre.is_some() && pattern_id.is_none() { + find_fwd(pre, false, dfa, pattern_id, bytes, start, end) + } else { + find_fwd(None, false, dfa, pattern_id, bytes, start, end) + } +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters +/// getting inlined eliminate some critical branches. To avoid bloating binary +/// size, we only call this function in a fixed number of places. +#[inline(always)] +fn find_fwd<A: Automaton + ?Sized>( + mut pre: Option<&mut prefilter::Scanner>, + earliest: bool, + dfa: &A, + pattern_id: Option<PatternID>, + haystack: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + assert!(start <= end); + assert!(start <= haystack.len()); + assert!(end <= haystack.len()); + + // Why do this? This lets 'bytes[at]' work without bounds checks below. + // It seems the assert on 'end <= haystack.len()' above is otherwise + // not enough. Why not just make 'bytes' scoped this way anyway? Well, + // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' + // for resolving look-ahead. + let bytes = &haystack[..end]; + + let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?; + let mut last_match = None; + let mut at = start; + if let Some(ref mut pre) = pre { + // If a prefilter doesn't report false positives, then we don't need to + // touch the DFA at all. However, since all matches include the pattern + // ID, and the prefilter infrastructure doesn't report pattern IDs, we + // limit this optimization to cases where there is exactly one pattern. + // In that case, any match must be the 0th pattern. 
+ if dfa.pattern_count() == 1 && !pre.reports_false_positives() { + return Ok(pre.next_candidate(bytes, at).into_option().map( + |offset| HalfMatch { pattern: PatternID::ZERO, offset }, + )); + } else if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + if dfa.is_start_state(state) { + if let Some(ref mut pre) = pre { + if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } else if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_fwd(needles, bytes, at) + .unwrap_or(bytes.len()); + } + } else if dfa.is_match_state(state) { + last_match = Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset: at - MATCH_OFFSET, + }); + if earliest { + return Ok(last_match); + } + if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_fwd(needles, bytes, at) + .unwrap_or(bytes.len()); + } + } else if dfa.is_accel_state(state) { + let needs = dfa.accelerator(state); + at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); + } else if dfa.is_dead_state(state) { + return Ok(last_match); + } else { + debug_assert!(dfa.is_quit_state(state)); + if last_match.is_some() { + return Ok(last_match); + } + return Err(MatchError::Quit { byte, offset: at - 1 }); + } + } + while at < end && dfa.next_state(state, bytes[at]) == state { + at += 1; + } + } + Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match)) +} + +#[inline(never)] +pub fn find_earliest_rev<A: Automaton + ?Sized>( + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + find_rev(true, dfa, pattern_id, bytes, start, end) +} + +#[inline(never)] +pub fn find_leftmost_rev<A: Automaton + ?Sized>( + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + find_rev(false, dfa, pattern_id, bytes, start, end) +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined +/// permits eliminating a few crucial branches. 
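+///
+/// (In the regex engine built on top of these routines, this reverse search
+/// is what recovers the start offset of a match whose end was found by a
+/// forward search.)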
+#[inline(always)] +fn find_rev<A: Automaton + ?Sized>( + earliest: bool, + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let mut state = init_rev(dfa, pattern_id, bytes, start, end)?; + let mut last_match = None; + let mut at = end; + while at > start { + at -= 1; + while at > start && dfa.next_state(state, bytes[at]) == state { + at -= 1; + } + + let byte = bytes[at]; + state = dfa.next_state(state, byte); + if dfa.is_special_state(state) { + if dfa.is_start_state(state) { + if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_rev(needles, bytes, at) + .map(|i| i + 1) + .unwrap_or(0); + } + } else if dfa.is_match_state(state) { + last_match = Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset: at + MATCH_OFFSET, + }); + if earliest { + return Ok(last_match); + } + if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_rev(needles, bytes, at) + .map(|i| i + 1) + .unwrap_or(0); + } + } else if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_rev(needles, bytes, at) + .map(|i| i + 1) + .unwrap_or(0); + } else if dfa.is_dead_state(state) { + return Ok(last_match); + } else { + debug_assert!(dfa.is_quit_state(state)); + if last_match.is_some() { + return Ok(last_match); + } + return Err(MatchError::Quit { byte, offset: at }); + } + } + } + Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match)) +} + +#[inline(never)] +pub fn find_overlapping_fwd<A: Automaton + ?Sized>( + pre: Option<&mut prefilter::Scanner>, + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + caller_state: &mut OverlappingState, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should only ever + // use a prefilter when no pattern ID is given. + if pre.is_some() && pattern_id.is_none() { + find_overlapping_fwd_imp( + pre, + dfa, + pattern_id, + bytes, + start, + end, + caller_state, + ) + } else { + find_overlapping_fwd_imp( + None, + dfa, + pattern_id, + bytes, + start, + end, + caller_state, + ) + } +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined +/// permits eliminating a few crucial branches and reduces code size when it is +/// not used. +#[inline(always)] +fn find_overlapping_fwd_imp<A: Automaton + ?Sized>( + mut pre: Option<&mut prefilter::Scanner>, + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + mut start: usize, + end: usize, + caller_state: &mut OverlappingState, +) -> Result<Option<HalfMatch>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let mut state = match caller_state.id() { + None => init_fwd(dfa, pattern_id, bytes, start, end)?, + Some(id) => { + if let Some(last) = caller_state.last_match() { + let match_count = dfa.match_count(id); + if last.match_index < match_count { + let m = HalfMatch { + pattern: dfa.match_pattern(id, last.match_index), + offset: last.offset, + }; + last.match_index += 1; + return Ok(Some(m)); + } + } + + // This is a subtle but critical detail. If the caller provides a + // non-None state ID, then it must be the case that the state ID + // corresponds to one set by this function. 
The state ID therefore + // corresponds to a match state, a dead state or some other state. + // However, "some other" state _only_ occurs when the input has + // been exhausted because the only way to stop before then is to + // see a match or a dead/quit state. + // + // If the input is exhausted or if it's a dead state, then + // incrementing the starting position has no relevance on + // correctness, since the loop below will either not execute + // at all or will immediately stop due to being in a dead state. + // (Once in a dead state it is impossible to leave it.) + // + // Therefore, the only case we need to consider is when + // caller_state is a match state. In this case, since our machines + // support the ability to delay a match by a certain number of + // bytes (to support look-around), it follows that we actually + // consumed that many additional bytes on our previous search. When + // the caller resumes their search to find subsequent matches, they + // will use the ending location from the previous match as the next + // starting point, which is `MATCH_OFFSET` bytes PRIOR to where + // we scanned to on the previous search. Therefore, we need to + // compensate by bumping `start` up by `MATCH_OFFSET` bytes. + // + // Incidentally, since MATCH_OFFSET is non-zero, this also makes + // dealing with empty matches convenient. Namely, callers needn't + // special case them when implementing an iterator. Instead, this + // ensures that forward progress is always made. + start += MATCH_OFFSET; + id + } + }; + + let mut at = start; + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + caller_state.set_id(state); + if dfa.is_start_state(state) { + if let Some(ref mut pre) = pre { + if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } else if dfa.is_accel_state(state) { + let needles = dfa.accelerator(state); + at = accel::find_fwd(needles, bytes, at) + .unwrap_or(bytes.len()); + } + } else if dfa.is_match_state(state) { + let offset = at - MATCH_OFFSET; + caller_state + .set_last_match(StateMatch { match_index: 1, offset }); + return Ok(Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset, + })); + } else if dfa.is_accel_state(state) { + let needs = dfa.accelerator(state); + at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); + } else if dfa.is_dead_state(state) { + return Ok(None); + } else { + debug_assert!(dfa.is_quit_state(state)); + return Err(MatchError::Quit { byte, offset: at - 1 }); + } + } + } + + let result = eoi_fwd(dfa, bytes, end, &mut state); + caller_state.set_id(state); + if let Ok(Some(ref last_match)) = result { + caller_state.set_last_match(StateMatch { + match_index: 1, + offset: last_match.offset(), + }); + } + result +} + +fn init_fwd<A: Automaton + ?Sized>( + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<StateID, MatchError> { + let state = dfa.start_state_forward(pattern_id, bytes, start, end); + // Start states can never be match states, since all matches are delayed + // by 1 byte. 
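+    // (This one-byte delay is also why the search loops above report match
+    // offsets as 'at - MATCH_OFFSET'.)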
+ assert!(!dfa.is_match_state(state)); + Ok(state) +} + +fn init_rev<A: Automaton + ?Sized>( + dfa: &A, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<StateID, MatchError> { + let state = dfa.start_state_reverse(pattern_id, bytes, start, end); + // Start states can never be match states, since all matches are delayed + // by 1 byte. + assert!(!dfa.is_match_state(state)); + Ok(state) +} + +fn eoi_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + end: usize, + state: &mut StateID, +) -> Result<Option<HalfMatch>, MatchError> { + match bytes.get(end) { + Some(&b) => { + *state = dfa.next_state(*state, b); + if dfa.is_match_state(*state) { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(*state, 0), + offset: end, + })) + } else { + Ok(None) + } + } + None => { + *state = dfa.next_eoi_state(*state); + if dfa.is_match_state(*state) { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(*state, 0), + offset: bytes.len(), + })) + } else { + Ok(None) + } + } + } +} + +fn eoi_rev<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + state: StateID, +) -> Result<Option<HalfMatch>, MatchError> { + if start > 0 { + let state = dfa.next_state(state, bytes[start - 1]); + if dfa.is_match_state(state) { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset: start, + })) + } else { + Ok(None) + } + } else { + let state = dfa.next_eoi_state(state); + if dfa.is_match_state(state) { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(state, 0), + offset: 0, + })) + } else { + Ok(None) + } + } +} + +// Currently unused, but is useful to keep around. This was originally used +// when the code above used raw pointers for its main loop. +// /// Returns the distance between the given pointer and the start of `bytes`. +// /// This assumes that the given pointer points to somewhere in the `bytes` +// /// slice given. +// fn offset(bytes: &[u8], p: *const u8) -> usize { +// debug_assert!(bytes.as_ptr() <= p); +// debug_assert!(bytes[bytes.len()..].as_ptr() >= p); +// ((p as isize) - (bytes.as_ptr() as isize)) as usize +// } diff --git a/vendor/regex-automata-0.2.0/src/dfa/search_unsafe.rs b/vendor/regex-automata-0.2.0/src/dfa/search_unsafe.rs new file mode 100644 index 000000000..ea1c29ff7 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/search_unsafe.rs @@ -0,0 +1,321 @@ +use crate::dfa::automaton::{Automaton, State}; +use crate::MatchError; + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined +/// permits eliminating a few crucial branches. 
+#[inline(always)] +pub fn find_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, + earliest: bool, +) -> Result<Option<usize>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?; + if earliest && last_match.is_some() { + return Ok(last_match); + } + + let mut at = start; + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at - 1 }); + } + last_match = Some(at - dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + /* + unsafe { + let mut p = bytes.as_ptr().add(start); + while p < bytes[end..].as_ptr() { + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + p = p.add(1); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { + byte, + offset: offset(bytes, p) - 1, + }); + } + last_match = Some(offset(bytes, p) - dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + } + */ + Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match)) +} + +/// This is marked as `inline(always)` specifically because it supports +/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined +/// permits eliminating a few crucial branches. +#[inline(always)] +pub fn find_rev<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, + earliest: bool, +) -> Result<Option<usize>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?; + if earliest && last_match.is_some() { + return Ok(last_match); + } + + let mut at = end; + while at > start { + at -= 1; + let byte = bytes[at]; + state = dfa.next_state(state, byte); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at }); + } + last_match = Some(at + dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + /* + unsafe { + let mut p = bytes.as_ptr().add(end); + while p > bytes[start..].as_ptr() { + p = p.sub(1); + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + if dfa.is_special_state(state) { + if dfa.is_dead_state(state) { + return Ok(last_match); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { + byte, + offset: offset(bytes, p), + }); + } + last_match = Some(offset(bytes, p) + dfa.match_offset()); + if earliest { + return Ok(last_match); + } + } + } + } + */ + Ok(eof_rev(dfa, state, bytes, start)?.or(last_match)) +} + +pub fn find_overlapping_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + mut start: usize, + end: usize, + caller_state: &mut State<A::ID>, +) -> Result<Option<usize>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let (mut state, mut last_match) = match caller_state.as_option() { + None => init_fwd(dfa, bytes, start, end)?, + Some(id) => { + // This is a subtle but critical detail. 
If the caller provides a + // non-None state ID, then it must be the case that the state ID + // corresponds to one set by this function. The state ID therefore + // corresponds to a match state, a dead state or some other state. + // However, "some other" state _only_ occurs when the input has + // been exhausted because the only way to stop before then is to + // see a match or a dead/quit state. + // + // If the input is exhausted or if it's a dead state, then + // incrementing the starting position has no relevance on + // correctness, since the loop below will either not execute + // at all or will immediately stop due to being in a dead state. + // (Once in a dead state it is impossible to leave it.) + // + // Therefore, the only case we need to consider is when + // caller_state is a match state. In this case, since our machines + // support the ability to delay a match by a certain number of + // bytes (to support look-around), it follows that we actually + // consumed that many additional bytes on our previous search. When + // the caller resumes their search to find subsequent matches, they + // will use the ending location from the previous match as the next + // starting point, which is `match_offset` bytes PRIOR to where + // we scanned to on the previous search. Therefore, we need to + // compensate by bumping `start` up by `match_offset` bytes. + start += dfa.match_offset(); + // Since match_offset could be any arbitrary value and we use + // `start` in pointer arithmetic below, we check that we are still + // in bounds. Otherwise, we could materialize a pointer that is + // more than one past the end point of `bytes`, which is UB. + if start > end { + return Ok(None); + } + (id, None) + } + }; + if last_match.is_some() { + caller_state.set(state); + return Ok(last_match); + } + + let mut at = start; + while at < end { + let byte = bytes[at]; + state = dfa.next_state(state, byte); + at += 1; + if dfa.is_special_state(state) { + caller_state.set(state); + if dfa.is_dead_state(state) { + return Ok(None); + } else if dfa.is_quit_state(state) { + return Err(MatchError::Quit { byte, offset: at - 1 }); + } else { + return Ok(Some(at - dfa.match_offset())); + } + } + } + /* + // SAFETY: Other than the normal pointer arithmetic happening here, a + // unique aspect of safety for this function is the fact that the caller + // can provide the state that the search routine will start with. If this + // state were invalid, it would be possible to incorrectly index the + // transition table. We however prevent this from happening by guaranteeing + // that State is valid. Namely, callers cannot mutate a State. All they can + // do is create a "start" state or otherwise reuse a previously set state. + // Since callers can't mutate a state, it follows that a previously set + // state can only be retrieved by crate internal functions. Therefore, our + // use of it is safe since this code will only ever set the provided state + // to a valid state. 
+ unsafe { + let mut p = bytes.as_ptr().add(start); + while p < bytes[end..].as_ptr() { + let byte = *p; + state = dfa.next_state_unchecked(state, byte); + p = p.add(1); + if dfa.is_special_state(state) { + caller_state.set(state); + return if dfa.is_dead_state(state) { + Ok(None) + } else if dfa.is_quit_state(state) { + Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 }) + } else { + Ok(Some(offset(bytes, p) - dfa.match_offset())) + }; + } + } + } + */ + + let result = eof_fwd(dfa, bytes, end, &mut state); + caller_state.set(state); + result +} + +fn init_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<(A::ID, Option<usize>), MatchError> { + let state = dfa.start_state_forward(bytes, start, end); + if dfa.is_match_state(state) { + Ok((state, Some(start - dfa.match_offset()))) + } else { + Ok((state, None)) + } +} + +fn init_rev<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<(A::ID, Option<usize>), MatchError> { + let state = dfa.start_state_reverse(bytes, start, end); + if dfa.is_match_state(state) { + Ok((state, Some(end + dfa.match_offset()))) + } else { + Ok((state, None)) + } +} + +fn eof_fwd<A: Automaton + ?Sized>( + dfa: &A, + bytes: &[u8], + end: usize, + state: &mut A::ID, +) -> Result<Option<usize>, MatchError> { + match bytes.get(end) { + Some(&b) => { + *state = dfa.next_state(*state, b); + if dfa.is_match_state(*state) { + Ok(Some(end)) + } else { + Ok(None) + } + } + None => { + *state = dfa.next_eof_state(*state); + if dfa.is_match_state(*state) { + Ok(Some(bytes.len())) + } else { + Ok(None) + } + } + } +} + +fn eof_rev<A: Automaton + ?Sized>( + dfa: &A, + state: A::ID, + bytes: &[u8], + start: usize, +) -> Result<Option<usize>, MatchError> { + if start > 0 { + if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) { + Ok(Some(start)) + } else { + Ok(None) + } + } else { + if dfa.is_match_state(dfa.next_eof_state(state)) { + Ok(Some(0)) + } else { + Ok(None) + } + } +} + +/// Returns the distance between the given pointer and the start of `bytes`. +/// This assumes that the given pointer points to somewhere in the `bytes` +/// slice given. +fn offset(bytes: &[u8], p: *const u8) -> usize { + debug_assert!(bytes.as_ptr() <= p); + debug_assert!(bytes[bytes.len()..].as_ptr() >= p); + ((p as isize) - (bytes.as_ptr() as isize)) as usize +} diff --git a/vendor/regex-automata-0.2.0/src/dfa/sparse.rs b/vendor/regex-automata-0.2.0/src/dfa/sparse.rs new file mode 100644 index 000000000..346606987 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/sparse.rs @@ -0,0 +1,2283 @@ +/*! +Types and routines specific to sparse DFAs. + +This module is the home of [`sparse::DFA`](DFA). + +Unlike the [`dense`](super::dense) module, this module does not contain a +builder or configuration specific for sparse DFAs. Instead, the intended +way to build a sparse DFA is either by using a default configuration with +its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the +construction of a dense DFA with [`dense::Builder`](super::dense::Builder) +and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). 
For +example, this configures a sparse DFA to do an overlapping search: + +``` +use regex_automata::{ + dfa::{Automaton, OverlappingState, dense}, + HalfMatch, MatchKind, +}; + +let dense_re = dense::Builder::new() + .configure(dense::Config::new().match_kind(MatchKind::All)) + .build(r"Samwise|Sam")?; +let sparse_re = dense_re.to_sparse()?; + +// Setup our haystack and initial start state. +let haystack = b"Samwise"; +let mut state = OverlappingState::start(); + +// First, 'Sam' will match. +let end1 = sparse_re.find_overlapping_fwd_at( + None, None, haystack, 0, haystack.len(), &mut state, +)?; +assert_eq!(end1, Some(HalfMatch::must(0, 3))); + +// And now 'Samwise' will match. +let end2 = sparse_re.find_overlapping_fwd_at( + None, None, haystack, 3, haystack.len(), &mut state, +)?; +assert_eq!(end2, Some(HalfMatch::must(0, 7))); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` +*/ + +#[cfg(feature = "alloc")] +use core::iter; +use core::{ + convert::{TryFrom, TryInto}, + fmt, + mem::size_of, +}; + +#[cfg(feature = "alloc")] +use alloc::{collections::BTreeSet, vec, vec::Vec}; + +#[cfg(feature = "alloc")] +use crate::dfa::{dense, error::Error}; +use crate::{ + dfa::{ + automaton::{fmt_state_indicator, Automaton}, + special::Special, + DEAD, + }, + util::{ + alphabet::ByteClasses, + bytes::{self, DeserializeError, Endian, SerializeError}, + id::{PatternID, StateID}, + start::Start, + DebugByte, + }, +}; + +const LABEL: &str = "rust-regex-automata-dfa-sparse"; +const VERSION: u32 = 2; + +/// A sparse deterministic finite automaton (DFA) with variable sized states. +/// +/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses +/// a more space efficient representation for its transitions. Consequently, +/// sparse DFAs may use much less memory than dense DFAs, but this comes at a +/// price. In particular, reading the more space efficient transitions takes +/// more work, and consequently, searching using a sparse DFA is typically +/// slower than a dense DFA. +/// +/// A sparse DFA can be built using the default configuration via the +/// [`DFA::new`] constructor. Otherwise, one can configure various aspects +/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder), +/// and then convert a dense DFA to a sparse DFA using +/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse). +/// +/// In general, a sparse DFA supports all the same search operations as a dense +/// DFA. +/// +/// Making the choice between a dense and sparse DFA depends on your specific +/// work load. If you can sacrifice a bit of search time performance, then a +/// sparse DFA might be the best choice. In particular, while sparse DFAs are +/// probably always slower than dense DFAs, you may find that they are easily +/// fast enough for your purposes! +/// +/// # Type parameters +/// +/// A `DFA` has one type parameter, `T`, which is used to represent the parts +/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`. +/// +/// # The `Automaton` trait +/// +/// This type implements the [`Automaton`] trait, which means it can be used +/// for searching. 
For example: +/// +/// ``` +/// use regex_automata::{ +/// dfa::{Automaton, sparse::DFA}, +/// HalfMatch, +/// }; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let expected = HalfMatch::must(0, 8); +/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct DFA<T> { + // When compared to a dense DFA, a sparse DFA *looks* a lot simpler + // representation-wise. In reality, it is perhaps more complicated. Namely, + // in a dense DFA, all information needs to be very cheaply accessible + // using only state IDs. In a sparse DFA however, each state uses a + // variable amount of space because each state encodes more information + // than just its transitions. Each state also includes an accelerator if + // one exists, along with the matching pattern IDs if the state is a match + // state. + // + // That is, a lot of the complexity is pushed down into how each state + // itself is represented. + trans: Transitions<T>, + starts: StartTable<T>, + special: Special, +} + +#[cfg(feature = "alloc")] +impl DFA<Vec<u8>> { + /// Parse the given regular expression using a default configuration and + /// return the corresponding sparse DFA. + /// + /// If you want a non-default configuration, then use + /// the [`dense::Builder`](crate::dfa::dense::Builder) + /// to set your own configuration, and then call + /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create + /// a sparse DFA. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse}, + /// HalfMatch, + /// }; + /// + /// let dfa = sparse::DFA::new("foo[0-9]+bar")?; + /// + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> { + dense::Builder::new() + .build(pattern) + .and_then(|dense| dense.to_sparse()) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding multi-DFA. + /// + /// If you want a non-default configuration, then use + /// the [`dense::Builder`](crate::dfa::dense::Builder) + /// to set your own configuration, and then call + /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create + /// a sparse DFA. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse}, + /// HalfMatch, + /// }; + /// + /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let expected = HalfMatch::must(1, 3); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<DFA<Vec<u8>>, Error> { + dense::Builder::new() + .build_many(patterns) + .and_then(|dense| dense.to_sparse()) + } +} + +#[cfg(feature = "alloc")] +impl DFA<Vec<u8>> { + /// Create a new DFA that matches every input. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse}, + /// HalfMatch, + /// }; + /// + /// let dfa = sparse::DFA::always_match()?; + /// + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<DFA<Vec<u8>>, Error> { + dense::DFA::always_match()?.to_sparse() + } + + /// Create a new sparse DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::{Automaton, sparse}; + /// + /// let dfa = sparse::DFA::never_match()?; + /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?); + /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<DFA<Vec<u8>>, Error> { + dense::DFA::never_match()?.to_sparse() + } + + /// The implementation for constructing a sparse DFA from a dense DFA. + pub(crate) fn from_dense<T: AsRef<[u32]>>( + dfa: &dense::DFA<T>, + ) -> Result<DFA<Vec<u8>>, Error> { + // In order to build the transition table, we need to be able to write + // state identifiers for each of the "next" transitions in each state. + // Our state identifiers correspond to the byte offset in the + // transition table at which the state is encoded. Therefore, we do not + // actually know what the state identifiers are until we've allocated + // exactly as much space as we need for each state. Thus, construction + // of the transition table happens in two passes. + // + // In the first pass, we fill out the shell of each state, which + // includes the transition count, the input byte ranges and zero-filled + // space for the transitions and accelerators, if present. In this + // first pass, we also build up a map from the state identifier index + // of the dense DFA to the state identifier in this sparse DFA. + // + // In the second pass, we fill in the transitions based on the map + // built in the first pass. + + // The capacity given here reflects a minimum. (Well, the true minimum + // is likely even bigger, but hopefully this saves a few reallocs.) + let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count()); + // This maps state indices from the dense DFA to StateIDs in the sparse + // DFA. We build out this map on the first pass, and then use it in the + // second pass to back-fill our transitions. + let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()]; + for state in dfa.states() { + let pos = sparse.len(); + + remap[dfa.to_index(state.id())] = + StateID::new(pos).map_err(|_| Error::too_many_states())?; + // zero-filled space for the transition count + sparse.push(0); + sparse.push(0); + + let mut transition_count = 0; + for (unit1, unit2, _) in state.sparse_transitions() { + match (unit1.as_u8(), unit2.as_u8()) { + (Some(b1), Some(b2)) => { + transition_count += 1; + sparse.push(b1); + sparse.push(b2); + } + (None, None) => {} + (Some(_), None) | (None, Some(_)) => { + // can never occur because sparse_transitions never + // groups EOI with any other transition. + unreachable!() + } + } + } + // Add dummy EOI transition. This is never actually read while + // searching, but having space equivalent to the total number + // of transitions is convenient. Otherwise, we'd need to track + // a different number of transitions for the byte ranges as for + // the 'next' states. + // + // N.B. 
The loop above is not guaranteed to yield the EOI
+ // transition, since it may point to a DEAD state. By putting
+ // it here, we always write the EOI transition, and thus
+ // guarantee that our transition count is >0. Why do we always
+ // need the EOI transition? Because it lets us implement
+ // Automaton::next_eoi_state by just asking for the last
+ // transition. There are probably other/better ways to do this.
+ transition_count += 1;
+ sparse.push(0);
+ sparse.push(0);
+
+ // Check some assumptions about transition count.
+ assert_ne!(
+ transition_count, 0,
+ "transition count should be non-zero",
+ );
+ assert!(
+ transition_count <= 257,
+ "expected transition count {} to be <= 257",
+ transition_count,
+ );
+
+ // Fill in the transition count.
+ // Since the transition count is always <= 257, we use the most
+ // significant bit to indicate whether this is a match state or
+ // not.
+ let ntrans = if dfa.is_match_state(state.id()) {
+ transition_count | (1 << 15)
+ } else {
+ transition_count
+ };
+ bytes::NE::write_u16(ntrans, &mut sparse[pos..]);
+
+ // zero-fill the actual transitions.
+ // Unwraps are OK since transition_count <= 257 and the minimum
+ // supported usize size is 16 bits.
+ let zeros = usize::try_from(transition_count)
+ .unwrap()
+ .checked_mul(StateID::SIZE)
+ .unwrap();
+ sparse.extend(iter::repeat(0).take(zeros));
+
+ // If this is a match state, write the pattern IDs matched by this
+ // state.
+ if dfa.is_match_state(state.id()) {
+ let plen = dfa.match_pattern_len(state.id());
+ // Write the actual pattern IDs with a u32 length prefix.
+ // First, zero-fill space.
+ let mut pos = sparse.len();
+ // Unwraps are OK since it's guaranteed that plen <=
+ // PatternID::LIMIT, which is in turn guaranteed to fit into a
+ // u32.
+ let zeros = size_of::<u32>()
+ .checked_mul(plen)
+ .unwrap()
+ .checked_add(size_of::<u32>())
+ .unwrap();
+ sparse.extend(iter::repeat(0).take(zeros));
+
+ // Now write the length prefix.
+ bytes::NE::write_u32(
+ // Will never fail since u32::MAX is an invalid pattern ID.
+ // Thus, the number of pattern IDs is representable by a
+ // u32.
+ plen.try_into().expect("pattern ID count fits in u32"),
+ &mut sparse[pos..],
+ );
+ pos += size_of::<u32>();
+
+ // Now write the pattern IDs.
+ for &pid in dfa.pattern_id_slice(state.id()) {
+ pos += bytes::write_pattern_id::<bytes::NE>(
+ pid,
+ &mut sparse[pos..],
+ );
+ }
+ }
+
+ // And now add the accelerator, if one exists. An accelerator is
+ // at most 4 bytes and at least 1 byte. The first byte is the
+ // length, N. N bytes follow the length. The set of bytes that
+ // follow correspond (exhaustively) to the bytes that must be seen
+ // to leave this state.
+ let accel = dfa.accelerator(state.id());
+ sparse.push(accel.len().try_into().unwrap());
+ sparse.extend_from_slice(accel);
+ }
+
+ let mut new = DFA {
+ trans: Transitions {
+ sparse,
+ classes: dfa.byte_classes().clone(),
+ count: dfa.state_count(),
+ patterns: dfa.pattern_count(),
+ },
+ starts: StartTable::from_dense_dfa(dfa, &remap)?,
+ special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
+ };
+ // And here's our second pass. Iterate over all of the dense states
+ // again, and update the transitions in each of the states in the
+ // sparse DFA.
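+ //
+ // As a reminder, the first pass encoded each state as:
+ //
+ //   ntrans: u16 (high bit set if this is a match state)
+ //   input ranges: 2 * ntrans bytes of inclusive (start, end) pairs
+ //   next IDs: StateID::SIZE * ntrans bytes (zero-filled until now)
+ //   pattern IDs: u32 length prefix plus 4 bytes per ID (match states only)
+ //   accelerator: 1 length byte plus up to 3 accelerator bytes
+ //
+ // Only the zero-filled 'next IDs' section needs fixing up here, since
+ // the final state identifiers weren't known until the first pass ended.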
+ for old_state in dfa.states() {
+ let new_id = remap[dfa.to_index(old_state.id())];
+ let mut new_state = new.trans.state_mut(new_id);
+ let sparse = old_state.sparse_transitions();
+ for (i, (_, _, next)) in sparse.enumerate() {
+ let next = remap[dfa.to_index(next)];
+ new_state.set_next_at(i, next);
+ }
+ }
+ trace!(
+ "created sparse DFA, memory usage: {} (dense memory usage: {})",
+ new.memory_usage(),
+ dfa.memory_usage(),
+ );
+ Ok(new)
+ }
+}
+
+impl<T: AsRef<[u8]>> DFA<T> {
+ /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
+ /// DFA returned always uses `&[u8]` for its transitions.
+ pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
+ DFA {
+ trans: self.trans.as_ref(),
+ starts: self.starts.as_ref(),
+ special: self.special,
+ }
+ }
+
+ /// Return an owned version of this sparse DFA. Specifically, the DFA
+ /// returned always uses `Vec<u8>` for its transitions.
+ ///
+ /// Effectively, this returns a sparse DFA whose transitions live on the
+ /// heap.
+ #[cfg(feature = "alloc")]
+ pub fn to_owned(&self) -> DFA<Vec<u8>> {
+ DFA {
+ trans: self.trans.to_owned(),
+ starts: self.starts.to_owned(),
+ special: self.special,
+ }
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent this DFA.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.trans.memory_usage() + self.starts.memory_usage()
+ }
+
+ /// Returns true only if this DFA has starting states for each pattern.
+ ///
+ /// When a DFA has starting states for each pattern, then a search with the
+ /// DFA can be configured to only look for anchored matches of a specific
+ /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
+ /// can accept a non-None `pattern_id` if and only if this method returns
+ /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
+ ///
+ /// Note that if the DFA is empty, this always returns false.
+ pub fn has_starts_for_each_pattern(&self) -> bool {
+ self.starts.patterns > 0
+ }
+}
+
+/// Routines for converting a sparse DFA to other representations, such as raw
+/// bytes suitable for persistent storage.
+impl<T: AsRef<[u8]>> DFA<T> {
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_little_endian would work on a little endian target.
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // No initial padding is added for sparse DFAs, so deserialization
+ /// // can start at the beginning of buf.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_little_endian(&self) -> Vec<u8> {
+ self.to_bytes::<bytes::LE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_big_endian would work on a big endian target.
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // No initial padding is added for sparse DFAs, so deserialization
+ /// // can start at the beginning of buf.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_big_endian(&self) -> Vec<u8> {
+ self.to_bytes::<bytes::BE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the endianness of the target you're compiling the DFA
+ /// for matches the endianness of the target on which you're compiling
+ /// the DFA; for example, when serialization and deserialization happen
+ /// in the same process or on the same machine. Otherwise, when
+ /// serializing a DFA for use in a portable environment, you'll almost
+ /// certainly want to serialize _both_ a little endian and a big endian
+ /// version and then load the correct one based on the target's
+ /// configuration.
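+ ///
+ /// For instance, one way to arrange that (a sketch only; the file names
+ /// here are hypothetical) is to write both variants to disk and let
+ /// conditional compilation embed the matching one:
+ ///
+ /// ```no_run
+ /// # const _: &str = stringify! {
+ /// #[cfg(target_endian = "big")]
+ /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
+ /// #[cfg(target_endian = "little")]
+ /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
+ /// # };
+ /// # static BYTES: &[u8] = b"";
+ /// ```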
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // No initial padding is added for sparse DFAs, so deserialization
+ /// // can start at the beginning of buf.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ pub fn to_bytes_native_endian(&self) -> Vec<u8> {
+ self.to_bytes::<bytes::NE>()
+ }
+
+ /// The implementation of the public `to_bytes` serialization methods,
+ /// which is generic over endianness.
+ #[cfg(feature = "alloc")]
+ fn to_bytes<E: Endian>(&self) -> Vec<u8> {
+ let mut buf = vec![0; self.write_to_len()];
+ // This should always succeed since the only possible serialization
+ // error is providing a buffer that's too small, but we've ensured that
+ // `buf` is big enough here.
+ self.write_to::<E>(&mut buf).unwrap();
+ buf
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in little endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_little_endian would work on a little endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_little_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.write_to::<bytes::LE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in big endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_big_endian would work on a big endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_big_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.write_to::<bytes::BE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in native endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the endianness of the target you're compiling the DFA
+ /// for matches the endianness of the target on which you're compiling
+ /// the DFA; for example, when serialization and deserialization happen
+ /// in the same process or on the same machine. Otherwise, when
+ /// serializing a DFA for use in a portable environment, you'll almost
+ /// certainly want to serialize _both_ a little endian and a big endian
+ /// version and then load the correct one based on the target's
+ /// configuration.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_native_endian( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + self.write_to::<bytes::NE>(dst) + } + + /// The implementation of the public `write_to` serialization methods, + /// which is generic over endianness. + fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let mut nw = 0; + nw += bytes::write_label(LABEL, &mut dst[nw..])?; + nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?; + nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?; + nw += { + // Currently unused, intended for future flexibility + E::write_u32(0, &mut dst[nw..]); + size_of::<u32>() + }; + nw += self.trans.write_to::<E>(&mut dst[nw..])?; + nw += self.starts.write_to::<E>(&mut dst[nw..])?; + nw += self.special.write_to::<E>(&mut dst[nw..])?; + Ok(nw) + } + + /// Return the total number of bytes required to serialize this DFA. + /// + /// This is useful for determining the size of the buffer required to pass + /// to one of the serialization routines: + /// + /// * [`DFA::write_to_little_endian`] + /// * [`DFA::write_to_big_endian`] + /// * [`DFA::write_to_native_endian`] + /// + /// Passing a buffer smaller than the size returned by this method will + /// result in a serialization error. + /// + /// # Example + /// + /// This example shows how to dynamically allocate enough room to serialize + /// a sparse DFA. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, sparse::DFA}, + /// HalfMatch, + /// }; + /// + /// // Compile our original DFA. + /// let original_dfa = DFA::new("foo[0-9]+")?; + /// + /// let mut buf = vec![0; original_dfa.write_to_len()]; + /// let written = original_dfa.write_to_native_endian(&mut buf)?; + /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; + /// + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn write_to_len(&self) -> usize { + bytes::write_label_len(LABEL) + + bytes::write_endianness_check_len() + + bytes::write_version_len() + + size_of::<u32>() // unused, intended for future flexibility + + self.trans.write_to_len() + + self.starts.write_to_len() + + self.special.write_to_len() + } +} + +impl<'a> DFA<&'a [u8]> { + /// Safely deserialize a sparse DFA with a specific state identifier + /// representation. Upon success, this returns both the deserialized DFA + /// and the number of bytes read from the given slice. Namely, the contents + /// of the slice beyond the DFA are not read. + /// + /// Deserializing a DFA using this routine will never allocate heap memory. + /// For safety purposes, the DFA's transitions will be verified such that + /// every transition points to a valid state. If this verification is too + /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which + /// will always execute in constant time. + /// + /// The bytes given must be generated by one of the serialization APIs + /// of a `DFA` using a semver compatible release of this crate. 
Those
+ /// include:
+ ///
+ /// * [`DFA::to_bytes_little_endian`]
+ /// * [`DFA::to_bytes_big_endian`]
+ /// * [`DFA::to_bytes_native_endian`]
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
+ /// `write_to` methods do not allocate and write to an existing slice
+ /// (which may be on the stack). Since deserialization always uses the
+ /// native endianness of the target platform, the serialization API you use
+ /// should match the endianness of the target platform. (It's often a good
+ /// idea to generate serialized DFAs for both forms of endianness and then
+ /// load the correct one based on endianness.)
+ ///
+ /// # Errors
+ ///
+ /// Generally speaking, it's easier to state the conditions in which an
+ /// error is _not_ returned. All of the following must be true:
+ ///
+ /// * The bytes given must be produced by one of the serialization APIs
+ /// on this DFA, as mentioned above.
+ /// * The endianness of the target platform matches the endianness used to
+ /// serialize the provided DFA.
+ ///
+ /// If any of the above are not true, then an error will be returned.
+ ///
+ /// Note that unlike deserializing a
+ /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
+ /// no alignment requirements. That is, an alignment of `1` is valid.
+ ///
+ /// # Panics
+ ///
+ /// This routine will never panic for any input.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize a DFA to raw bytes, deserialize it
+ /// and then use it for searching.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let bytes = initial.to_bytes_native_endian();
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: loading a DFA from static memory
+ ///
+ /// One use case this library supports is the ability to serialize a
+ /// DFA to disk and then use `include_bytes!` to store it in a compiled
+ /// Rust program. Those bytes can then be cheaply deserialized into a
+ /// `DFA` structure at runtime and used for searching without having to
+ /// re-compile the DFA (which can be quite costly).
+ ///
+ /// We can show this in two parts. The first part is serializing the DFA to
+ /// a file:
+ ///
+ /// ```no_run
+ /// use regex_automata::dfa::{Automaton, sparse::DFA};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Write a big endian serialized version of this DFA to a file.
+ /// let bytes = dfa.to_bytes_big_endian();
+ /// std::fs::write("foo.bigendian.dfa", &bytes)?;
+ ///
+ /// // Do it again, but this time for little endian.
+ /// let bytes = dfa.to_bytes_little_endian();
+ /// std::fs::write("foo.littleendian.dfa", &bytes)?;
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And now the second part is embedding the DFA into the compiled program
+ /// and deserializing it at runtime on first use. We use conditional
+ /// compilation to choose the correct endianness. As mentioned above, we
+ /// do not need to employ any special tricks to ensure a proper alignment,
+ /// since a sparse DFA has no alignment requirements.
+ ///
+ /// ```no_run
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse},
+ /// HalfMatch,
+ /// };
+ ///
+ /// type DFA = sparse::DFA<&'static [u8]>;
+ ///
+ /// fn get_foo() -> &'static DFA {
+ /// use std::cell::Cell;
+ /// use std::mem::MaybeUninit;
+ /// use std::sync::Once;
+ ///
+ /// # const _: &str = stringify! {
+ /// #[cfg(target_endian = "big")]
+ /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
+ /// #[cfg(target_endian = "little")]
+ /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
+ /// # };
+ /// # static BYTES: &[u8] = b"";
+ ///
+ /// struct Lazy(Cell<MaybeUninit<DFA>>);
+ /// // SAFETY: This is safe because DFA impls Sync.
+ /// unsafe impl Sync for Lazy {}
+ ///
+ /// static INIT: Once = Once::new();
+ /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
+ ///
+ /// INIT.call_once(|| {
+ /// let (dfa, _) = DFA::from_bytes(BYTES)
+ /// .expect("serialized DFA should be valid");
+ /// // SAFETY: This is guaranteed to only execute once, and all
+ /// // we do with the pointer is write the DFA to it.
+ /// unsafe {
+ /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
+ /// }
+ /// });
+ /// // SAFETY: DFA is guaranteed to be initialized via INIT and is
+ /// // stored in static memory.
+ /// unsafe {
+ /// let dfa = (*DFA.0.as_ptr()).as_ptr();
+ /// std::mem::transmute::<*const DFA, &'static DFA>(dfa)
+ /// }
+ /// }
+ ///
+ /// let dfa = get_foo();
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
+ /// ```
+ ///
+ /// Alternatively, consider using
+ /// [`lazy_static`](https://crates.io/crates/lazy_static)
+ /// or
+ /// [`once_cell`](https://crates.io/crates/once_cell),
+ /// which will guarantee safety for you.
+ pub fn from_bytes(
+ slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
+ // SAFETY: This is safe because we validate both the sparse transitions
+ // (by trying to decode every state) and the start state ID list below.
+ // If either validation fails, then we return an error.
+ let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
+ dfa.trans.validate()?;
+ dfa.starts.validate(&dfa.trans)?;
+ // N.B. dfa.special doesn't have a way to do unchecked deserialization,
+ // so it has already been validated.
+ Ok((dfa, nread))
+ }
+
+ /// Deserialize a DFA with a specific state identifier representation in
+ /// constant time by omitting the verification of the validity of the
+ /// sparse transitions.
+ ///
+ /// This is just like [`DFA::from_bytes`], except it can potentially return
+ /// a DFA that exhibits undefined behavior if its transitions contain
+ /// invalid state identifiers.
+ ///
+ /// This routine is useful if you need to deserialize a DFA cheaply and
+ /// cannot afford the transition validation performed by `from_bytes`.
+ ///
+ /// # Safety
+ ///
+ /// This routine is unsafe because it permits callers to provide
+ /// arbitrary transitions with possibly incorrect state identifiers. While
+ /// the various serialization routines will never return an incorrect
+ /// DFA, there is no guarantee that the bytes provided here
+ /// are correct. While `from_bytes_unchecked` will still do several forms
+ /// of basic validation, this routine does not check that the transitions
+ /// themselves are correct. Given an incorrect transition table, it is
+ /// possible for the search routines to access out-of-bounds memory because
+ /// of explicit bounds check elision.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// HalfMatch,
+ /// };
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let bytes = initial.to_bytes_native_endian();
+ /// // SAFETY: This is guaranteed to be safe since the bytes given come
+ /// // directly from a compatible serialization routine.
+ /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
+ ///
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub unsafe fn from_bytes_unchecked(
+ slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
+ let mut nr = 0;
+
+ nr += bytes::read_label(&slice[nr..], LABEL)?;
+ nr += bytes::read_endianness_check(&slice[nr..])?;
+ nr += bytes::read_version(&slice[nr..], VERSION)?;
+
+ let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
+ nr += size_of::<u32>();
+
+ let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
+ nr += nread;
+
+ let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
+ nr += nread;
+
+ let (special, nread) = Special::from_bytes(&slice[nr..])?;
+ nr += nread;
+ if special.max.as_usize() >= trans.sparse().len() {
+ return Err(DeserializeError::generic(
+ "max should not be greater than or equal to sparse bytes",
+ ));
+ }
+
+ Ok((DFA { trans, starts, special }, nr))
+ }
+}
+
+impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "sparse::DFA(")?;
+ for state in self.trans.states() {
+ fmt_state_indicator(f, self, state.id())?;
+ writeln!(f, "{:06?}: {:?}", state.id(), state)?;
+ }
+ writeln!(f)?;
+ for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() {
+ if i % self.starts.stride == 0 {
+ match pid {
+ None => writeln!(f, "START-GROUP(ALL)")?,
+ Some(pid) => {
+ writeln!(f, "START-GROUP(pattern: {:?})", pid)?
+ }
+ }
+ }
+ writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?;
+ }
+ writeln!(f, "state count: {:?}", self.trans.count)?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
+ #[inline]
+ fn is_special_state(&self, id: StateID) -> bool {
+ self.special.is_special_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: StateID) -> bool {
+ self.special.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_quit_state(&self, id: StateID) -> bool {
+ self.special.is_quit_state(id)
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: StateID) -> bool {
+ self.special.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_start_state(&self, id: StateID) -> bool {
+ self.special.is_start_state(id)
+ }
+
+ #[inline]
+ fn is_accel_state(&self, id: StateID) -> bool {
+ self.special.is_accel_state(id)
+ }
+
+ // This is marked as inline to help dramatically boost sparse searching,
+ // which decodes each state it enters to follow the next transition.
+ #[inline(always)] + fn next_state(&self, current: StateID, input: u8) -> StateID { + let input = self.trans.classes.get(input); + self.trans.state(current).next(input) + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: StateID, + input: u8, + ) -> StateID { + self.next_state(current, input) + } + + #[inline] + fn next_eoi_state(&self, current: StateID) -> StateID { + self.trans.state(current).next_eoi() + } + + #[inline] + fn pattern_count(&self) -> usize { + self.trans.patterns + } + + #[inline] + fn match_count(&self, id: StateID) -> usize { + self.trans.state(id).pattern_count() + } + + #[inline] + fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { + // This is an optimization for the very common case of a DFA with a + // single pattern. This conditional avoids a somewhat more costly path + // that finds the pattern ID from the state machine, which requires + // a bit of slicing/pointer-chasing. This optimization tends to only + // matter when matches are frequent. + if self.trans.patterns == 1 { + return PatternID::ZERO; + } + self.trans.state(id).pattern_id(match_index) + } + + #[inline] + fn start_state_forward( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + let index = Start::from_position_fwd(bytes, start, end); + self.starts.start(index, pattern_id) + } + + #[inline] + fn start_state_reverse( + &self, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> StateID { + let index = Start::from_position_rev(bytes, start, end); + self.starts.start(index, pattern_id) + } + + #[inline] + fn accelerator(&self, id: StateID) -> &[u8] { + self.trans.state(id).accelerator() + } +} + +/// The transition table portion of a sparse DFA. +/// +/// The transition table is the core part of the DFA in that it describes how +/// to move from one state to another based on the input sequence observed. +/// +/// Unlike a typical dense table based DFA, states in a sparse transition +/// table have variable size. That is, states with more transitions use more +/// space than states with fewer transitions. This means that finding the next +/// transition takes more work than with a dense DFA, but also typically uses +/// much less space. +#[derive(Clone)] +struct Transitions<T> { + /// The raw encoding of each state in this DFA. + /// + /// Each state has the following information: + /// + /// * A set of transitions to subsequent states. Transitions to the dead + /// state are omitted. + /// * If the state can be accelerated, then any additional accelerator + /// information. + /// * If the state is a match state, then the state contains all pattern + /// IDs that match when in that state. + /// + /// To decode a state, use Transitions::state. + /// + /// In practice, T is either Vec<u8> or &[u8]. + sparse: T, + /// A set of equivalence classes, where a single equivalence class + /// represents a set of bytes that never discriminate between a match + /// and a non-match in the DFA. Each equivalence class corresponds to a + /// single character in this DFA's alphabet, where the maximum number of + /// characters is 257 (each possible value of a byte plus the special + /// EOI transition). Consequently, the number of equivalence classes + /// corresponds to the number of transitions for each DFA state. Note + /// though that the *space* used by each DFA state in the transition table + /// may be larger. 
(A sparse DFA has no fixed
+ /// stride, though: the space each state uses depends on its transition
+ /// count and on any match or accelerator data it carries.)
+ ///
+ /// The only time the number of equivalence classes is fewer than 257 is
+ /// if the DFA's kind uses byte classes, which is the default. Equivalence
+ /// classes should generally only be disabled when debugging, so that
+ /// the transitions themselves aren't obscured. Disabling them has no
+ /// other benefit, since the equivalence class map is always used while
+ /// searching. In the vast majority of cases, the number of equivalence
+ /// classes is substantially smaller than 257, particularly when large
+ /// Unicode classes aren't used.
+ ///
+ /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
+ /// in the current implementation, since equivalence classes generally tend
+ /// to correspond to contiguous ranges of bytes that map to the same
+ /// transition. So in a sparse DFA, equivalence classes don't really lead
+ /// to space savings. In the future, it would be good to try and remove
+ /// them from sparse DFAs entirely, but that requires a bit of work since
+ /// sparse DFAs are built from dense DFAs, which are in turn built on top
+ /// of equivalence classes.
+ classes: ByteClasses,
+ /// The total number of states in this DFA. Note that a DFA always has at
+ /// least one state---the dead state---even for the empty DFA. In
+ /// particular, the dead state always has ID 0 and is correspondingly
+ /// always the first state. The dead state is never a match state.
+ count: usize,
+ /// The total number of unique patterns represented by these match states.
+ patterns: usize,
+}
+
+impl<'a> Transitions<&'a [u8]> {
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr() as usize;
+
+ let (state_count, nr) =
+ bytes::try_read_u32_as_usize(&slice, "state count")?;
+ slice = &slice[nr..];
+
+ let (pattern_count, nr) =
+ bytes::try_read_u32_as_usize(&slice, "pattern count")?;
+ slice = &slice[nr..];
+
+ let (classes, nr) = ByteClasses::from_bytes(&slice)?;
+ slice = &slice[nr..];
+
+ let (len, nr) =
+ bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?;
+ slice = &slice[nr..];
+
+ bytes::check_slice_len(slice, len, "sparse states byte length")?;
+ let sparse = &slice[..len];
+ slice = &slice[len..];
+
+ let trans = Transitions {
+ sparse,
+ classes,
+ count: state_count,
+ patterns: pattern_count,
+ };
+ Ok((trans, slice.as_ptr() as usize - slice_start))
+ }
+}
+
+impl<T: AsRef<[u8]>> Transitions<T> {
+ /// Writes a serialized form of this transition table to the buffer given.
+ /// If the buffer is too small, then an error is returned. To determine
+ /// how big the buffer must be, use `write_to_len`.
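+ ///
+ /// The format written here mirrors what `from_bytes_unchecked` above
+ /// reads back: the state count (u32), the pattern count (u32), the byte
+ /// class map, the length of the sparse transitions in bytes (u32) and
+ /// finally the raw sparse transition bytes, with all integers in the
+ /// chosen endianness.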
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small(
+ "sparse transition table",
+ ));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write state count
+ E::write_u32(u32::try_from(self.count).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write pattern count
+ E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write byte class map
+ let n = self.classes.write_to(dst)?;
+ dst = &mut dst[n..];
+
+ // write number of bytes in sparse transitions
+ E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write actual transitions
+ dst.copy_from_slice(self.sparse());
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of this transition
+ /// table will use.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // state count
+ + size_of::<u32>() // pattern count
+ + self.classes.write_to_len()
+ + size_of::<u32>() // sparse transitions length
+ + self.sparse().len()
+ }
+
+ /// Validates that every state ID in this transition table is valid.
+ ///
+ /// That is, every state ID can be used to correctly index a state in this
+ /// table.
+ fn validate(&self) -> Result<(), DeserializeError> {
+ // In order to validate everything, we not only need to make sure we
+ // can decode every state, but that every transition in every state
+ // points to a valid state. There are many duplicative transitions, so
+ // we record state IDs that we've verified so that we don't redo the
+ // decoding work.
+ //
+ // Except that in no_std mode, we don't have dynamic memory allocation
+ // available to us, so we skip this optimization. It's not clear
+ // whether doing something more clever is worth it just yet. If you're
+ // profiling this code and need it to run faster, please file an issue.
+ //
+ // ---AG
+ struct Seen {
+ #[cfg(feature = "alloc")]
+ set: BTreeSet<StateID>,
+ #[cfg(not(feature = "alloc"))]
+ set: core::marker::PhantomData<StateID>,
+ }
+
+ #[cfg(feature = "alloc")]
+ impl Seen {
+ fn new() -> Seen {
+ Seen { set: BTreeSet::new() }
+ }
+ fn insert(&mut self, id: StateID) {
+ self.set.insert(id);
+ }
+ fn contains(&self, id: &StateID) -> bool {
+ self.set.contains(id)
+ }
+ }
+
+ #[cfg(not(feature = "alloc"))]
+ impl Seen {
+ fn new() -> Seen {
+ Seen { set: core::marker::PhantomData }
+ }
+ fn insert(&mut self, _id: StateID) {}
+ fn contains(&self, _id: &StateID) -> bool {
+ false
+ }
+ }
+
+ let mut verified: Seen = Seen::new();
+ // We need to make sure that we decode the correct number of states.
+ // Otherwise, an empty set of transitions would validate even if the
+ // recorded state count is non-zero.
+ let mut count = 0;
+ // We can't use the self.states() iterator because it assumes the state
+ // encodings are valid. It could panic if they aren't.
+ let mut id = DEAD;
+ while id.as_usize() < self.sparse().len() {
+ let state = self.try_state(id)?;
+ verified.insert(id);
+ // The next ID should be the offset immediately following `state`.
+ id = StateID::new(bytes::add(
+ id.as_usize(),
+ state.bytes_len(),
+ "next state ID offset",
+ )?)
+ .map_err(|err| {
+ DeserializeError::state_id_error(err, "next state ID offset")
+ })?;
+ count += 1;
+
+ // Now check that all transitions in this state are correct.
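+ //
+ // Note that this need not recurse: the outer loop visits every
+ // state anyway, so it suffices to check that each target ID
+ // decodes to a valid state.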
+ for i in 0..state.ntrans {
+ let to = state.next_at(i);
+ if verified.contains(&to) {
+ continue;
+ }
+ let _ = self.try_state(to)?;
+ // Record the target, not `id`, so that duplicative
+ // transitions to `to` skip the decoding work above.
+ verified.insert(to);
+ }
+ }
+ if count != self.count {
+ return Err(DeserializeError::generic(
+ "mismatching sparse state count",
+ ));
+ }
+ Ok(())
+ }
+
+ /// Converts these transitions to a borrowed value.
+ fn as_ref(&self) -> Transitions<&'_ [u8]> {
+ Transitions {
+ sparse: self.sparse(),
+ classes: self.classes.clone(),
+ count: self.count,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Converts these transitions to an owned value.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> Transitions<Vec<u8>> {
+ Transitions {
+ sparse: self.sparse().to_vec(),
+ classes: self.classes.clone(),
+ count: self.count,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Return a convenient representation of the given state.
+ ///
+ /// This panics if the state is invalid.
+ ///
+ /// This is marked as inline to help dramatically boost sparse searching,
+ /// which decodes each state it enters to follow the next transition. Other
+ /// functions involved are also inlined, which should hopefully eliminate
+ /// a lot of the extraneous decoding that is never needed just to follow
+ /// the next transition.
+ #[inline(always)]
+ fn state(&self, id: StateID) -> State<'_> {
+ let mut state = &self.sparse()[id.as_usize()..];
+ let mut ntrans = bytes::read_u16(&state) as usize;
+ let is_match = (1 << 15) & ntrans != 0;
+ ntrans &= !(1 << 15);
+ state = &state[2..];
+
+ let (input_ranges, state) = state.split_at(ntrans * 2);
+ let (next, state) = state.split_at(ntrans * StateID::SIZE);
+ let (pattern_ids, state) = if is_match {
+ let npats = bytes::read_u32(&state) as usize;
+ state[4..].split_at(npats * 4)
+ } else {
+ (&[][..], state)
+ };
+
+ let accel_len = state[0] as usize;
+ let accel = &state[1..accel_len + 1];
+ State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
+ }
+
+ /// Like `state`, but will return an error if the state encoding is
+ /// invalid. This is useful for verifying states after deserialization,
+ /// which is required for a safe deserialization API.
+ ///
+ /// Note that this only verifies that this state is decodable and that
+ /// all of its data is consistent. It does not verify that its state ID
+ /// transitions point to valid states themselves, nor does it verify that
+ /// every pattern ID is valid.
+ fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> {
+ if id.as_usize() > self.sparse().len() {
+ return Err(DeserializeError::generic("invalid sparse state ID"));
+ }
+ let mut state = &self.sparse()[id.as_usize()..];
+ // The encoding format starts with a u16 that stores the total number
+ // of transitions in this state.
+ let (mut ntrans, _) =
+ bytes::try_read_u16_as_usize(state, "state transition count")?;
+ let is_match = ((1 << 15) & ntrans) != 0;
+ ntrans &= !(1 << 15);
+ state = &state[2..];
+ if ntrans > 257 || ntrans == 0 {
+ return Err(DeserializeError::generic("invalid transition count"));
+ }
+
+ // Each transition has two pieces: an inclusive range of bytes on which
+ // it is defined, and the state ID that those bytes transition to. The
+ // pairs come first, followed by a corresponding sequence of state IDs.
+ let input_ranges_len = ntrans.checked_mul(2).unwrap();
+ bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
+ let (input_ranges, state) = state.split_at(input_ranges_len);
+ // Every range should be of the form A-B, where A<=B.
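+ // (Ranges are stored as adjacent pairs: start0, end0, start1, end1,
+ // and so on.)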
+ for pair in input_ranges.chunks(2) {
+ let (start, end) = (pair[0], pair[1]);
+ if start > end {
+ return Err(DeserializeError::generic("invalid input range"));
+ }
+ }
+
+ // And now extract the corresponding sequence of state IDs. We leave
+ // this sequence as a &[u8] instead of a &[StateID] because sparse DFAs
+ // do not have any alignment requirements.
+ let next_len = ntrans
+ .checked_mul(self.id_len())
+ .expect("state size * #trans should always fit in a usize");
+ bytes::check_slice_len(state, next_len, "sparse trans state IDs")?;
+ let (next, state) = state.split_at(next_len);
+ // We can at least verify that every state ID is in bounds.
+ for idbytes in next.chunks(self.id_len()) {
+ let (id, _) =
+ bytes::read_state_id(idbytes, "sparse state ID in try_state")?;
+ bytes::check_slice_len(
+ self.sparse(),
+ id.as_usize(),
+ "invalid sparse state ID",
+ )?;
+ }
+
+ // If this is a match state, then read the pattern IDs for this state.
+ // Pattern IDs are stored as a u32-length-prefixed sequence of native
+ // endian encoded 32-bit integers.
+ let (pattern_ids, state) = if is_match {
+ let (npats, nr) =
+ bytes::try_read_u32_as_usize(state, "pattern ID count")?;
+ let state = &state[nr..];
+
+ let pattern_ids_len =
+ bytes::mul(npats, 4, "sparse pattern ID byte length")?;
+ bytes::check_slice_len(
+ state,
+ pattern_ids_len,
+ "sparse pattern IDs",
+ )?;
+ let (pattern_ids, state) = state.split_at(pattern_ids_len);
+ for patbytes in pattern_ids.chunks(PatternID::SIZE) {
+ bytes::read_pattern_id(
+ patbytes,
+ "sparse pattern ID in try_state",
+ )?;
+ }
+ (pattern_ids, state)
+ } else {
+ (&[][..], state)
+ };
+
+ // Now read this state's accelerator info. The first byte is the length
+ // of the accelerator, which is typically 0 (for no acceleration) but
+ // is no bigger than 3. The length indicates the number of bytes that
+ // follow, where each byte corresponds to a transition out of this
+ // state.
+ if state.is_empty() {
+ return Err(DeserializeError::generic("no accelerator length"));
+ }
+ let (accel_len, state) = (state[0] as usize, &state[1..]);
+
+ if accel_len > 3 {
+ return Err(DeserializeError::generic(
+ "sparse invalid accelerator length",
+ ));
+ }
+ bytes::check_slice_len(
+ state,
+ accel_len,
+ "sparse corrupt accelerator length",
+ )?;
+ let (accel, _) = (&state[..accel_len], &state[accel_len..]);
+
+ Ok(State {
+ id,
+ is_match,
+ ntrans,
+ input_ranges,
+ next,
+ pattern_ids,
+ accel,
+ })
+ }
+
+ /// Return an iterator over all of the states in this DFA.
+ ///
+ /// The iterator returned yields `State` values, each of which carries
+ /// its own state ID.
+ fn states(&self) -> StateIter<'_, T> {
+ StateIter { trans: self, id: DEAD.as_usize() }
+ }
+
+ /// Returns the sparse transitions as raw bytes.
+ fn sparse(&self) -> &[u8] {
+ self.sparse.as_ref()
+ }
+
+ /// Returns the number of bytes represented by a single state ID.
+ fn id_len(&self) -> usize {
+ StateID::SIZE
+ }
+
+ /// Return the memory usage, in bytes, of these transitions.
+ ///
+ /// This does not include the size of a `Transitions` value itself.
+ fn memory_usage(&self) -> usize {
+ self.sparse().len()
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u8]>> Transitions<T> {
+ /// Return a convenient mutable representation of the given state.
+ ///
+ /// This panics if the state is invalid.
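+ ///
+ /// This mirrors `state` above, but borrows the underlying bytes mutably
+ /// so that the second pass of `DFA::from_dense` can patch in the final
+ /// transition IDs via `StateMut::set_next_at`.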
+ fn state_mut(&mut self, id: StateID) -> StateMut<'_> { + let mut state = &mut self.sparse_mut()[id.as_usize()..]; + let mut ntrans = bytes::read_u16(&state) as usize; + let is_match = (1 << 15) & ntrans != 0; + ntrans &= !(1 << 15); + state = &mut state[2..]; + + let (input_ranges, state) = state.split_at_mut(ntrans * 2); + let (next, state) = state.split_at_mut(ntrans * StateID::SIZE); + let (pattern_ids, state) = if is_match { + let npats = bytes::read_u32(&state) as usize; + state[4..].split_at_mut(npats * 4) + } else { + (&mut [][..], state) + }; + + let accel_len = state[0] as usize; + let accel = &mut state[1..accel_len + 1]; + StateMut { + id, + is_match, + ntrans, + input_ranges, + next, + pattern_ids, + accel, + } + } + + /// Returns the sparse transitions as raw mutable bytes. + fn sparse_mut(&mut self) -> &mut [u8] { + self.sparse.as_mut() + } +} + +/// The set of all possible starting states in a DFA. +/// +/// See the eponymous type in the `dense` module for more details. This type +/// is very similar to `dense::StartTable`, except that its underlying +/// representation is `&[u8]` instead of `&[S]`. (The latter would require +/// sparse DFAs to be aligned, which is explicitly something we do not require +/// because we don't really need it.) +#[derive(Clone)] +struct StartTable<T> { + /// The initial start state IDs as a contiguous table of native endian + /// encoded integers, represented by `S`. + /// + /// In practice, T is either Vec<u8> or &[u8] and has no alignment + /// requirements. + /// + /// The first `stride` (currently always 4) entries always correspond to + /// the start states for the entire DFA. After that, there are + /// `stride * patterns` state IDs, where `patterns` may be zero in the + /// case of a DFA with no patterns or in the case where the DFA was built + /// without enabling starting states for each pattern. + table: T, + /// The number of starting state IDs per pattern. + stride: usize, + /// The total number of patterns for which starting states are encoded. + /// This may be zero for non-empty DFAs when the DFA was built without + /// start states for each pattern. + patterns: usize, +} + +#[cfg(feature = "alloc")] +impl StartTable<Vec<u8>> { + fn new(patterns: usize) -> StartTable<Vec<u8>> { + let stride = Start::count(); + // This is OK since the only way we're here is if a dense DFA could be + // constructed successfully, which uses the same space. + let len = stride + .checked_mul(patterns) + .unwrap() + .checked_add(stride) + .unwrap() + .checked_mul(StateID::SIZE) + .unwrap(); + StartTable { table: vec![0; len], stride, patterns } + } + + fn from_dense_dfa<T: AsRef<[u32]>>( + dfa: &dense::DFA<T>, + remap: &[StateID], + ) -> Result<StartTable<Vec<u8>>, Error> { + // Unless the DFA has start states compiled for each pattern, then + // as far as the starting state table is concerned, there are zero + // patterns to account for. It will instead only store starting states + // for the entire DFA. 
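+ //
+ // (Recall the table layout: one stride of start states for the whole
+ // DFA comes first, followed by one stride per pattern, if any.)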
+ let start_pattern_count = if dfa.has_starts_for_each_pattern() { + dfa.pattern_count() + } else { + 0 + }; + let mut sl = StartTable::new(start_pattern_count); + for (old_start_id, sty, pid) in dfa.starts() { + let new_start_id = remap[dfa.to_index(old_start_id)]; + sl.set_start(sty, pid, new_start_id); + } + Ok(sl) + } +} + +impl<'a> StartTable<&'a [u8]> { + unsafe fn from_bytes_unchecked( + mut slice: &'a [u8], + ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> { + let slice_start = slice.as_ptr() as usize; + + let (stride, nr) = + bytes::try_read_u32_as_usize(slice, "sparse start table stride")?; + slice = &slice[nr..]; + + let (patterns, nr) = bytes::try_read_u32_as_usize( + slice, + "sparse start table patterns", + )?; + slice = &slice[nr..]; + + if stride != Start::count() { + return Err(DeserializeError::generic( + "invalid sparse starting table stride", + )); + } + if patterns > PatternID::LIMIT { + return Err(DeserializeError::generic( + "sparse invalid number of patterns", + )); + } + let pattern_table_size = + bytes::mul(stride, patterns, "sparse invalid pattern count")?; + // Our start states always start with a single stride of start states + // for the entire automaton which permit it to match any pattern. What + // follows it are an optional set of start states for each pattern. + let start_state_count = bytes::add( + stride, + pattern_table_size, + "sparse invalid 'any' pattern starts size", + )?; + let table_bytes_len = bytes::mul( + start_state_count, + StateID::SIZE, + "sparse pattern table bytes length", + )?; + bytes::check_slice_len( + slice, + table_bytes_len, + "sparse start ID table", + )?; + let table_bytes = &slice[..table_bytes_len]; + slice = &slice[table_bytes_len..]; + + let sl = StartTable { table: table_bytes, stride, patterns }; + Ok((sl, slice.as_ptr() as usize - slice_start)) + } +} + +impl<T: AsRef<[u8]>> StartTable<T> { + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "sparse starting table ids", + )); + } + dst = &mut dst[..nwrite]; + + // write stride + E::write_u32(u32::try_from(self.stride).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + // write pattern count + E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + // write start IDs + dst.copy_from_slice(self.table()); + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of this transition + /// table will use. + fn write_to_len(&self) -> usize { + size_of::<u32>() // stride + + size_of::<u32>() // # patterns + + self.table().len() + } + + /// Validates that every starting state ID in this table is valid. + /// + /// That is, every starting state ID can be used to correctly decode a + /// state in the DFA's sparse transitions. + fn validate( + &self, + trans: &Transitions<T>, + ) -> Result<(), DeserializeError> { + for (id, _, _) in self.iter() { + let _ = trans.try_state(id)?; + } + Ok(()) + } + + /// Converts this start list to a borrowed value. + fn as_ref(&self) -> StartTable<&'_ [u8]> { + StartTable { + table: self.table(), + stride: self.stride, + patterns: self.patterns, + } + } + + /// Converts this start list to an owned value. 
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> StartTable<Vec<u8>> {
+ StartTable {
+ table: self.table().to_vec(),
+ stride: self.stride,
+ patterns: self.patterns,
+ }
+ }
+
+ /// Return the start state for the given index and pattern ID. If the
+ /// pattern ID is None, then the corresponding start state for the entire
+ /// DFA is returned. If the pattern ID is not None, then the corresponding
+ /// starting state for the given pattern is returned. If this start table
+ /// does not have individual starting states for each pattern, then this
+ /// panics.
+ fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
+ let start_index = index.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => {
+ let pid = pid.as_usize();
+ assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+ self.stride
+ .checked_mul(pid)
+ .unwrap()
+ .checked_add(self.stride)
+ .unwrap()
+ .checked_add(start_index)
+ .unwrap()
+ }
+ };
+ let start = index * StateID::SIZE;
+ // This is OK since we're allowed to assume that the start table
+ // contains valid StateIDs.
+ bytes::read_state_id_unchecked(&self.table()[start..]).0
+ }
+
+ /// Return an iterator over all start IDs in this table.
+ fn iter(&self) -> StartStateIter<'_, T> {
+ StartStateIter { st: self, i: 0 }
+ }
+
+ /// Returns the total number of start state IDs in this table.
+ fn len(&self) -> usize {
+ self.table().len() / StateID::SIZE
+ }
+
+ /// Returns the table as a raw slice of bytes.
+ fn table(&self) -> &[u8] {
+ self.table.as_ref()
+ }
+
+ /// Return the memory usage, in bytes, of this start list.
+ ///
+ /// This does not include the size of a `StartTable` value itself.
+ fn memory_usage(&self) -> usize {
+ self.table().len()
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u8]>> StartTable<T> {
+ /// Set the start state for the given index and pattern.
+ ///
+ /// If the pattern ID or state ID is not valid, then this will panic.
+ fn set_start(
+ &mut self,
+ index: Start,
+ pattern_id: Option<PatternID>,
+ id: StateID,
+ ) {
+ let start_index = index.as_usize();
+ let index = match pattern_id {
+ None => start_index,
+ Some(pid) => {
+ let pid = pid.as_usize();
+ assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+ self.stride
+ .checked_mul(pid)
+ .unwrap()
+ .checked_add(self.stride)
+ .unwrap()
+ .checked_add(start_index)
+ .unwrap()
+ }
+ };
+ let start = index * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ bytes::write_state_id::<bytes::NE>(
+ id,
+ &mut self.table.as_mut()[start..end],
+ );
+ }
+}
+
+/// An iterator over all start state IDs in a sparse DFA.
+struct StartStateIter<'a, T> {
+ st: &'a StartTable<T>,
+ i: usize,
+}
+
+impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
+ type Item = (StateID, Start, Option<PatternID>);
+
+ fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
+ let i = self.i;
+ if i >= self.st.len() {
+ return None;
+ }
+ self.i += 1;
+
+ // This unwrap is okay since the stride of any DFA must always match
+ // the number of start state types.
+ let start_type = Start::from_usize(i % self.st.stride).unwrap();
+ let pid = if i < self.st.stride {
+ // This means we don't have start states for each pattern.
+ None
+ } else {
+ // These unwraps are OK since we may assume our table and stride
+ // are correct.
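+ // Entry i belongs to pattern (i - stride) / stride, since the
+ // first stride entries cover the whole DFA.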
+ let pid = i
+ .checked_sub(self.st.stride)
+ .unwrap()
+ .checked_div(self.st.stride)
+ .unwrap();
+ Some(PatternID::new(pid).unwrap())
+ };
+ let start = i * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ let bytes = self.st.table()[start..end].try_into().unwrap();
+ // This is OK since we're allowed to assume that any IDs in this start
+ // table are correct and valid for this DFA.
+ let id = StateID::from_ne_bytes_unchecked(bytes);
+ Some((id, start_type, pid))
+ }
+}
+
+impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("StartStateIter").field("i", &self.i).finish()
+ }
+}
+
+/// An iterator over all states in a sparse DFA.
+///
+/// This iterator yields each state in the DFA in turn. The identifier of
+/// each yielded state is available via `State::id`.
+struct StateIter<'a, T> {
+ trans: &'a Transitions<T>,
+ id: usize,
+}
+
+impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
+ type Item = State<'a>;
+
+ fn next(&mut self) -> Option<State<'a>> {
+ if self.id >= self.trans.sparse().len() {
+ return None;
+ }
+ let state = self.trans.state(StateID::new_unchecked(self.id));
+ self.id = self.id + state.bytes_len();
+ Some(state)
+ }
+}
+
+impl<'a, T> fmt::Debug for StateIter<'a, T> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("StateIter").field("id", &self.id).finish()
+ }
+}
+
+/// A representation of a sparse DFA state that can be cheaply materialized
+/// from a state identifier.
+#[derive(Clone)]
+struct State<'a> {
+ /// The identifier of this state.
+ id: StateID,
+ /// Whether this is a match state or not.
+ is_match: bool,
+ /// The number of transitions in this state.
+ ntrans: usize,
+ /// Pairs of input ranges, where there is one pair for each transition.
+ /// Each pair specifies an inclusive start and end byte range for the
+ /// corresponding transition.
+ input_ranges: &'a [u8],
+ /// Transitions to the next state. This slice contains native endian
+ /// encoded state identifiers, with `StateID` as the representation. Thus,
+ /// there are `ntrans * StateID::SIZE` bytes in this slice.
+ next: &'a [u8],
+ /// If this is a match state, then this contains the pattern IDs that match
+ /// when the DFA is in this state.
+ ///
+ /// This is a contiguous sequence of 32-bit native endian encoded integers.
+ pattern_ids: &'a [u8],
+ /// An accelerator for this state, if present. If this state has no
+ /// accelerator, then this is an empty slice. When non-empty, this slice
+ /// has length at most 3 and corresponds to the exhaustive set of bytes
+ /// that must be seen in order to transition out of this state.
+ accel: &'a [u8],
+}
+
+impl<'a> State<'a> {
+ /// Searches for the next transition given an input byte. If no such
+ /// transition could be found, then a dead state is returned.
+ ///
+ /// This is marked as inline to help dramatically boost sparse searching,
+ /// which decodes each state it enters to follow the next transition.
+ #[inline(always)]
+ fn next(&self, input: u8) -> StateID {
+ // This straight linear search was observed to be much better than
+ // binary search on ASCII haystacks, likely because a binary search
+ // visits the ASCII case last but a linear search sees it first. A
+ // binary search does do a little better on non-ASCII haystacks, but
+ // not by much. There might be a better trade off lurking here.
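+ //
+ // (A hedged illustration: for a state whose range pairs are [a-c]
+ // and [x-z], input b'y' fails the first pair, matches the second
+ // and returns next_at(1), while input b'!' matches neither and
+ // falls through to DEAD.)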
+ for i in 0..(self.ntrans - 1) {
+ let (start, end) = self.range(i);
+ if start <= input && input <= end {
+ return self.next_at(i);
+ }
+ // We could bail early with an extra branch: if input < start, then
+ // we know we'll never find a matching transition. Interestingly,
+ // this extra branch seems to not help performance, or will even
+ // hurt it. It's likely very dependent on the DFA itself and what
+ // is being searched.
+ }
+ DEAD
+ }
+
+ /// Returns the next state ID for the special EOI transition.
+ fn next_eoi(&self) -> StateID {
+ self.next_at(self.ntrans - 1)
+ }
+
+ /// Returns the identifier for this state.
+ fn id(&self) -> StateID {
+ self.id
+ }
+
+ /// Returns the inclusive input byte range for the ith transition in this
+ /// state.
+ fn range(&self, i: usize) -> (u8, u8) {
+ (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
+ }
+
+ /// Returns the next state for the ith transition in this state.
+ fn next_at(&self, i: usize) -> StateID {
+ let start = i * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ let bytes = self.next[start..end].try_into().unwrap();
+ StateID::from_ne_bytes_unchecked(bytes)
+ }
+
+ /// Returns the pattern ID for the given match index. If the match index
+ /// is invalid, then this panics.
+ fn pattern_id(&self, match_index: usize) -> PatternID {
+ let start = match_index * PatternID::SIZE;
+ bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
+ }
+
+ /// Returns the total number of pattern IDs for this state. This is always
+ /// zero when `is_match` is false.
+ fn pattern_count(&self) -> usize {
+ assert_eq!(0, self.pattern_ids.len() % 4);
+ self.pattern_ids.len() / 4
+ }
+
+ /// Return the total number of bytes that this state consumes in its
+ /// encoded form.
+ fn bytes_len(&self) -> usize {
+ let mut len = 2
+ + (self.ntrans * 2)
+ + (self.ntrans * StateID::SIZE)
+ + (1 + self.accel.len());
+ if self.is_match {
+ len += size_of::<u32>() + self.pattern_ids.len();
+ }
+ len
+ }
+
+ /// Return an accelerator for this state.
+ fn accelerator(&self) -> &'a [u8] {
+ self.accel
+ }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let mut printed = false;
+ for i in 0..(self.ntrans - 1) {
+ let next = self.next_at(i);
+ if next == DEAD {
+ continue;
+ }
+
+ if printed {
+ write!(f, ", ")?;
+ }
+ let (start, end) = self.range(i);
+ if start == end {
+ write!(f, "{:?} => {:?}", DebugByte(start), next)?;
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next,
+ )?;
+ }
+ printed = true;
+ }
+ let eoi = self.next_at(self.ntrans - 1);
+ if eoi != DEAD {
+ if printed {
+ write!(f, ", ")?;
+ }
+ write!(f, "EOI => {:?}", eoi)?;
+ }
+ Ok(())
+ }
+}
+
+/// A representation of a mutable sparse DFA state that can be cheaply
+/// materialized from a state identifier.
+#[cfg(feature = "alloc")]
+struct StateMut<'a> {
+ /// The identifier of this state.
+ id: StateID,
+ /// Whether this is a match state or not.
+ is_match: bool,
+ /// The number of transitions in this state.
+ ntrans: usize,
+ /// Pairs of input ranges, where there is one pair for each transition.
+ /// Each pair specifies an inclusive start and end byte range for the
+ /// corresponding transition.
+ input_ranges: &'a mut [u8],
+ /// Transitions to the next state. This slice contains native endian
+ /// encoded state identifiers, with `StateID` as the representation. Thus,
+ /// there are `ntrans * StateID::SIZE` bytes in this slice.
+ next: &'a mut [u8],
+ /// If this is a match state, then this contains the pattern IDs that match
+ /// when the DFA is in this state.
+ ///
+ /// This is a contiguous sequence of 32-bit native endian encoded integers.
+ pattern_ids: &'a [u8],
+ /// An accelerator for this state, if present. If this state has no
+ /// accelerator, then this is an empty slice. When non-empty, this slice
+ /// has length at most 3 and corresponds to the exhaustive set of bytes
+ /// that must be seen in order to transition out of this state.
+ accel: &'a mut [u8],
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> StateMut<'a> {
+ /// Sets the ith transition to the given state.
+ fn set_next_at(&mut self, i: usize, next: StateID) {
+ let start = i * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ bytes::write_state_id::<bytes::NE>(next, &mut self.next[start..end]);
+ }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> fmt::Debug for StateMut<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let state = State {
+ id: self.id,
+ is_match: self.is_match,
+ ntrans: self.ntrans,
+ input_ranges: self.input_ranges,
+ next: self.next,
+ pattern_ids: self.pattern_ids,
+ accel: self.accel,
+ };
+ fmt::Debug::fmt(&state, f)
+ }
+}
+
+/// A binary search routine specialized to a sparse DFA state's transitions.
+/// Specifically, the transitions are defined as a set of pairs of input bytes
+/// that delineate an inclusive range of bytes. If the input byte is in the
+/// range, then the corresponding transition is a match.
+///
+/// This binary search accepts a slice of these pairs and returns the position
+/// of the matching pair (the ith transition), or None if no matching pair
+/// could be found.
+///
+/// Note that this routine is not currently used, since it was observed either
+/// to decrease performance when searching ASCII or to fail to provide enough
+/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
+/// for posterity in case we can find a way to use it.
+///
+/// In theory, we could use the standard library's search routine if we could
+/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
+/// guaranteed to be safe and would thus be UB (since I don't think the
+/// in-memory representation of `(u8, u8)` has been nailed down). One could
+/// define a repr(C) type, but the casting doesn't seem justified.
+#[allow(dead_code)]
+#[inline(always)]
+fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
+ debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
+ debug_assert!(ranges.len() <= 512, "ranges should be short");
+
+ let (mut left, mut right) = (0, ranges.len() / 2);
+ while left < right {
+ let mid = (left + right) / 2;
+ let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
+ if needle < b1 {
+ right = mid;
+ } else if needle > b2 {
+ left = mid + 1;
+ } else {
+ return Some(mid);
+ }
+ }
+ None
+}
diff --git a/vendor/regex-automata-0.2.0/src/dfa/special.rs b/vendor/regex-automata-0.2.0/src/dfa/special.rs
new file mode 100644
index 000000000..3db95a707
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/dfa/special.rs
@@ -0,0 +1,477 @@
+use crate::{
+ dfa::DEAD,
+ util::{
+ bytes::{self, DeserializeError, Endian, SerializeError},
+ id::StateID,
+ },
+};
+
+macro_rules! err {
+ ($msg:expr) => {
+ return Err(DeserializeError::generic($msg));
+ };
+}
+
+// Special represents the identifiers in a DFA that correspond to "special"
+// states. If a state is one or more of the following, then it is considered
+// special:
+//
+// * dead - A non-matching state where all outgoing transitions lead back to
+// itself. There is only one of these, regardless of whether minimization
+// has run. The dead state always has an ID of 0. i.e., it is always the
+// first state in a DFA.
+// * quit - A state that is entered whenever a byte is seen that should cause
+// a DFA to give up and stop searching. This results in a MatchError::Quit
+// error being returned at search time. The default configuration for a DFA
+// has no quit bytes, which means this state is unreachable by default,
+// although it is always present for reasons of implementation simplicity.
+// This state is only reachable when the caller configures the DFA to quit
+// on certain bytes. There is always exactly one of these states and it
+// is always the second state. (Its actual ID depends on the size of the
+// alphabet in dense DFAs, since state IDs are premultiplied in order to
+// allow them to be used directly as indices into the transition table.)
+// * match - An accepting state, i.e., indicative of a match. There may be
+// zero or more of these states.
+// * accelerated - A state where all of its outgoing transitions, except a
+// few, loop back to itself. These states are candidates for acceleration
+// via memchr during search. There may be zero or more of these states.
+// * start - A non-matching state that indicates where the automaton should
+// start during a search. There is always at least one starting state and
+// all are guaranteed to be non-match states. (A start state cannot be a
+// match state because the DFAs in this crate delay all matches by one byte.
+// So every search that finds a match must move through one transition to
+// some other match state, even when searching an empty string.)
+//
+// These are not mutually exclusive categories. Namely, the following
+// overlaps can occur:
+//
+// * {dead, start} - If a DFA can never lead to a match and it is minimized,
+// then it will typically compile to something where all starting IDs point
+// to the DFA's dead state.
+// * {match, accelerated} - It is possible for a match state to have the
+// majority of its transitions loop back to itself, which means it's
+// possible for a match state to be accelerated.
+// * {start, accelerated} - Similarly, it is possible for a start state to be
+// accelerated. Note that it is possible for an accelerated state to be
+// neither a match nor a start state. Also note that just because both match
+// and start states overlap with accelerated states does not mean that
+// match and start states overlap with each other. In fact, they are
+// guaranteed not to overlap.
+//
+// As a special mention, every DFA always has a dead and a quit state, even
+// though from the perspective of the DFA, they are equivalent. (Indeed,
+// minimization special cases them to ensure they don't get merged.) The
+// purpose of keeping them distinct is to use the quit state as a sentinel to
+// distinguish between whether a search finished successfully without finding
+// anything or whether it gave up before finishing.
+//
+// So the main problem we want to solve here is the *fast* detection of
+// whether a state is special or not. And we also want to do this while
+// storing as little extra data as possible. AND we want to be able to
+// quickly determine which categories a state falls into above if it is
+// special.
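+//
+// (As a hedged, concrete preview of the layout described next, using made-up
+// state IDs and ignoring premultiplication: a ten-state DFA might be laid
+// out as 0=dead, 1=quit, 2-4=match, 4-5=accel, 6-7=start, 8-9=non-special,
+// giving special.max = 7, so that "id <= special.max" is the only check
+// needed on the hot path.)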
+//
+// We achieve this by essentially shuffling all special states to the beginning
+// of a DFA. That is, all special states appear before every other non-special
+// state. By representing special states this way, we can determine whether a
+// state is special or not by a single comparison, where special.max is the
+// identifier of the last special state in the DFA:
+//
+// if current_state <= special.max:
+// ... do something with special state
+//
+// The only thing left to do is to determine what kind of special state it is,
+// since what we do next depends on that. Since special states are typically
+// rare, we can afford to do a bit more extra work, but we'd still like this
+// to be as fast as possible. The trick we employ here is to continue
+// shuffling states even within the special state range, such that one
+// contiguous region corresponds to match states, another to start states and
+// an overlapping range to accelerated states. At a high level, our special
+// state detection might look like this (for leftmost searching, where we
+// continue searching even after seeing a match):
+//
+// byte = input[offset]
+// current_state = next_state(current_state, byte)
+// offset += 1
+// if current_state <= special.max:
+// if current_state == 0:
+// # We can never leave a dead state, so this always marks the
+// # end of our search.
+// return last_match
+// if current_state == special.quit_id:
+// # A quit state means we give up. If the DFA has no quit state,
+// # then special.quit_id == 0 == dead, which is handled by the
+// # conditional above.
+// return Err(MatchError::Quit { byte, offset: offset - 1 })
+// if special.min_match <= current_state <= special.max_match:
+// last_match = Some(offset)
+// if special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+// last_match = Some(offset)
+// elif special.min_start <= current_state <= special.max_start:
+// offset = prefilter.find(input, offset)
+// if special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+// elif special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+//
+// There are some small details left out of the logic above. For example,
+// in order to accelerate a state, we need to know which bytes to search for.
+// This in turn implies some extra data we need to store in the DFA. To keep
+// things compact, we would ideally only store
+//
+// N = special.max_accel - special.min_accel + 1
+//
+// items. But state IDs are premultiplied, which means they are not contiguous.
+// So in order to take a state ID and index an array of accelerated structures,
+// we need to do:
+//
+// i = (state_id - special.min_accel) / stride
+//
+// (N.B. 'stride' is always a power of 2, so the above can be implemented via
+// '(state_id - special.min_accel) >> stride2', where 'stride2' is the x in
+// 2^x = stride.)
+//
+// Moreover, some of these specialty categories may be empty. For example,
+// DFAs are not required to have any match states or any accelerated states.
+// In that case, the lower and upper bounds are both set to 0 (the dead state
+// ID) and the first `current_state == 0` check subsumes cases where the
+// ranges are empty.
+//
+// Loop unrolling, if applicable, has also been left out of the logic above.
+//
+// Graphically, the ranges look like this, where asterisks indicate ranges
+// that can be empty. Each 'x' is a state.
+// +// quit +// dead| +// || +// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +// | | | | start | | +// | |-------------| |-------| | +// | match* | | | | +// | | | | | +// | |----------| | | +// | accel* | | +// | | | +// | | | +// |----------------------------|------------------------ +// special non-special* +#[derive(Clone, Copy, Debug)] +pub struct Special { + /// The identifier of the last special state in a DFA. A state is special + /// if and only if its identifier is less than or equal to `max`. + pub max: StateID, + /// The identifier of the quit state in a DFA. (There is no analogous field + /// for the dead state since the dead state's ID is always zero, regardless + /// of state ID size.) + pub quit_id: StateID, + /// The identifier of the first match state. + pub min_match: StateID, + /// The identifier of the last match state. + pub max_match: StateID, + /// The identifier of the first accelerated state. + pub min_accel: StateID, + /// The identifier of the last accelerated state. + pub max_accel: StateID, + /// The identifier of the first start state. + pub min_start: StateID, + /// The identifier of the last start state. + pub max_start: StateID, +} + +impl Special { + /// Creates a new set of special ranges for a DFA. All ranges are initially + /// set to only contain the dead state. This is interpreted as an empty + /// range. + #[cfg(feature = "alloc")] + pub fn new() -> Special { + Special { + max: DEAD, + quit_id: DEAD, + min_match: DEAD, + max_match: DEAD, + min_accel: DEAD, + max_accel: DEAD, + min_start: DEAD, + max_start: DEAD, + } + } + + /// Remaps all of the special state identifiers using the function given. + #[cfg(feature = "alloc")] + pub fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special { + Special { + max: map(self.max), + quit_id: map(self.quit_id), + min_match: map(self.min_match), + max_match: map(self.max_match), + min_accel: map(self.min_accel), + max_accel: map(self.max_accel), + min_start: map(self.min_start), + max_start: map(self.max_start), + } + } + + /// Deserialize the given bytes into special state ranges. If the slice + /// given is not big enough, then this returns an error. Similarly, if + /// any of the expected invariants around special state ranges aren't + /// upheld, an error is returned. Note that this does not guarantee that + /// the information returned is correct. + /// + /// Upon success, this returns the number of bytes read in addition to the + /// special state IDs themselves. 
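+ ///
+ /// For reference, the serialized form is eight state IDs, read in this
+ /// order: max, quit_id, min_match, max_match, min_accel, max_accel,
+ /// min_start and max_start.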
+ pub fn from_bytes( + mut slice: &[u8], + ) -> Result<(Special, usize), DeserializeError> { + bytes::check_slice_len(slice, 8 * StateID::SIZE, "special states")?; + + let mut nread = 0; + let mut read_id = |what| -> Result<StateID, DeserializeError> { + let (id, nr) = bytes::try_read_state_id(slice, what)?; + nread += nr; + slice = &slice[StateID::SIZE..]; + Ok(id) + }; + + let max = read_id("special max id")?; + let quit_id = read_id("special quit id")?; + let min_match = read_id("special min match id")?; + let max_match = read_id("special max match id")?; + let min_accel = read_id("special min accel id")?; + let max_accel = read_id("special max accel id")?; + let min_start = read_id("special min start id")?; + let max_start = read_id("special max start id")?; + + let special = Special { + max, + quit_id, + min_match, + max_match, + min_accel, + max_accel, + min_start, + max_start, + }; + special.validate()?; + assert_eq!(nread, special.write_to_len()); + Ok((special, nread)) + } + + /// Validate that the information describing special states satisfies + /// all known invariants. + pub fn validate(&self) -> Result<(), DeserializeError> { + // Check that both ends of the range are DEAD or neither are. + if self.min_match == DEAD && self.max_match != DEAD { + err!("min_match is DEAD, but max_match is not"); + } + if self.min_match != DEAD && self.max_match == DEAD { + err!("max_match is DEAD, but min_match is not"); + } + if self.min_accel == DEAD && self.max_accel != DEAD { + err!("min_accel is DEAD, but max_accel is not"); + } + if self.min_accel != DEAD && self.max_accel == DEAD { + err!("max_accel is DEAD, but min_accel is not"); + } + if self.min_start == DEAD && self.max_start != DEAD { + err!("min_start is DEAD, but max_start is not"); + } + if self.min_start != DEAD && self.max_start == DEAD { + err!("max_start is DEAD, but min_start is not"); + } + + // Check that ranges are well formed. + if self.min_match > self.max_match { + err!("min_match should not be greater than max_match"); + } + if self.min_accel > self.max_accel { + err!("min_accel should not be greater than max_accel"); + } + if self.min_start > self.max_start { + err!("min_start should not be greater than max_start"); + } + + // Check that ranges are ordered with respect to one another. + if self.matches() && self.quit_id >= self.min_match { + err!("quit_id should not be greater than min_match"); + } + if self.accels() && self.quit_id >= self.min_accel { + err!("quit_id should not be greater than min_accel"); + } + if self.starts() && self.quit_id >= self.min_start { + err!("quit_id should not be greater than min_start"); + } + if self.matches() && self.accels() && self.min_accel < self.min_match { + err!("min_match should not be greater than min_accel"); + } + if self.matches() && self.starts() && self.min_start < self.min_match { + err!("min_match should not be greater than min_start"); + } + if self.accels() && self.starts() && self.min_start < self.min_accel { + err!("min_accel should not be greater than min_start"); + } + + // Check that max is at least as big as everything else. 
+ if self.max < self.quit_id { + err!("quit_id should not be greater than max"); + } + if self.max < self.max_match { + err!("max_match should not be greater than max"); + } + if self.max < self.max_accel { + err!("max_accel should not be greater than max"); + } + if self.max < self.max_start { + err!("max_start should not be greater than max"); + } + + Ok(()) + } + + /// Validate that the special state information is compatible with the + /// given state count. + pub fn validate_state_count( + &self, + count: usize, + stride2: usize, + ) -> Result<(), DeserializeError> { + // We assume that 'validate' has already passed, so we know that 'max' + // is truly the max. So all we need to check is that the max state + // ID is less than the state ID count. The max legal value here is + // count-1, which occurs when there are no non-special states. + if (self.max.as_usize() >> stride2) >= count { + err!("max should not be greater than or equal to state count"); + } + Ok(()) + } + + /// Write the IDs and ranges for special states to the given byte buffer. + /// The buffer given must have enough room to store all data, otherwise + /// this will return an error. The number of bytes written is returned + /// on success. The number of bytes written is guaranteed to be a multiple + /// of 8. + pub fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + use crate::util::bytes::write_state_id as write; + + if dst.len() < self.write_to_len() { + return Err(SerializeError::buffer_too_small("special state ids")); + } + + let mut nwrite = 0; + nwrite += write::<E>(self.max, &mut dst[nwrite..]); + nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]); + nwrite += write::<E>(self.min_match, &mut dst[nwrite..]); + nwrite += write::<E>(self.max_match, &mut dst[nwrite..]); + nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]); + nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]); + nwrite += write::<E>(self.min_start, &mut dst[nwrite..]); + nwrite += write::<E>(self.max_start, &mut dst[nwrite..]); + + assert_eq!( + self.write_to_len(), + nwrite, + "expected to write certain number of bytes", + ); + assert_eq!( + nwrite % 8, + 0, + "expected to write multiple of 8 bytes for special states", + ); + Ok(nwrite) + } + + /// Returns the total number of bytes written by `write_to`. + pub fn write_to_len(&self) -> usize { + 8 * StateID::SIZE + } + + /// Sets the maximum special state ID based on the current values. This + /// should be used once all possible state IDs are set. + #[cfg(feature = "alloc")] + pub fn set_max(&mut self) { + use core::cmp::max; + self.max = max( + self.quit_id, + max(self.max_match, max(self.max_accel, self.max_start)), + ); + } + + /// Returns true if and only if the given state ID is a special state. + #[inline] + pub fn is_special_state(&self, id: StateID) -> bool { + id <= self.max + } + + /// Returns true if and only if the given state ID is a dead state. + #[inline] + pub fn is_dead_state(&self, id: StateID) -> bool { + id == DEAD + } + + /// Returns true if and only if the given state ID is a quit state. + #[inline] + pub fn is_quit_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.quit_id == id + } + + /// Returns true if and only if the given state ID is a match state. + #[inline] + pub fn is_match_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match + } + + /// Returns true if and only if the given state ID is an accel state. 
+ #[inline] + pub fn is_accel_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel + } + + /// Returns true if and only if the given state ID is a start state. + #[inline] + pub fn is_start_state(&self, id: StateID) -> bool { + !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start + } + + /// Returns the total number of match states for a dense table based DFA. + #[inline] + pub fn match_len(&self, stride: usize) -> usize { + if self.matches() { + (self.max_match.as_usize() - self.min_match.as_usize() + stride) + / stride + } else { + 0 + } + } + + /// Returns true if and only if there is at least one match state. + #[inline] + pub fn matches(&self) -> bool { + self.min_match != DEAD + } + + /// Returns the total number of accel states. + #[cfg(feature = "alloc")] + pub fn accel_len(&self, stride: usize) -> usize { + if self.accels() { + (self.max_accel.as_usize() - self.min_accel.as_usize() + stride) + / stride + } else { + 0 + } + } + + /// Returns true if and only if there is at least one accel state. + #[inline] + pub fn accels(&self) -> bool { + self.min_accel != DEAD + } + + /// Returns true if and only if there is at least one start state. + #[inline] + pub fn starts(&self) -> bool { + self.min_start != DEAD + } +} diff --git a/vendor/regex-automata-0.2.0/src/dfa/transducer.rs b/vendor/regex-automata-0.2.0/src/dfa/transducer.rs new file mode 100644 index 000000000..58b34e00a --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/dfa/transducer.rs @@ -0,0 +1,207 @@ +use crate::{ + dfa::{automaton::Automaton, dense, sparse}, + util::id::StateID, +}; + +impl<T: AsRef<[u32]>> fst::Automaton for dense::DFA<T> { + type State = StateID; + + #[inline] + fn start(&self) -> StateID { + self.start_state_forward(None, &[], 0, 0) + } + + #[inline] + fn is_match(&self, state: &StateID) -> bool { + self.is_match_state(*state) + } + + #[inline] + fn accept(&self, state: &StateID, byte: u8) -> StateID { + if fst::Automaton::is_match(self, state) { + return *state; + } + self.next_state(*state, byte) + } + + #[inline] + fn accept_eof(&self, state: &StateID) -> Option<StateID> { + if fst::Automaton::is_match(self, state) { + return Some(*state); + } + Some(self.next_eoi_state(*state)) + } + + #[inline] + fn can_match(&self, state: &StateID) -> bool { + !self.is_dead_state(*state) + } +} + +impl<T: AsRef<[u8]>> fst::Automaton for sparse::DFA<T> { + type State = StateID; + + #[inline] + fn start(&self) -> StateID { + self.start_state_forward(None, &[], 0, 0) + } + + #[inline] + fn is_match(&self, state: &StateID) -> bool { + self.is_match_state(*state) + } + + #[inline] + fn accept(&self, state: &StateID, byte: u8) -> StateID { + if fst::Automaton::is_match(self, state) { + return *state; + } + self.next_state(*state, byte) + } + + #[inline] + fn accept_eof(&self, state: &StateID) -> Option<StateID> { + if fst::Automaton::is_match(self, state) { + return Some(*state); + } + Some(self.next_eoi_state(*state)) + } + + #[inline] + fn can_match(&self, state: &StateID) -> bool { + !self.is_dead_state(*state) + } +} + +#[cfg(test)] +mod tests { + use bstr::BString; + use fst::{Automaton, IntoStreamer, Set, Streamer}; + + use crate::dfa::{dense, sparse}; + + fn search<A: Automaton, D: AsRef<[u8]>>( + set: &Set<D>, + aut: A, + ) -> Vec<BString> { + let mut stream = set.search(aut).into_stream(); + + let mut results = vec![]; + while let Some(key) = stream.next() { + results.push(BString::from(key)); + } + results + } + + #[test] + fn 
dense_anywhere() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::DFA::new("ba.*").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); + } + + #[test] + fn dense_anchored() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::Builder::new() + .configure(dense::Config::new().anchored(true)) + .build("ba.*") + .unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz"]); + } + + #[test] + fn dense_assertions_start() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::Builder::new().build("^ba.*").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz"]); + } + + #[test] + fn dense_assertions_end() { + let set = + Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::Builder::new().build(".*x$").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bax", "xbax"]); + } + + #[test] + fn dense_assertions_word() { + let set = + Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); + let dfa = dense::Builder::new().build(r"(?-u)\bfoo\b").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["foo", "zzz foo zzz"]); + } + + #[test] + fn sparse_anywhere() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = sparse::DFA::new("ba.*").unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); + } + + #[test] + fn sparse_anchored() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = dense::Builder::new() + .configure(dense::Config::new().anchored(true)) + .build("ba.*") + .unwrap() + .to_sparse() + .unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz"]); + } + + #[test] + fn sparse_assertions_start() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = + dense::Builder::new().build("^ba.*").unwrap().to_sparse().unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bar", "baz"]); + } + + #[test] + fn sparse_assertions_end() { + let set = + Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = + dense::Builder::new().build(".*x$").unwrap().to_sparse().unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["bax", "xbax"]); + } + + #[test] + fn sparse_assertions_word() { + let set = + Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); + let dfa = dense::Builder::new() + .build(r"(?-u)\bfoo\b") + .unwrap() + .to_sparse() + .unwrap(); + let got = search(&set, &dfa); + assert_eq!(got, vec!["foo", "zzz foo zzz"]); + } +} diff --git a/vendor/regex-automata-0.2.0/src/hybrid/dfa.rs b/vendor/regex-automata-0.2.0/src/hybrid/dfa.rs new file mode 100644 index 000000000..1fbce5f5f --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/hybrid/dfa.rs @@ -0,0 +1,3817 @@ +/*! +Types and routines specific to lazy DFAs. + +This module is the home of [`hybrid::dfa::DFA`](DFA). + +This module also contains a [`hybrid::dfa::Builder`](Builder) and a +[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA. 
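+
+A minimal usage sketch (mirroring the examples on [`DFA`] below):
+
+```
+use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+
+let dfa = DFA::new("foo[0-9]+")?;
+let mut cache = dfa.create_cache();
+
+let expected = Some(HalfMatch::must(0, 8));
+assert_eq!(expected, dfa.find_leftmost_fwd(&mut cache, b"foo12345")?);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```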
+*/
+
+use core::{borrow::Borrow, iter, mem::size_of};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::{
+ hybrid::{
+ error::{BuildError, CacheError},
+ id::{LazyStateID, LazyStateIDError, OverlappingState},
+ search,
+ },
+ nfa::thompson,
+ util::{
+ alphabet::{self, ByteClasses, ByteSet},
+ determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
+ id::{PatternID, StateID as NFAStateID},
+ matchtypes::{HalfMatch, MatchError, MatchKind},
+ prefilter,
+ sparse_set::SparseSets,
+ start::Start,
+ },
+};
+
+/// The minimum number of states that a lazy DFA's cache size must support.
+///
+/// This is checked at time of construction to ensure that at least some small
+/// number of states can fit in the given capacity allotment. If we can't fit
+/// at least this number of states, then the thinking is that it's pretty
+/// senseless to use the lazy DFA. More to the point, parts of the code do
+/// assume that the cache can fit at least some small number of states.
+const MIN_STATES: usize = 5;
+
+/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
+///
+/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
+/// very similar characteristics to a [`dense::DFA`](crate::dfa::dense::DFA).
+/// Indeed, both support precisely the same regex features with precisely the
+/// same semantics.
+///
+/// Whereas a `dense::DFA` must be completely built to handle any input before
+/// it may be used for search, a lazy DFA starts off effectively empty. During
+/// a search, a lazy DFA will build itself depending on whether it has already
+/// computed the next transition or not. If it has, then it looks a lot like
+/// a `dense::DFA` internally: it does a very fast table based access to find
+/// the next transition. Otherwise, if the state hasn't been computed, then it
+/// does determinization _for that specific transition_ to compute the next DFA
+/// state.
+///
+/// The main selling point of a lazy DFA is that, in practice, it has
+/// the performance profile of a `dense::DFA` without the weakness of it
+/// taking worst case exponential time to build. Indeed, for each byte of
+/// input, the lazy DFA will construct at most one new DFA state. Thus, a
+/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
+/// pattern.len()` and `n ~ haystack.len()`).
+///
+/// The main downsides of a lazy DFA are:
+///
+/// 1. It requires mutable "cache" space during search. This is where the
+/// transition table, among other things, is stored.
+/// 2. In pathological cases (e.g., if the cache is too small), it will run
+/// out of room and either require a bigger cache capacity or will repeatedly
+/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
+/// will tend to be slower than a typical NFA simulation.
+///
+/// # Capabilities
+///
+/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
+/// operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of a match.
+/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
+/// is reported as well.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of
+/// a match, *two* lazy DFAs are required. This functionality is provided by a
+/// [`Regex`](crate::hybrid::regex::Regex).
+///
+/// # Example
+///
+/// This shows how to build a lazy DFA with the default configuration and
Notice how, in contrast to a `dense::DFA`, we must create +/// a cache and pass it to our search routine. +/// +/// ``` +/// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let mut cache = dfa.create_cache(); +/// +/// let expected = Some(HalfMatch::must(0, 8)); +/// assert_eq!(expected, dfa.find_leftmost_fwd(&mut cache, b"foo12345")?); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct DFA { + nfa: Arc<thompson::NFA>, + stride2: usize, + classes: ByteClasses, + quitset: ByteSet, + anchored: bool, + match_kind: MatchKind, + starts_for_each_pattern: bool, + cache_capacity: usize, + minimum_cache_clear_count: Option<usize>, +} + +impl DFA { + /// Parse the given regular expression using a default configuration and + /// return the corresponding lazy DFA. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::new("foo[0-9]+bar")?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!( + /// Some(expected), + /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?, + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new(pattern: &str) -> Result<DFA, BuildError> { + DFA::builder().build(pattern) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding lazy multi-DFA. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(1, 3); + /// assert_eq!( + /// Some(expected), + /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?, + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> { + DFA::builder().build_many(patterns) + } + + /// Create a new lazy DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::always_match()?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"")?); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"foo")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::always_match(); + Builder::new().build_from_nfa(Arc::new(nfa)) + } + + /// Create a new lazy DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::never_match()?; + /// let mut cache = dfa.create_cache(); + /// + /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"")?); + /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"foo")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::never_match(); + Builder::new().build_from_nfa(Arc::new(nfa)) + } + + /// Return a default configuration for a `DFA`. 
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a lazy DFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a lazy DFA that only executes searches
+ /// in anchored mode.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().anchored(true))
+ /// .build(r"[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "abc123xyz".as_bytes();
+ /// assert_eq!(None, re.find_leftmost_fwd(&mut cache, haystack)?);
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 3)),
+ /// re.find_leftmost_fwd(&mut cache, &haystack[3..6])?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a `DFA`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere for lazy DFAs. This includes disabling it for both the
+ /// concrete syntax (e.g., `.` matches any byte and Unicode character
+ /// classes like `\p{Letter}` are not allowed) and for the unanchored
+ /// search prefix. The latter enables the regex to match anywhere in a
+ /// sequence of arbitrary bytes. (Typically, the unanchored search prefix
+ /// will only permit matching valid UTF-8.)
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// nfa::thompson,
+ /// HalfMatch, SyntaxConfig,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .syntax(SyntaxConfig::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(HalfMatch::must(0, 9));
+ /// let got = re.find_leftmost_fwd(&mut cache, haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Create a new cache for this lazy DFA.
+ ///
+ /// The cache returned should only be used for searches for this
+ /// lazy DFA. If you want to reuse the cache for another DFA, then
+ /// you must call [`Cache::reset`] with that DFA (or, equivalently,
+ /// [`DFA::reset_cache`]).
+ pub fn create_cache(&self) -> Cache {
+ Cache::new(self)
+ }
+
+ /// Reset the given cache such that it can be used for searching with this
+ /// lazy DFA (and only this DFA).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different lazy DFA.
+ ///
+ /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+ /// lazy DFA has been configured to "give up" after it has cleared the
+ /// cache a certain number of times.
+ ///
+ /// Any lazy state ID generated by the cache prior to resetting it is
+ /// invalid after the reset.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different DFA.
+ /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa1 = DFA::new(r"\w")?; + /// let dfa2 = DFA::new(r"\W")?; + /// + /// let mut cache = dfa1.create_cache(); + /// assert_eq!( + /// Some(HalfMatch::must(0, 2)), + /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?, + /// ); + /// + /// // Using 'cache' with dfa2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the DFA we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 'dfa1' is also not + /// // allowed. + /// dfa2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(HalfMatch::must(0, 3)), + /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + Lazy::new(self, cache).reset_cache() + } + + /// Returns the total number of patterns compiled into this lazy DFA. + /// + /// In the case of a DFA that contains no patterns, this returns `0`. + /// + /// # Example + /// + /// This example shows the pattern count for a DFA that never matches: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::never_match()?; + /// assert_eq!(dfa.pattern_count(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a DFA that matches at every position: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::always_match()?; + /// assert_eq!(dfa.pattern_count(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a DFA that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(dfa.pattern_count(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_count(&self) -> usize { + self.nfa.pattern_len() + } + + /// Returns a reference to the underlying NFA. + pub fn nfa(&self) -> &Arc<thompson::NFA> { + &self.nfa + } + + /// Returns the stride, as a base-2 exponent, required for these + /// equivalence classes. + /// + /// The stride is always the smallest power of 2 that is greater than or + /// equal to the alphabet length. This is done so that converting between + /// state IDs and indices can be done with shifts alone, which is much + /// faster than integer division. + fn stride2(&self) -> usize { + self.stride2 + } + + /// Returns the total stride for every state in this lazy DFA. This + /// corresponds to the total number of transitions used by each state in + /// this DFA's transition table. + fn stride(&self) -> usize { + 1 << self.stride2() + } + + /// Returns the total number of elements in the alphabet for this + /// transition table. This is always less than or equal to `self.stride()`. + /// It is only equal when the alphabet length is a power of 2. Otherwise, + /// it is always strictly less. + fn alphabet_len(&self) -> usize { + self.classes.alphabet_len() + } + + /// Returns the memory usage, in bytes, of this lazy DFA. + /// + /// This does **not** include the stack size used up by this lazy DFA. To + /// compute that, use `std::mem::size_of::<DFA>()`. This also does + /// not include the size of the `Cache` used. + pub fn memory_usage(&self) -> usize { + // Everything else is on the stack. 
+ self.nfa.memory_usage()
+ }
+}
+
+impl DFA {
+ /// Executes a forward search and returns the end position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state. This is useful for implementing boolean `is_match`-like
+ /// routines, where as little work as possible is done.
+ ///
+ /// See [`DFA::find_earliest_fwd_at`] for additional functionality, such as
+ /// providing a prefilter, a specific pattern to match and the bounds of
+ /// the search within the haystack. This routine is meant as a convenience
+ /// for common cases where the additional functionality is not needed.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates how the position returned might differ from
+ /// what one might expect when executing a traditional leftmost search.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 8,
+ /// // corresponding to the end of the input. But the "earliest" semantics
+ /// // of this routine cause it to stop as soon as a match is known, which
+ /// // occurs once 'foo[0-9]' has matched.
+ /// let expected = HalfMatch::must(0, 4);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_earliest_fwd(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// let dfa = DFA::new("abc|a")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 3,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let expected = HalfMatch::must(0, 1);
+ /// assert_eq!(Some(expected), dfa.find_earliest_fwd(&mut cache, b"abc")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_earliest_fwd(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_earliest_fwd_at(cache, None, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a reverse search and returns the start position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state.
+ ///
+ /// Note that while it is not technically necessary to build a reverse
+ /// automaton to use a reverse search, it is likely that you'll want to do
+ /// so. Namely, the typical use of a reverse search is to find the starting
+ /// location of a match once its end is discovered from a forward search. A
+ /// reverse DFA automaton can be built by configuring the intermediate NFA
+ /// to be reversed via
+ /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete.
For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example demonstrates how the position returned might differ from + /// what one might expect when executing a traditional leftmost reverse + /// search. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch}; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("[a-z]+[0-9]+")?; + /// let mut cache = dfa.create_cache(); + /// // Normally, the end of the leftmost first match here would be 0, + /// // corresponding to the beginning of the input. But the "earliest" + /// // semantics of this routine cause it to stop as soon as a match is + /// // known, which occurs once '[a-z][0-9]+' has matched. + /// let expected = HalfMatch::must(0, 2); + /// assert_eq!( + /// Some(expected), + /// dfa.find_earliest_rev(&mut cache, b"foo12345")?, + /// ); + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// let mut cache = dfa.create_cache(); + /// // Normally, the end of the leftmost first match here would be 0, + /// // but the shortest match semantics detect a match earlier. + /// let expected = HalfMatch::must(0, 2); + /// assert_eq!(Some(expected), dfa.find_earliest_rev(&mut cache, b"abc")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_earliest_rev( + &self, + cache: &mut Cache, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_earliest_rev_at(cache, None, bytes, 0, bytes.len()) + } + + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// In particular, this method continues searching even after it enters + /// a match state. The search only terminates once it has reached the + /// end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. 
(This crate does not currently support
+ /// leftmost longest semantics.)
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_fwd(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = DFA::new("abc|a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 3);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_leftmost_fwd(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_leftmost_fwd_at(cache, None, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a reverse search and returns the start position of the
+ /// leftmost match that is found. If no match exists, then `None` is
+ /// returned.
+ ///
+ /// In particular, this method continues searching even after it enters
+ /// a match state. The search only terminates once it has reached the
+ /// end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// In particular, this routine is principally
+ /// useful when used in conjunction with the
+ /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+ /// configuration. In general, it's unlikely to be correct to use both
+ /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since
+ /// any particular DFA will only support searching in one direction with
+ /// respect to the pattern.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson, hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_rev(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// // Even though a match is found after reading the last byte (`c`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_rev(&mut cache, b"abc")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_leftmost_rev( + &self, + cache: &mut Cache, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_leftmost_rev_at(cache, None, bytes, 0, bytes.len()) + } + + /// Executes an overlapping forward search and returns the end position of + /// matches as they are found. If no match exists, then `None` is returned. + /// + /// This routine is principally only useful when searching for multiple + /// patterns on inputs where multiple patterns may match the same regions + /// of text. In particular, callers must preserve the automaton's search + /// state from prior calls so that the implementation knows where the last + /// match occurred. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to run a basic overlapping search. Notice + /// that we build the automaton with a `MatchKind::All` configuration. + /// Overlapping searches are unlikely to work as one would expect when + /// using the default `MatchKind::LeftmostFirst` match semantics, since + /// leftmost-first matching is fundamentally incompatible with overlapping + /// searches. Namely, overlapping searches need to report matches as they + /// are seen, where as leftmost-first searches will continue searching even + /// after a match has been observed in order to find the conventional end + /// position of the match. More concretely, leftmost-first searches use + /// dead states to terminate a search after a specific match can no longer + /// be extended. Overlapping searches instead do the opposite by continuing + /// the search to find totally new matches (potentially of other patterns). + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, OverlappingState}, + /// HalfMatch, + /// MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. 
+ /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_overlapping_fwd( + &self, + cache: &mut Cache, + bytes: &[u8], + state: &mut OverlappingState, + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_overlapping_fwd_at( + cache, + None, + None, + bytes, + 0, + bytes.len(), + state, + ) + } + + /// Executes a forward search and returns the end position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. This is useful for implementing boolean `is_match`-like + /// routines, where as little work is done as possible. + /// + /// This is like [`DFA::find_earliest_fwd`], except it provides some + /// additional control over how the search is executed: + /// + /// * `pre` is a prefilter scanner that, when given, is used whenever the + /// DFA enters its starting state. This is meant to speed up searches where + /// one or a small number of literal prefixes are known. + /// * `pattern_id` specifies a specific pattern in the DFA to run an + /// anchored search for. If not given, then a search for any pattern is + /// performed. For lazy DFAs, [`Config::starts_for_each_pattern`] must be + /// enabled to use this functionality. + /// * `start` and `end` permit searching a specific region of the haystack + /// `bytes`. This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `bytes`. (Because the existence of look-around + /// operations such as `\b`, `^` and `$` need to take the surrounding + /// context into account. This cannot be done if the haystack doesn't + /// contain it.) + /// + /// The examples below demonstrate each of these additional parameters. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine panics if a `pattern_id` is given and this lazy DFA does + /// not support specific pattern searches. + /// + /// It also panics if the given haystack range is not valid. + /// + /// # Example: prefilter + /// + /// This example shows how to provide a prefilter for a pattern where all + /// matches start with a `z` byte. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::prefilter::{Candidate, Prefilter, Scanner, State}, + /// HalfMatch, + /// }; + /// + /// #[derive(Debug)] + /// pub struct ZPrefilter; + /// + /// impl Prefilter for ZPrefilter { + /// fn next_candidate( + /// &self, + /// _: &mut State, + /// haystack: &[u8], + /// at: usize, + /// ) -> Candidate { + /// // Try changing b'z' to b'q' and observe this test fail since + /// // the prefilter will skip right over the match. 
+ /// match haystack.iter().position(|&b| b == b'z') { + /// None => Candidate::None, + /// Some(i) => Candidate::PossibleStartOfMatch(at + i), + /// } + /// } + /// + /// fn heap_bytes(&self) -> usize { + /// 0 + /// } + /// } + /// + /// let dfa = DFA::new("z[0-9]{3}")?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "foobar z123 q123".as_bytes(); + /// // A scanner executes a prefilter while tracking some state that helps + /// // determine whether a prefilter is still "effective" or not. + /// let mut scanner = Scanner::new(&ZPrefilter); + /// + /// let expected = Some(HalfMatch::must(0, 11)); + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// Some(&mut scanner), + /// None, + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a lazy multi-DFA that permits searching + /// for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// HalfMatch, + /// PatternID, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "foo123".as_bytes(); + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// None, + /// None, + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(HalfMatch::must(1, 6)); + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// None, + /// Some(PatternID::must(1)), + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// // N.B. We disable Unicode here so that we use a simple ASCII word + /// // boundary. Alternatively, we could enable heuristic support for + /// // Unicode word boundaries since our haystack is pure ASCII. + /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "foo123bar".as_bytes(); + /// + /// // Since we sub-slice the haystack, the search doesn't know about the + /// // larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `3` instead of `6`. 
+ /// let expected = Some(HalfMatch::must(0, 3)); + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// None, + /// None, + /// &haystack[3..6], + /// 0, + /// haystack[3..6].len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// None, + /// None, + /// haystack, + /// 3, + /// 6, + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_earliest_fwd_at( + &self, + cache: &mut Cache, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_earliest_fwd( + pre, self, cache, pattern_id, bytes, start, end, + ) + } + + /// Executes a reverse search and returns the start position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. + /// + /// This is like [`DFA::find_earliest_rev`], except it provides some + /// additional control over how the search is executed. See the + /// documentation of [`DFA::find_earliest_fwd_at`] for more details + /// on the additional parameters along with examples of their usage. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine panics if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It also panics if the given haystack range is not valid. + #[inline] + pub fn find_earliest_rev_at( + &self, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_earliest_rev(self, cache, pattern_id, bytes, start, end) + } + + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// This is like [`DFA::find_leftmost_fwd`], except it provides some + /// additional control over how the search is executed. See the + /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the + /// additional parameters along with examples of their usage. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. 
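+    ///
+    /// For a quick sketch of what the error case can look like (the quit
+    /// byte, pattern and haystack below are chosen purely for illustration),
+    /// a lazy DFA configured with a quit byte returns an error instead of a
+    /// match result as soon as it sees that byte:
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, MatchError};
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().quit(b'\n', true))
+    ///     .build("foo")?;
+    /// let mut cache = dfa.create_cache();
+    /// // The search gives up upon seeing the quit byte at offset 5, so it
+    /// // cannot know whether 'foo' matches later in the haystack or not.
+    /// let expected = MatchError::Quit { byte: b'\n', offset: 5 };
+    /// let got = dfa.find_leftmost_fwd_at(
+    ///     &mut cache, None, None, b"hello\nfoo", 0, 9,
+    /// );
+    /// assert_eq!(Err(expected), got);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```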
+    ///
+    /// # Panics
+    ///
+    /// This routine panics if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It also panics if the given haystack range is not valid.
+    #[inline]
+    pub fn find_leftmost_fwd_at(
+        &self,
+        cache: &mut Cache,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_leftmost_fwd(
+            pre, self, cache, pattern_id, bytes, start, end,
+        )
+    }
+
+    /// Executes a reverse search and returns the start position of the
+    /// leftmost match that is found. If no match exists, then `None` is
+    /// returned.
+    ///
+    /// This is like [`DFA::find_leftmost_rev`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the
+    /// additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine panics if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It also panics if the given haystack range is not valid.
+    #[inline]
+    pub fn find_leftmost_rev_at(
+        &self,
+        cache: &mut Cache,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_leftmost_rev(self, cache, pattern_id, bytes, start, end)
+    }
+
+    /// Executes an overlapping forward search and returns the end position of
+    /// matches as they are found. If no match exists, then `None` is returned.
+    ///
+    /// This routine is principally only useful when searching for multiple
+    /// patterns on inputs where multiple patterns may match the same regions
+    /// of text. In particular, callers must preserve the automaton's search
+    /// state from prior calls so that the implementation knows where the last
+    /// match occurred.
+    ///
+    /// This is like [`DFA::find_overlapping_fwd`], except it provides
+    /// some additional control over how the search is executed. See the
+    /// documentation of [`DFA::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// When using this routine to implement an iterator of overlapping
+    /// matches, the `start` of the search should always be set to the end
+    /// of the last match. If more patterns match at the previous location,
+    /// then they will be immediately returned. (This is tracked by the given
+    /// overlapping state.) Otherwise, the search continues at the starting
+    /// position given.
+    ///
+    /// If for some reason you want the search to forget about its previous
+    /// state and restart the search at a particular position, then setting the
+    /// state to [`OverlappingState::start`] will accomplish that.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine panics if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It also panics if the given haystack range is not valid.
+    #[inline]
+    pub fn find_overlapping_fwd_at(
+        &self,
+        cache: &mut Cache,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_overlapping_fwd(
+            pre, self, cache, pattern_id, bytes, start, end, state,
+        )
+    }
+}
+
+impl DFA {
+    /// Transitions from the current state to the next state, given the next
+    /// byte of input.
+    ///
+    /// The given cache is used to either reuse pre-computed state
+    /// transitions, or to store this newly computed transition for future
+    /// reuse. Thus, this routine guarantees that it will never return a state
+    /// ID that has an "unknown" tag.
+    ///
+    /// # State identifier validity
+    ///
+    /// The only valid value for `current` is the lazy state ID returned
+    /// by the most recent call to `next_state`, `next_state_untagged`,
+    /// `next_state_untagged_unchecked`, `start_state_forward` or
+    /// `start_state_reverse` for the given `cache`. Any state ID returned from
+    /// prior calls to these routines (with the same `cache`) is considered
+    /// invalid (even if it gives an appearance of working). State IDs returned
+    /// from _any_ prior call for different `cache` values are also always
+    /// invalid.
+    ///
+    /// The returned ID is always a valid ID when `current` refers to a valid
+    /// ID. Moreover, this routine is defined for all possible values of
+    /// `input`.
+    ///
+    /// These validity rules are not checked, even in debug mode. Callers are
+    /// required to uphold these rules themselves.
+    ///
+    /// Violating these state ID validity rules will not sacrifice memory
+    /// safety, but _may_ produce an incorrect result or a panic.
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic but it also may not panic and instead return an invalid or
+    /// incorrect ID.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a lazy DFA for a given
+    /// haystack by using the `next_state` method.
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::new(r"[a-z]+r")?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut sid = dfa.start_state_forward(
+    ///     &mut cache, None, haystack, 0, haystack.len(),
+    /// )?;
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     sid = dfa.next_state(&mut cache, sid, b)?;
+    /// }
+    /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+    /// // special "EOI" transition at the end of the search.
+    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+    /// assert!(sid.is_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn next_state(
+        &self,
+        cache: &mut Cache,
+        current: LazyStateID,
+        input: u8,
+    ) -> Result<LazyStateID, CacheError> {
+        let class = usize::from(self.classes.get(input));
+        let offset = current.as_usize_untagged() + class;
+        let sid = cache.trans[offset];
+        if !sid.is_unknown() {
+            return Ok(sid);
+        }
+        let unit = alphabet::Unit::u8(input);
+        Lazy::new(self, cache).cache_next_state(current, unit)
+    }
+
+    /// Transitions from the current state to the next state, given the next
+    /// byte of input and a state ID that is not tagged.
+    ///
+    /// The only reason to use this routine is performance. In particular, the
+    /// `next_state` method needs to do some additional checks, among them is
+    /// to account for identifiers to states that are not yet computed. In
+    /// such a case, the transition is computed on the fly. However, if it is
+    /// known that the `current` state ID is untagged, then these checks can be
+    /// omitted.
+    ///
+    /// Since this routine does not compute states on the fly, it does not
+    /// modify the cache and thus cannot return an error. Consequently, `cache`
+    /// does not need to be mutable and it is possible for this routine to
+    /// return a state ID corresponding to the special "unknown" state. In
+    /// this case, it is the caller's responsibility to use the prior state
+    /// ID and `input` with `next_state` in order to force the computation of
+    /// the unknown transition. Otherwise, trying to use the "unknown" state
+    /// ID will just result in transitioning back to itself, and thus never
+    /// terminating. (This is technically a special exemption to the state ID
+    /// validity rules, but is permissible since this routine is guaranteed to
+    /// never mutate the given `cache`, and thus the identifier is guaranteed
+    /// to remain valid.)
+    ///
+    /// See [`LazyStateID`] for more details on what it means for a state ID
+    /// to be tagged. Also, see
+    /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
+    /// for this same idea, but with bounds checks forcefully elided.
+    ///
+    /// # State identifier validity
+    ///
+    /// The only valid value for `current` is an **untagged** lazy
+    /// state ID returned by the most recent call to `next_state`,
+    /// `next_state_untagged`, `next_state_untagged_unchecked`,
+    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+    /// Any state ID returned from prior calls to these routines (with the
+    /// same `cache`) is considered invalid (even if it gives an appearance
+    /// of working). State IDs returned from _any_ prior call for different
+    /// `cache` values are also always invalid.
+    ///
+    /// The returned ID is always a valid ID when `current` refers to a valid
+    /// ID, although it may be tagged. Moreover, this routine is defined for
+    /// all possible values of `input`.
+    ///
+    /// Not all validity rules are checked, even in debug mode. Callers are
+    /// required to uphold these rules themselves.
+    ///
+    /// Violating these state ID validity rules will not sacrifice memory
+    /// safety, but _may_ produce an incorrect result or a panic.
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic but it also may not panic and instead return an invalid or
+    /// incorrect ID.
+ /// + /// # Example + /// + /// This shows a simplistic example for walking a lazy DFA for a given + /// haystack by using the `next_state_untagged` method where possible. + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::new(r"[a-z]+r")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut sid = dfa.start_state_forward( + /// &mut cache, None, haystack, 0, haystack.len(), + /// )?; + /// // Walk all the bytes in the haystack. + /// let mut at = 0; + /// while at < haystack.len() { + /// if sid.is_tagged() { + /// sid = dfa.next_state(&mut cache, sid, haystack[at])?; + /// } else { + /// let mut prev_sid = sid; + /// // We attempt to chew through as much as we can while moving + /// // through untagged state IDs. Thus, the transition function + /// // does less work on average per byte. (Unrolling this loop + /// // may help even more.) + /// while at < haystack.len() { + /// prev_sid = sid; + /// sid = dfa.next_state_untagged( + /// &mut cache, sid, haystack[at], + /// ); + /// at += 1; + /// if sid.is_tagged() { + /// break; + /// } + /// } + /// // We must ensure that we never proceed to the next iteration + /// // with an unknown state ID. If we don't account for this + /// // case, then search isn't guaranteed to terminate since all + /// // transitions on unknown states loop back to itself. + /// if sid.is_unknown() { + /// sid = dfa.next_state( + /// &mut cache, prev_sid, haystack[at - 1], + /// )?; + /// } + /// } + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk the + /// // special "EOI" transition at the end of the search. + /// sid = dfa.next_eoi_state(&mut cache, sid)?; + /// assert!(sid.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn next_state_untagged( + &self, + cache: &Cache, + current: LazyStateID, + input: u8, + ) -> LazyStateID { + debug_assert!(!current.is_tagged()); + let class = usize::from(self.classes.get(input)); + let offset = current.as_usize_unchecked() + class; + cache.trans[offset] + } + + /// Transitions from the current state to the next state, eliding bounds + /// checks, given the next byte of input and a state ID that is not tagged. + /// + /// The only reason to use this routine is performance. In particular, the + /// `next_state` method needs to do some additional checks, among them is + /// to account for identifiers to states that are not yet computed. In + /// such a case, the transition is computed on the fly. However, if it is + /// known that the `current` state ID is untagged, then these checks can be + /// omitted. + /// + /// Since this routine does not compute states on the fly, it does not + /// modify the cache and thus cannot return an error. Consequently, `cache` + /// does not need to be mutable and it is possible for this routine to + /// return a state ID corresponding to the special "unknown" state. In + /// this case, it is the caller's responsibility to use the prior state + /// ID and `input` with `next_state` in order to force the computation of + /// the unknown transition. Otherwise, trying to use the "unknown" state + /// ID will just result in transitioning back to itself, and thus never + /// terminating. 
(This is technically a special exemption to the state ID
+    /// validity rules, but is permissible since this routine is guaranteed to
+    /// never mutate the given `cache`, and thus the identifier is guaranteed
+    /// to remain valid.)
+    ///
+    /// See [`LazyStateID`] for more details on what it means for a state ID
+    /// to be tagged. Also, see
+    /// [`next_state_untagged`](DFA::next_state_untagged)
+    /// for this same idea, but with memory safety guaranteed by retaining
+    /// bounds checks.
+    ///
+    /// # State identifier validity
+    ///
+    /// The only valid value for `current` is an **untagged** lazy
+    /// state ID returned by the most recent call to `next_state`,
+    /// `next_state_untagged`, `next_state_untagged_unchecked`,
+    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+    /// Any state ID returned from prior calls to these routines (with the
+    /// same `cache`) is considered invalid (even if it gives an appearance
+    /// of working). State IDs returned from _any_ prior call for different
+    /// `cache` values are also always invalid.
+    ///
+    /// The returned ID is always a valid ID when `current` refers to a valid
+    /// ID, although it may be tagged. Moreover, this routine is defined for
+    /// all possible values of `input`.
+    ///
+    /// Not all validity rules are checked, even in debug mode. Callers are
+    /// required to uphold these rules themselves.
+    ///
+    /// Violating these state ID validity rules will not sacrifice memory
+    /// safety, but _may_ produce an incorrect result or a panic.
+    ///
+    /// # Safety
+    ///
+    /// Callers of this method must guarantee that `current` refers to a valid
+    /// state ID according to the rules described above. If `current` is not a
+    /// valid state ID for this automaton, then calling this routine may result
+    /// in undefined behavior.
+    ///
+    /// If `current` is valid, then the ID returned is valid for all possible
+    /// values of `input`.
+    #[inline]
+    pub unsafe fn next_state_untagged_unchecked(
+        &self,
+        cache: &Cache,
+        current: LazyStateID,
+        input: u8,
+    ) -> LazyStateID {
+        debug_assert!(!current.is_tagged());
+        let class = usize::from(self.classes.get(input));
+        let offset = current.as_usize_unchecked() + class;
+        *cache.trans.get_unchecked(offset)
+    }
+
+    /// Transitions from the current state to the next state for the special
+    /// EOI symbol.
+    ///
+    /// The given cache is used to either reuse pre-computed state
+    /// transitions, or to store this newly computed transition for future
+    /// reuse. Thus, this routine guarantees that it will never return a state
+    /// ID that has an "unknown" tag.
+    ///
+    /// This routine must be called at the end of every search in a correct
+    /// implementation of search. Namely, lazy DFAs in this crate delay matches
+    /// by one byte in order to support look-around operators. Thus, after
+    /// reaching the end of a haystack, a search implementation must follow one
+    /// last EOI transition.
+    ///
+    /// It is best to think of EOI as an additional symbol in the alphabet of a
+    /// DFA that is distinct from every other symbol. That is, the alphabet of
+    /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
+    /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+    /// physical alphabet size may be smaller because of alphabet compression
+    /// via equivalence classes, but EOI is always represented somehow in the
+    /// alphabet.)
+    ///
+    /// # State identifier validity
+    ///
+    /// The only valid value for `current` is the lazy state ID returned
+    /// by the most recent call to `next_state`, `next_state_untagged`,
+    /// `next_state_untagged_unchecked`, `start_state_forward` or
+    /// `start_state_reverse` for the given `cache`. Any state ID returned from
+    /// prior calls to these routines (with the same `cache`) is considered
+    /// invalid (even if it gives an appearance of working). State IDs returned
+    /// from _any_ prior call for different `cache` values are also always
+    /// invalid.
+    ///
+    /// The returned ID is always a valid ID when `current` refers to a valid
+    /// ID.
+    ///
+    /// These validity rules are not checked, even in debug mode. Callers are
+    /// required to uphold these rules themselves.
+    ///
+    /// Violating these state ID validity rules will not sacrifice memory
+    /// safety, but _may_ produce an incorrect result or a panic.
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic but it also may not panic and instead return an invalid or
+    /// incorrect ID.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a DFA for a given haystack,
+    /// and then finishing the search with the final EOI transition.
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::new(r"[a-z]+r")?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut sid = dfa.start_state_forward(
+    ///     &mut cache, None, haystack, 0, haystack.len(),
+    /// )?;
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     sid = dfa.next_state(&mut cache, sid, b)?;
+    /// }
+    /// // Matches are always delayed by 1 byte, so we must explicitly walk
+    /// // the special "EOI" transition at the end of the search. Without this
+    /// // final transition, the assert below will fail since the DFA will not
+    /// // have entered a match state yet!
+    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+    /// assert!(sid.is_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn next_eoi_state(
+        &self,
+        cache: &mut Cache,
+        current: LazyStateID,
+    ) -> Result<LazyStateID, CacheError> {
+        let eoi = self.classes.eoi().as_usize();
+        let offset = current.as_usize_untagged() + eoi;
+        let sid = cache.trans[offset];
+        if !sid.is_unknown() {
+            return Ok(sid);
+        }
+        let unit = self.classes.eoi();
+        Lazy::new(self, cache).cache_next_state(current, unit)
+    }
+
+    /// Return the ID of the start state for this lazy DFA when executing a
+    /// forward search.
+    ///
+    /// Unlike typical DFA implementations, the start state for DFAs in this
+    /// crate is dependent on a few different factors:
+    ///
+    /// * The pattern ID, if present. When the underlying DFA has been
+    /// configured with multiple patterns _and_ the DFA has been configured to
+    /// build an anchored start state for each pattern, then a pattern ID may
+    /// be specified to execute an anchored search for that specific pattern.
+    /// If `pattern_id` is invalid or if the DFA isn't configured to build
+    /// start states for each pattern, then implementations must panic. DFAs in
+    /// this crate can be configured to build start states for each pattern via
+    /// [`Config::starts_for_each_pattern`].
+ /// * When `start > 0`, the byte at index `start - 1` may influence the + /// start state if the regex uses `^` or `\b`. + /// * Similarly, when `start == 0`, it may influence the start state when + /// the regex uses `^` or `\A`. + /// * Currently, `end` is unused. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for forward searches. + /// + /// # Panics + /// + /// This panics if `start..end` is not a valid sub-slice of `bytes`. This + /// also panics if `pattern_id` is non-None and does not refer to a valid + /// pattern, or if the DFA was not configured to build anchored start + /// states for each pattern. + #[inline] + pub fn start_state_forward( + &self, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<LazyStateID, CacheError> { + let mut lazy = Lazy::new(self, cache); + let start_type = Start::from_position_fwd(bytes, start, end); + let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type); + if !sid.is_unknown() { + return Ok(sid); + } + lazy.cache_start_group(pattern_id, start_type) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// reverse search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The pattern ID, if present. When the underlying DFA has been + /// configured with multiple patterns _and_ the DFA has been configured to + /// build an anchored start state for each pattern, then a pattern ID may + /// be specified to execute an anchored search for that specific pattern. + /// If `pattern_id` is invalid or if the DFA isn't configured to build + /// start states for each pattern, then implementations must panic. DFAs in + /// this crate can be configured to build start states for each pattern via + /// [`Config::starts_for_each_pattern`]. + /// * When `end < bytes.len()`, the byte at index `end` may influence the + /// start state if the regex uses `$` or `\b`. + /// * Similarly, when `end == bytes.len()`, it may influence the start + /// state when the regex uses `$` or `\z`. + /// * Currently, `start` is unused. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for reverse searches. + /// + /// # Panics + /// + /// This panics if `start..end` is not a valid sub-slice of `bytes`. This + /// also panics if `pattern_id` is non-None and does not refer to a valid + /// pattern, or if the DFA was not configured to build anchored start + /// states for each pattern. + #[inline] + pub fn start_state_reverse( + &self, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<LazyStateID, CacheError> { + let mut lazy = Lazy::new(self, cache); + let start_type = Start::from_position_rev(bytes, start, end); + let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type); + if !sid.is_unknown() { + return Ok(sid); + } + lazy.cache_start_group(pattern_id, start_type) + } + + /// Returns the total number of patterns that match in this state. + /// + /// If the lazy DFA was compiled with one pattern, then this must + /// necessarily always return `1` for all match states. + /// + /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with + /// indices up to (but not including) the count returned by this routine + /// without panicking. 
+ /// + /// If the given state is not a match state, then this may either panic + /// or return an incorrect result. + /// + /// # Example + /// + /// This example shows a simple instance of implementing overlapping + /// matches. In particular, it shows not only how to determine how many + /// patterns have matched in a particular state, but also how to access + /// which specific patterns have matched. + /// + /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All) + /// when building the DFA. If we used + /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst) + /// instead, then the DFA would not be constructed in a way that supports + /// overlapping matches. (It would only report a single pattern that + /// matches at any particular point in time.) + /// + /// Another thing to take note of is the patterns used and the order in + /// which the pattern IDs are reported. In the example below, pattern `3` + /// is yielded first. Why? Because it corresponds to the match that + /// appears first. Namely, the `@` symbol is part of `\S+` but not part + /// of any of the other patterns. Since the `\S+` pattern has a match that + /// starts to the left of any other pattern, its ID is returned before any + /// other. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, MatchKind}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[ + /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+", + /// ])?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "@bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut sid = dfa.start_state_forward( + /// &mut cache, None, haystack, 0, haystack.len(), + /// )?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// sid = dfa.next_state(&mut cache, sid, b)?; + /// } + /// sid = dfa.next_eoi_state(&mut cache, sid)?; + /// + /// assert!(sid.is_match()); + /// assert_eq!(dfa.match_count(&mut cache, sid), 3); + /// // The following calls are guaranteed to not panic since `match_count` + /// // returned `3` above. + /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3); + /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0); + /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn match_count(&self, cache: &Cache, id: LazyStateID) -> usize { + assert!(id.is_match()); + LazyRef::new(self, cache).get_cached_state(id).match_count() + } + + /// Returns the pattern ID corresponding to the given match index in the + /// given state. + /// + /// See [`DFA::match_count`] for an example of how to use this method + /// correctly. Note that if you know your lazy DFA is configured with a + /// single pattern, then this routine is never necessary since it will + /// always return a pattern ID of `0` for an index of `0` when `id` + /// corresponds to a match state. + /// + /// Typically, this routine is used when implementing an overlapping + /// search, as the example for `DFA::match_count` does. + /// + /// # Panics + /// + /// If the state ID is not a match state or if the match index is out + /// of bounds for the given state, then this routine may either panic + /// or produce an incorrect result. If the state ID is correct and the + /// match index is correct, then this routine always produces a valid + /// `PatternID`. 
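+    ///
+    /// # Example
+    ///
+    /// As a brief sketch to complement the worked example in
+    /// [`DFA::match_count`] (the patterns and haystack here are chosen purely
+    /// for illustration), this collects every pattern ID that matches in a
+    /// single match state:
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, MatchKind};
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().match_kind(MatchKind::All))
+    ///     .build_many(&[r"[a-z]+", r"[a-z]{3}"])?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "foo".as_bytes();
+    ///
+    /// // Walk the DFA to its match state, remembering the EOI transition.
+    /// let mut sid = dfa.start_state_forward(
+    ///     &mut cache, None, haystack, 0, haystack.len(),
+    /// )?;
+    /// for &b in haystack {
+    ///     sid = dfa.next_state(&mut cache, sid, b)?;
+    /// }
+    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+    /// assert!(sid.is_match());
+    ///
+    /// // Both patterns match all of "foo", so both pattern IDs are present.
+    /// let mut pids: Vec<usize> = (0..dfa.match_count(&cache, sid))
+    ///     .map(|i| dfa.match_pattern(&cache, sid, i).as_usize())
+    ///     .collect();
+    /// pids.sort();
+    /// assert_eq!(pids, vec![0, 1]);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```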
+ #[inline] + pub fn match_pattern( + &self, + cache: &Cache, + id: LazyStateID, + match_index: usize, + ) -> PatternID { + // This is an optimization for the very common case of a DFA with a + // single pattern. This conditional avoids a somewhat more costly path + // that finds the pattern ID from the corresponding `State`, which + // requires a bit of slicing/pointer-chasing. This optimization tends + // to only matter when matches are frequent. + if self.pattern_count() == 1 { + return PatternID::ZERO; + } + LazyRef::new(self, cache) + .get_cached_state(id) + .match_pattern(match_index) + } +} + +/// A cache represents a partially computed DFA. +/// +/// A cache is the key component that differentiates a classical DFA and a +/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a +/// complete transition table that can handle all possible inputs, a hybrid +/// NFA/DFA starts with an empty transition table and builds only the parts +/// required during search. The parts that are built are stored in a cache. For +/// this reason, a cache is a required parameter for nearly every operation on +/// a [`DFA`]. +/// +/// Caches can be created from their corresponding DFA via +/// [`DFA::create_cache`]. A cache can only be used with either the DFA that +/// created it, or the DFA that was most recently used to reset it with +/// [`Cache::reset`]. Using a cache with any other DFA may result in panics +/// or incorrect results. +#[derive(Clone, Debug)] +pub struct Cache { + // N.B. If you're looking to understand how determinization works, it + // is probably simpler to first grok src/dfa/determinize.rs, since that + // doesn't have the "laziness" component. + /// The transition table. + /// + /// Given a `current` LazyStateID and an `input` byte, the next state can + /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice + /// that no multiplication is used. That's because state identifiers are + /// "premultiplied." + /// + /// Note that the next state may be the "unknown" state. In this case, the + /// next state is not known and determinization for `current` on `input` + /// must be performed. + trans: Vec<LazyStateID>, + /// The starting states for this DFA. + /// + /// These are computed lazily. Initially, these are all set to "unknown" + /// lazy state IDs. + /// + /// When 'starts_for_each_pattern' is disabled (the default), then the size + /// of this is constrained to the possible starting configurations based + /// on the search parameters. (At time of writing, that's 4.) However, + /// when starting states for each pattern is enabled, then there are N + /// additional groups of starting states, where each group reflects the + /// different possible configurations and N is the number of patterns. + starts: Vec<LazyStateID>, + /// A sequence of NFA/DFA powerset states that have been computed for this + /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every + /// tagged LazyStateID can be used to index this sequence by converting it + /// to its untagged form.) + states: Vec<State>, + /// A map from states to their corresponding IDs. This map may be accessed + /// via the raw byte representation of a state, which means that a `State` + /// does not need to be allocated to determine whether it already exists + /// in this map. Indeed, the existence of such a state is what determines + /// whether we allocate a new `State` or not. 
+ /// + /// The higher level idea here is that we do just enough determinization + /// for a state to check whether we've already computed it. If we have, + /// then we can save a little (albeit not much) work. The real savings is + /// in memory usage. If we never checked for trivially duplicate states, + /// then our memory usage would explode to unreasonable levels. + states_to_id: StateMap, + /// Sparse sets used to track which NFA states have been visited during + /// various traversals. + sparses: SparseSets, + /// Scratch space for traversing the NFA graph. (We use space on the heap + /// instead of the call stack.) + stack: Vec<NFAStateID>, + /// Scratch space for building a NFA/DFA powerset state. This is used to + /// help amortize allocation since not every powerset state generated is + /// added to the cache. In particular, if it already exists in the cache, + /// then there is no need to allocate a new `State` for it. + scratch_state_builder: StateBuilderEmpty, + /// A simple abstraction for handling the saving of at most a single state + /// across a cache clearing. This is required for correctness. Namely, if + /// adding a new state after clearing the cache fails, then the caller + /// must retain the ability to continue using the state ID given. The + /// state corresponding to the state ID is what we preserve across cache + /// clearings. + state_saver: StateSaver, + /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We + /// track this as new states are added since states use a variable amount + /// of heap. Tracking this as we add states makes it possible to compute + /// the total amount of memory used by the determinizer in constant time. + memory_usage_state: usize, + /// The number of times the cache has been cleared. When a minimum cache + /// clear count is set, then the cache will return an error instead of + /// clearing the cache if the count has been exceeded. + clear_count: usize, +} + +impl Cache { + /// Create a new cache for the given lazy DFA. + /// + /// The cache returned should only be used for searches for the given DFA. + /// If you want to reuse the cache for another DFA, then you must call + /// [`Cache::reset`] with that DFA. + pub fn new(dfa: &DFA) -> Cache { + let mut cache = Cache { + trans: alloc::vec![], + starts: alloc::vec![], + states: alloc::vec![], + states_to_id: StateMap::new(), + sparses: SparseSets::new(dfa.nfa.len()), + stack: alloc::vec![], + scratch_state_builder: StateBuilderEmpty::new(), + state_saver: StateSaver::none(), + memory_usage_state: 0, + clear_count: 0, + }; + Lazy { dfa, cache: &mut cache }.init_cache(); + cache + } + + /// Reset this cache such that it can be used for searching with the given + /// lazy DFA (and only that DFA). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different lazy DFA. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// lazy DFA has been configured to "give up" after it has cleared the + /// cache a certain number of times. + /// + /// Any lazy state ID generated by the cache prior to resetting it is + /// invalid after the reset. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different DFA. 
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa1 = DFA::new(r"\w")?;
+    /// let dfa2 = DFA::new(r"\W")?;
+    ///
+    /// let mut cache = dfa1.create_cache();
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 2)),
+    ///     dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+    /// );
+    ///
+    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the DFA we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
+    /// // allowed.
+    /// cache.reset(&dfa2);
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 3)),
+    ///     dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset(&mut self, dfa: &DFA) {
+        Lazy::new(dfa, self).reset_cache()
+    }
+
+    /// Returns the total number of times this cache has been cleared since it
+    /// was either created or last reset.
+    ///
+    /// This is useful for informational purposes or if you want to change
+    /// search strategies based on the number of times the cache has been
+    /// cleared.
+    pub fn clear_count(&self) -> usize {
+        self.clear_count
+    }
+
+    /// Returns the heap memory usage, in bytes, of this cache.
+    ///
+    /// This does **not** include the stack size used up by this cache. To
+    /// compute that, use `std::mem::size_of::<Cache>()`.
+    pub fn memory_usage(&self) -> usize {
+        const ID_SIZE: usize = size_of::<LazyStateID>();
+        const STATE_SIZE: usize = size_of::<State>();
+
+        self.trans.len() * ID_SIZE
+            + self.starts.len() * ID_SIZE
+            + self.states.len() * STATE_SIZE
+            // Maps likely use more memory than this, but it's probably close.
+            + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
+            + self.sparses.memory_usage()
+            + self.stack.capacity() * ID_SIZE
+            + self.scratch_state_builder.capacity()
+            // Heap memory used by 'State' in both 'states' and 'states_to_id'.
+            + self.memory_usage_state
+    }
+}
+
+/// A map from states to state identifiers. When using std, we use a standard
+/// hashmap, since it's a bit faster for this use case. (Other maps, like
+/// ones based on FNV, have not yet been benchmarked.)
+///
+/// The main purpose of this map is to reuse states where possible. This won't
+/// fully minimize the DFA, but it works well in a lot of cases.
+#[cfg(feature = "std")]
+type StateMap = std::collections::HashMap<State, LazyStateID>;
+#[cfg(not(feature = "std"))]
+type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
+
+/// A type that groups methods that require the base NFA/DFA and writable
+/// access to the cache.
+#[derive(Debug)]
+struct Lazy<'i, 'c> {
+    dfa: &'i DFA,
+    cache: &'c mut Cache,
+}
+
+impl<'i, 'c> Lazy<'i, 'c> {
+    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
+    fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
+        Lazy { dfa, cache }
+    }
+
+    /// Return an immutable view by downgrading a writable cache to a read-only
+    /// cache.
+    fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
+        LazyRef::new(self.dfa, self.cache)
+    }
+
+    /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
+    /// like 'next_state' and 'next_eoi_state' that are called in critical
+    /// areas. The idea is to let the optimizer focus on the other areas of
+    /// those methods as the hot path.
+ /// + /// Here's an example that justifies 'inline(never)' + /// + /// ```ignore + /// regex-cli find hybrid dfa \ + /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000 + /// ``` + /// + /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every + /// codepoint, in sequence, repeated 100 times. + /// + /// With 'inline(never)' hyperfine reports 1.1s per run. With + /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement. + #[inline(never)] + fn cache_next_state( + &mut self, + mut current: LazyStateID, + unit: alphabet::Unit, + ) -> Result<LazyStateID, CacheError> { + let stride2 = self.dfa.stride2(); + let empty_builder = self.get_state_builder(); + let builder = determinize::next( + &self.dfa.nfa, + self.dfa.match_kind, + &mut self.cache.sparses, + &mut self.cache.stack, + &self.cache.states[current.as_usize_untagged() >> stride2], + unit, + empty_builder, + ); + let save_state = !self.as_ref().state_builder_fits_in_cache(&builder); + if save_state { + self.save_state(current); + } + let next = self.add_builder_state(builder, |sid| sid)?; + if save_state { + current = self.saved_state_id(); + } + // This is the payoff. The next time 'next_state' is called with this + // state and alphabet unit, it will find this transition and avoid + // having to re-determinize this transition. + self.set_transition(current, unit, next); + Ok(next) + } + + /// Compute and cache the starting state for the given pattern ID (if + /// present) and the starting configuration. + /// + /// This panics if a pattern ID is given and the DFA isn't configured to + /// build anchored start states for each pattern. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + fn cache_start_group( + &mut self, + pattern_id: Option<PatternID>, + start: Start, + ) -> Result<LazyStateID, CacheError> { + let nfa_start_id = match pattern_id { + Some(pid) => { + assert!( + self.dfa.starts_for_each_pattern, + "attempted to search for a specific pattern \ + without enabling starts_for_each_pattern", + ); + self.dfa.nfa.start_pattern(pid) + } + None if self.dfa.anchored => self.dfa.nfa.start_anchored(), + None => self.dfa.nfa.start_unanchored(), + }; + + let id = self.cache_start_one(nfa_start_id, start)?; + self.set_start_state(pattern_id, start, id); + Ok(id) + } + + /// Compute and cache the starting state for the given NFA state ID and the + /// starting configuration. The NFA state ID might be one of the following: + /// + /// 1) An unanchored start state to match any pattern. + /// 2) An anchored start state to match any pattern. + /// 3) An anchored start state for a particular pattern. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. 
+ fn cache_start_one( + &mut self, + nfa_start_id: NFAStateID, + start: Start, + ) -> Result<LazyStateID, CacheError> { + let mut builder_matches = self.get_state_builder().into_matches(); + determinize::set_lookbehind_from_start(&start, &mut builder_matches); + self.cache.sparses.set1.clear(); + determinize::epsilon_closure( + self.dfa.nfa.borrow(), + nfa_start_id, + *builder_matches.look_have(), + &mut self.cache.stack, + &mut self.cache.sparses.set1, + ); + let mut builder = builder_matches.into_nfa(); + determinize::add_nfa_states( + self.dfa.nfa.borrow(), + &self.cache.sparses.set1, + &mut builder, + ); + self.add_builder_state(builder, |id| id.to_start()) + } + + /// Either add the given builder state to this cache, or return an ID to an + /// equivalent state already in this cache. + /// + /// In the case where no equivalent state exists, the idmap function given + /// may be used to transform the identifier allocated. This is useful if + /// the caller needs to tag the ID with additional information. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + fn add_builder_state( + &mut self, + builder: StateBuilderNFA, + idmap: impl Fn(LazyStateID) -> LazyStateID, + ) -> Result<LazyStateID, CacheError> { + if let Some(&cached_id) = + self.cache.states_to_id.get(builder.as_bytes()) + { + // Since we have a cached state, put the constructed state's + // memory back into our scratch space, so that it can be reused. + self.put_state_builder(builder); + return Ok(cached_id); + } + let result = self.add_state(builder.to_state(), idmap); + self.put_state_builder(builder); + result + } + + /// Allocate a new state ID and add the given state to this cache. + /// + /// The idmap function given may be used to transform the identifier + /// allocated. This is useful if the caller needs to tag the ID with + /// additional information. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + fn add_state( + &mut self, + state: State, + idmap: impl Fn(LazyStateID) -> LazyStateID, + ) -> Result<LazyStateID, CacheError> { + if !self.as_ref().state_fits_in_cache(&state) { + self.try_clear_cache()?; + } + // It's important for this to come second, since the above may clear + // the cache. If we clear the cache after ID generation, then the ID + // is likely bunk since it would have been generated based on a larger + // transition table. + let mut id = idmap(self.next_state_id()?); + if state.is_match() { + id = id.to_match(); + } + // Add room in the transition table. Since this is a fresh state, all + // of its transitions are unknown. + self.cache.trans.extend( + iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()), + ); + // When we add a sentinel state, we never want to set any quit + // transitions. Technically, this is harmless, since sentinel states + // have all of their transitions set to loop back to themselves. But + // when creating sentinel states before the quit sentinel state, + // this will try to call 'set_transition' on a state ID that doesn't + // actually exist yet, which isn't allowed. So we just skip doing so + // entirely. 
+ if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) { + let quit_id = self.as_ref().quit_id(); + for b in self.dfa.quitset.iter() { + self.set_transition(id, alphabet::Unit::u8(b), quit_id); + } + } + self.cache.memory_usage_state += state.memory_usage(); + self.cache.states.push(state.clone()); + self.cache.states_to_id.insert(state, id); + Ok(id) + } + + /// Allocate a new state ID. + /// + /// This will never return an unknown lazy state ID. + /// + /// If caching this state would otherwise result in a cache that has been + /// cleared too many times, then an error is returned. + fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> { + let sid = match LazyStateID::new(self.cache.trans.len()) { + Ok(sid) => sid, + Err(_) => { + self.try_clear_cache()?; + // This has to succeed since we check at construction time that + // the ID space can fit at least MIN_STATES states. + LazyStateID::new(self.cache.trans.len()).unwrap() + } + }; + Ok(sid) + } + + /// Attempt to clear the cache used by this lazy DFA. + /// + /// If clearing the cache would exceed the minimum number of required cache + /// clearings, then this will return a cache error. In this case, + /// callers should bubble this up as the cache can't be used until it is + /// reset. Implementations of search should convert this error into a + /// `MatchError::GaveUp`. + /// + /// If 'self.state_saver' is set to save a state, then this state is + /// persisted through cache clearing. Otherwise, the cache is returned to + /// its state after initialization with two exceptions: its clear count + /// is incremented and some of its memory likely has additional capacity. + /// That is, clearing a cache does _not_ release memory. + /// + /// In either case, any lazy state ID generated by the cache prior to + /// clearing it is invalid afterwards. + fn try_clear_cache(&mut self) -> Result<(), CacheError> { + // Currently, the only heuristic we use is the minimum cache clear + // count. If we pass that minimum, then we give up. + // + // It would be good to also add a heuristic based on "bytes searched + // per generated state," but this requires API design work. Namely, + // we really do not want to add a counter increment to the transition + // function, which implies we need to expose APIs to update the number + // of bytes searched by implementers of the search routines. And that + // doesn't seem great... But we should do it if this heuristic isn't + // enough. (The original lazy DFA implementation in the 'regex' crate + // had this heuristic, since the lazy DFA was coupled with the search + // routines.) + if let Some(min_count) = self.dfa.minimum_cache_clear_count { + if self.cache.clear_count >= min_count { + return Err(CacheError::too_many_cache_clears()); + } + } + self.clear_cache(); + Ok(()) + } + + /// Clears _and_ resets the cache. Resetting the cache means that no + /// states are persisted and the clear count is reset to 0. No heap memory + /// is released. + /// + /// Note that the caller may reset a cache with a different DFA than what + /// it was created from, in which case the cache may then be used with the + /// new DFA (and not the old DFA). + fn reset_cache(&mut self) { + self.cache.state_saver = StateSaver::none(); + self.clear_cache(); + // If a new DFA is used, it might have a different number of NFA + // states, so we need to make sure our sparse sets have the appropriate + // size.
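The "give up" heuristic in 'try_clear_cache' above amounts to a bounded retry counter. A stand-alone sketch of how the clear count interacts with the configured minimum, using hypothetical names:

```
// Clearing is permitted only while the clear count stays below the
// configured minimum; once the minimum is reached, the search gives up.
struct ClearBudget {
    clear_count: usize,
    minimum_cache_clear_count: Option<usize>,
}

impl ClearBudget {
    fn try_clear(&mut self) -> Result<(), &'static str> {
        if let Some(min) = self.minimum_cache_clear_count {
            if self.clear_count >= min {
                return Err("too many cache clears; giving up");
            }
        }
        self.clear_count += 1; // the actual cache clearing is elided
        Ok(())
    }
}
```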
+ self.cache.sparses.resize(self.dfa.nfa.len()); + self.cache.clear_count = 0; + } + + /// Clear the cache used by this lazy DFA. + /// + /// Unlike 'try_clear_cache', this always succeeds and does not enforce + /// the minimum cache clear count. Callers that need the "give up" + /// heuristic should use 'try_clear_cache' instead. + /// + /// If 'self.state_saver' is set to save a state, then this state is + /// persisted through cache clearing. Otherwise, the cache is returned to + /// its state after initialization with two exceptions: its clear count + /// is incremented and some of its memory likely has additional capacity. + /// That is, clearing a cache does _not_ release memory. + /// + /// Any lazy state ID generated by the cache prior to clearing it is + /// invalid afterwards. + fn clear_cache(&mut self) { + self.cache.trans.clear(); + self.cache.starts.clear(); + self.cache.states.clear(); + self.cache.states_to_id.clear(); + self.cache.memory_usage_state = 0; + self.cache.clear_count += 1; + trace!( + "lazy DFA cache has been cleared (count: {})", + self.cache.clear_count + ); + self.init_cache(); + // If the state we want to save is one of the sentinel + // (unknown/dead/quit) states, then 'init_cache' adds those back, and + // their identifier values remain invariant. So there's no need to add + // it again. (And indeed, doing so would be incorrect!) + if let Some((old_id, state)) = self.cache.state_saver.take_to_save() { + // If the state is one of the special sentinel states, then it is + // automatically added by cache initialization and its ID always + // remains the same. With that said, this should never occur since + // the sentinel states are all states that loop back to themselves. + // So we should never be in a position where we're attempting to + // save a sentinel state since we never compute transitions out of + // a sentinel state. + assert!( + !self.as_ref().is_sentinel(old_id), + "cannot save sentinel state" + ); + let new_id = self + .add_state(state, |id| { + if old_id.is_start() { + id.to_start() + } else { + id + } + }) + // The unwrap here is OK because lazy DFA creation ensures that + // we have room in the cache to add MIN_STATES states. Since + // 'init_cache' above adds 3, this adds a 4th. + .expect("adding one state after cache clear must work"); + self.cache.state_saver = StateSaver::Saved(new_id); + } + } + + /// Initialize this cache from emptiness to a place where it can be used + /// for search. + /// + /// This is called both at cache creation time and after the cache has been + /// cleared. + /// + /// Primarily, this adds the three sentinel states and allocates some + /// initial memory. + fn init_cache(&mut self) { + let mut starts_len = Start::count(); + if self.dfa.starts_for_each_pattern { + starts_len += Start::count() * self.dfa.pattern_count(); + } + self.cache + .starts + .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len)); + // This is the set of NFA states that corresponds to each of our three + // sentinel states: the empty set. + let dead = State::dead(); + // This sets up some states that we use as sentinels that are present + // in every DFA.
While it would be technically possible to implement + // this DFA without explicitly putting these states in the transition + // table, this is convenient to do to make `next_state` correct for all + // valid state IDs without needing explicit conditionals to special + // case these sentinel states. + // + // All three of these states are "dead" states. That is, all of + // them transition only to themselves. So once you enter one of + // these states, it's impossible to leave them. Thus, any correct + // search routine must explicitly check for these state types. (Sans + // `unknown`, since that is only used internally to represent missing + // states.) + let unk_id = + self.add_state(dead.clone(), |id| id.to_unknown()).unwrap(); + let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap(); + let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap(); + assert_eq!(unk_id, self.as_ref().unknown_id()); + assert_eq!(dead_id, self.as_ref().dead_id()); + assert_eq!(quit_id, self.as_ref().quit_id()); + // The idea here is that if you start in an unknown/dead/quit state and + // try to transition on them, then you should end up where you started. + self.set_all_transitions(unk_id, unk_id); + self.set_all_transitions(dead_id, dead_id); + self.set_all_transitions(quit_id, quit_id); + // All of these states are technically equivalent from the FSM + // perspective, so putting all three of them in the cache's state-to-ID + // map isn't possible. (They are distinct merely because we use their + // identifiers as sentinels to mean something, as indicated by the + // names.) Moreover, we wouldn't want to do that. Unknown and quit + // states are special in that they are artificial constructions of + // this implementation. But dead states are a natural part of + // determinization. When you reach a point in the NFA where you cannot + // go anywhere else, a dead state will naturally arise and we MUST + // reuse the canonical dead state that we've created here. Why? Because + // it is the state ID that tells the search routine whether a state is + // dead or not, and thus, whether to stop the search. Having a bunch of + // distinct dead states would be quite wasteful! + self.cache.states_to_id.insert(dead, dead_id); + } + + /// Save the state corresponding to the ID given such that the state + /// persists through a cache clearing. + /// + /// While the state may persist, the ID may not. In order to discover the + /// new state ID, one must call 'saved_state_id' after a cache clearing. + fn save_state(&mut self, id: LazyStateID) { + let state = self.as_ref().get_cached_state(id).clone(); + self.cache.state_saver = StateSaver::ToSave { id, state }; + } + + /// Returns the updated lazy state ID for a state that was persisted + /// through a cache clearing. + /// + /// It is only correct to call this routine when both a state has been + /// saved and the cache has just been cleared. Otherwise, this panics. + fn saved_state_id(&mut self) -> LazyStateID { + self.cache + .state_saver + .take_saved() + .expect("state saver does not have saved state ID") + } + + /// Set all transitions on the state 'from' to 'to'. + fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) { + for unit in self.dfa.classes.representatives() { + self.set_transition(from, unit, to); + } + } + + /// Set the transition on 'from' for 'unit' to 'to'. + /// + /// This panics if either 'from' or 'to' is invalid. + /// + /// All unit values are OK.
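Putting the sentinel layout described above into one picture: because untagged state IDs are premultiplied by the stride, the three sentinels occupy the first three rows of the transition table. A hypothetical, stand-alone sketch:

```
// The three sentinel states live at rows 0, 1 and 2 of the transition
// table, so their untagged IDs are fixed multiples of the stride.
fn sentinel_ids(stride2: usize) -> (usize, usize, usize) {
    let unknown = 0 << stride2; // row 0
    let dead = 1 << stride2;    // row 1
    let quit = 2 << stride2;    // row 2
    (unknown, dead, quit)
}

// Every sentinel loops back to itself on every byte class, so "leaving" a
// sentinel by following transitions is impossible by construction.
fn make_self_loop(trans: &mut [usize], id: usize, stride: usize) {
    for class in 0..stride {
        trans[id + class] = id; // the untagged ID doubles as a table offset
    }
}
```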
+ fn set_transition( + &mut self, + from: LazyStateID, + unit: alphabet::Unit, + to: LazyStateID, + ) { + assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from); + assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to); + let offset = + from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit); + self.cache.trans[offset] = to; + } + + /// Set the start ID for the given pattern ID (if given) and starting + /// configuration to the ID given. + /// + /// This panics if 'id' is not valid or if a pattern ID is given and + /// 'starts_for_each_pattern' is not enabled. + fn set_start_state( + &mut self, + pattern_id: Option<PatternID>, + start: Start, + id: LazyStateID, + ) { + assert!(self.as_ref().is_valid(id)); + let start_index = start.as_usize(); + let index = match pattern_id { + None => start_index, + Some(pid) => { + assert!( + self.dfa.starts_for_each_pattern, + "attempted to search for a specific pattern \ + without enabling starts_for_each_pattern", + ); + let pid = pid.as_usize(); + Start::count() + (Start::count() * pid) + start_index + } + }; + self.cache.starts[index] = id; + } + + /// Returns a state builder from this DFA that might have existing + /// capacity. This helps avoid allocs in cases where a state is built that + /// turns out to already be cached. + /// + /// Callers must put the state builder back with 'put_state_builder', + /// otherwise the allocation reuse won't work. + fn get_state_builder(&mut self) -> StateBuilderEmpty { + core::mem::replace( + &mut self.cache.scratch_state_builder, + StateBuilderEmpty::new(), + ) + } + + /// Puts the given state builder back into this DFA for reuse. + /// + /// Note that building a 'State' from a builder always creates a new alloc, + /// so callers should always put the builder back. + fn put_state_builder(&mut self, builder: StateBuilderNFA) { + let _ = core::mem::replace( + &mut self.cache.scratch_state_builder, + builder.clear(), + ); + } +} + +/// A type that groups methods that require the base NFA/DFA and read-only +/// access to the cache. +#[derive(Debug)] +struct LazyRef<'i, 'c> { + dfa: &'i DFA, + cache: &'c Cache, +} + +impl<'i, 'c> LazyRef<'i, 'c> { + /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache. + fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> { + LazyRef { dfa, cache } + } + + /// Return the ID of the start state for the given configuration. + /// + /// If the start state has not yet been computed, then this returns an + /// unknown lazy state ID. + fn get_cached_start_id( + &self, + pattern_id: Option<PatternID>, + start: Start, + ) -> LazyStateID { + let start_index = start.as_usize(); + let index = match pattern_id { + None => start_index, + Some(pid) => { + let pid = pid.as_usize(); + assert!( + pid < self.dfa.pattern_count(), + "invalid pattern ID: {:?}", + pid + ); + Start::count() + (Start::count() * pid) + start_index + } + }; + self.cache.starts[index] + } + + /// Return the cached NFA/DFA powerset state for the given ID. + /// + /// This panics if the given ID does not address a valid state. + fn get_cached_state(&self, sid: LazyStateID) -> &State { + let index = sid.as_usize_untagged() >> self.dfa.stride2(); + &self.cache.states[index] + } + + /// Returns true if and only if the given ID corresponds to a "sentinel" + /// state. + /// + /// A sentinel state is a state that signifies a special condition of + /// search, and where every transition maps back to itself. See LazyStateID + /// for more details. 
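Both 'set_start_state' and 'get_cached_start_id' above rely on the same start-table layout: one block of "any pattern" entries followed by one block per pattern. A small sketch of the index computation, with hypothetical stand-alone names:

```
// The start table begins with one entry per starting configuration for the
// "any pattern" case; per-pattern anchored blocks follow when
// 'starts_for_each_pattern' is enabled.
fn start_table_index(
    start_count: usize,        // number of starting configurations
    pattern_id: Option<usize>,
    start_index: usize,        // in 0..start_count
) -> usize {
    match pattern_id {
        None => start_index,
        Some(pid) => start_count + (start_count * pid) + start_index,
    }
}
```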
Note that start and match states are _not_ sentinels + /// since they may otherwise be real states with non-trivial transitions. + /// The purpose of sentinel states is purely to indicate something. Their + /// transitions are not meant to be followed. + fn is_sentinel(&self, id: LazyStateID) -> bool { + id == self.unknown_id() || id == self.dead_id() || id == self.quit_id() + } + + /// Returns the ID of the unknown state for this lazy DFA. + fn unknown_id(&self) -> LazyStateID { + // This unwrap is OK since 0 is always a valid state ID. + LazyStateID::new(0).unwrap().to_unknown() + } + + /// Returns the ID of the dead state for this lazy DFA. + fn dead_id(&self) -> LazyStateID { + // This unwrap is OK since the maximum value here is 1 * 512 = 512, + // which is <= 2047 (the maximum state ID on 16-bit systems), where + // 512 is the worst case for our equivalence classes (every byte is a + // distinct class). + LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead() + } + + /// Returns the ID of the quit state for this lazy DFA. + fn quit_id(&self) -> LazyStateID { + // This unwrap is OK since the maximum value here is 2 * 512 = 1024, + // which is <= 2047 (the maximum state ID on 16-bit systems), where + // 512 is the worst case for our equivalence classes (every byte is a + // distinct class). + LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit() + } + + /// Returns true if and only if the given ID is valid. + /// + /// An ID is valid if it is both a valid index into the transition table + /// and is a multiple of the DFA's stride. + fn is_valid(&self, id: LazyStateID) -> bool { + let id = id.as_usize_untagged(); + id < self.cache.trans.len() && id % self.dfa.stride() == 0 + } + + /// Returns true if adding the state given would fit in this cache. + fn state_fits_in_cache(&self, state: &State) -> bool { + let needed = self.cache.memory_usage() + + self.memory_usage_for_one_more_state(state.memory_usage()); + needed <= self.dfa.cache_capacity + } + + /// Returns true if adding the state to be built by the given builder would + /// fit in this cache. + fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool { + let needed = self.cache.memory_usage() + + self.memory_usage_for_one_more_state(state.as_bytes().len()); + needed <= self.dfa.cache_capacity + } + + /// Returns the additional memory usage, in bytes, required to add one more + /// state to this cache. The given size should be the heap size, in bytes, + /// that would be used by the new state being added. + fn memory_usage_for_one_more_state( + &self, + state_heap_size: usize, + ) -> usize { + const ID_SIZE: usize = size_of::<LazyStateID>(); + const STATE_SIZE: usize = size_of::<State>(); + + self.dfa.stride() * ID_SIZE // additional space needed in trans table + + STATE_SIZE // space in cache.states + + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id + + state_heap_size // heap memory used by state itself + } +} + +/// A simple type that encapsulates the saving of a state ID through a cache +/// clearing. +/// +/// A state ID can be marked for saving with ToSave, while a saved state ID +/// is represented with Saved. +#[derive(Clone, Debug)] +enum StateSaver { + /// An empty state saver. In this case, no states (other than the special + /// sentinel states) are preserved after clearing the cache. + None, + /// An ID of a state (and the state itself) that should be preserved after + /// the lazy DFA's cache has been cleared.
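The accounting in 'memory_usage_for_one_more_state' above tallies every structure a new state touches. A hypothetical, stand-alone rendering of the same arithmetic, generic over the ID and state types:

```
use core::mem::size_of;

// Adding one state costs a full transition-table row, an entry in the
// state list, an entry in the state-to-ID map, and the state's own heap.
fn bytes_for_one_more_state<Id, St>(stride: usize, state_heap_size: usize) -> usize {
    let id_size = size_of::<Id>();
    let state_size = size_of::<St>();
    stride * id_size                 // new row in the transition table
        + state_size                 // entry in the states list
        + (state_size + id_size)     // entry in the states-to-ID map
        + state_heap_size            // heap memory owned by the state itself
}
```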
After clearing, the updated ID + /// is stored in 'Saved' since it may have changed. + ToSave { id: LazyStateID, state: State }, + /// An ID of a state that has been persisted through a lazy DFA + /// cache clearing. The ID recorded here corresponds to an ID that was + /// once marked as ToSave. The IDs are likely not equivalent even though + /// the states they point to are. + Saved(LazyStateID), +} + +impl StateSaver { + /// Create an empty state saver. + fn none() -> StateSaver { + StateSaver::None + } + + /// Replace this state saver with an empty saver, and if this saver is a + /// request to save a state, return that request. + fn take_to_save(&mut self) -> Option<(LazyStateID, State)> { + match core::mem::replace(self, StateSaver::None) { + StateSaver::None | StateSaver::Saved(_) => None, + StateSaver::ToSave { id, state } => Some((id, state)), + } + } + + /// Replace this state saver with an empty saver, and if this saver is a + /// saved state (or a request to save a state), return that state's ID. + /// + /// The idea here is that a request to save a state isn't necessarily + /// honored because it might not be needed. e.g., some higher level code + /// might request a state to be saved on the off chance that the cache gets + /// cleared when a new state is added at a lower level. But if that new + /// state is never added, then the cache is never cleared and the state and + /// its ID remain unchanged. + fn take_saved(&mut self) -> Option<LazyStateID> { + match core::mem::replace(self, StateSaver::None) { + StateSaver::None => None, + StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id), + } + } +} + +/// The configuration used for building a lazy DFA. +/// +/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The +/// advantage of the former is that it often lets you avoid importing the +/// `Config` type directly. +/// +/// A lazy DFA configuration is a simple data object that is typically used +/// with [`Builder::configure`]. +/// +/// The default configuration guarantees that a search will _never_ return +/// a [`MatchError`] for any haystack or pattern. Setting a quit byte with +/// [`Config::quit`], enabling heuristic support for Unicode word boundaries +/// with [`Config::unicode_word_boundary`], or setting a minimum cache clear +/// count with [`Config::minimum_cache_clear_count`] can in turn cause a search +/// to return an error. See the corresponding configuration options for more +/// details on when those error conditions arise. +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + // As with other configuration types in this crate, we put all our knobs + // in options so that we can distinguish between "default" and "not set." + // This makes it possible to easily combine multiple configurations + // without default values overwriting explicitly specified values. See the + // 'overwrite' method. + // + // For docs on the fields below, see the corresponding method setters. + anchored: Option<bool>, + match_kind: Option<MatchKind>, + starts_for_each_pattern: Option<bool>, + byte_classes: Option<bool>, + unicode_word_boundary: Option<bool>, + quitset: Option<ByteSet>, + cache_capacity: Option<usize>, + skip_cache_capacity_check: Option<bool>, + minimum_cache_clear_count: Option<Option<usize>>, +} + +impl Config { + /// Return a new default lazy DFA builder configuration. + pub fn new() -> Config { + Config::default() + } + + /// Set whether matching must be anchored at the beginning of the input.
+ /// + /// When enabled, a match must begin at the start of a search. When + /// disabled (the default), the lazy DFA will act as if the pattern started + /// with a `(?s:.)*?`, which enables a match to appear anywhere. + /// + /// Note that if you want to run both anchored and unanchored + /// searches without building multiple automatons, you can enable the + /// [`Config::starts_for_each_pattern`] configuration instead. This will + /// permit unanchored any-pattern searches and pattern-specific anchored + /// searches. See the documentation for that configuration for an example. + /// + /// By default this is disabled. + /// + /// **WARNING:** this is subtly different than using a `^` at the start of + /// your regex. A `^` forces a regex to match exclusively at the start of + /// input, regardless of where you begin your search. In contrast, enabling + /// this option will allow your regex to match anywhere in your input, + /// but the match must start at the beginning of a search. (Most of the + /// higher level convenience search routines make "start of input" and + /// "start of search" equivalent, but some routines allow treating these as + /// orthogonal.) + /// + /// For example, consider the haystack `aba` and the following searches: + /// + /// 1. The regex `^a` is compiled with `anchored=false` and searches + /// `aba` starting at position `2`. Since `^` requires the match to + /// start at the beginning of the input and `2 > 0`, no match is found. + /// 2. The regex `a` is compiled with `anchored=true` and searches `aba` + /// starting at position `2`. This reports a match at `[2, 3]` since + /// the match starts where the search started. Since there is no `^`, + /// there is no requirement for the match to start at the beginning of + /// the input. + /// 3. The regex `a` is compiled with `anchored=true` and searches `aba` + /// starting at position `1`. Since `b` corresponds to position `1` and + /// since the regex is anchored, it finds no match. + /// 4. The regex `a` is compiled with `anchored=false` and searches `aba` + /// starting at position `1`. Since the regex is neither anchored nor + /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?` + /// prefix that permits it to match anywhere. Thus, it reports a match + /// at `[2, 3]`. + /// + /// # Example + /// + /// This demonstrates the differences between an anchored search and + /// a pattern that begins with `^` (as described in the above warning + /// message). + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let haystack = "aba".as_bytes(); + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().anchored(false)) // default + /// .build(r"^a")?; + /// let mut cache = dfa.create_cache(); + /// let got = dfa.find_leftmost_fwd_at( + /// &mut cache, None, None, haystack, 2, 3, + /// )?; + /// // No match is found because 2 is not the beginning of the haystack, + /// // which is what ^ requires. + /// let expected = None; + /// assert_eq!(expected, got); + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().anchored(true)) + /// .build(r"a")?; + /// let mut cache = dfa.create_cache(); + /// let got = dfa.find_leftmost_fwd_at( + /// &mut cache, None, None, haystack, 2, 3, + /// )?; + /// // An anchored search can still match anywhere in the haystack; it just + /// // must begin at the start of the search, which is '2' in this case.
+ /// let expected = Some(HalfMatch::must(0, 3)); + /// assert_eq!(expected, got); + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().anchored(true)) + /// .build(r"a")?; + /// let mut cache = dfa.create_cache(); + /// let got = dfa.find_leftmost_fwd_at( + /// &mut cache, None, None, haystack, 1, 3, + /// )?; + /// // No match is found since we start searching at offset 1, which + /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match + /// // is found. + /// let expected = None; + /// assert_eq!(expected, got); + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().anchored(false)) + /// .build(r"a")?; + /// let mut cache = dfa.create_cache(); + /// let got = dfa.find_leftmost_fwd_at( + /// &mut cache, None, None, haystack, 1, 3, + /// )?; + /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the + /// // pattern. Even though the search starts at 'b', the 'match anything' + /// // prefix allows the search to match 'a'. + /// let expected = Some(HalfMatch::must(0, 3)); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn anchored(mut self, yes: bool) -> Config { + self.anchored = Some(yes); + self + } + + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the + /// match semantics of Perl-like regex engines. That is, when multiple + /// patterns would match at the same leftmost position, the pattern that + /// appears first in the concrete syntax is chosen. + /// + /// Currently, the only other kind of match semantics supported is + /// [`MatchKind::All`]. This corresponds to classical DFA construction + /// where all possible matches are added to the lazy DFA. + /// + /// Typically, `All` is used when one wants to execute an overlapping + /// search and `LeftmostFirst` otherwise. In particular, it rarely makes + /// sense to use `All` with the various "leftmost" find routines, since the + /// leftmost routines depend on the `LeftmostFirst` automata construction + /// strategy. Specifically, `LeftmostFirst` adds dead states to the + /// lazy DFA as a way to terminate the search and report a match. + /// `LeftmostFirst` also supports non-greedy matches using this strategy, + /// whereas `All` does not. + /// + /// # Example: overlapping search + /// + /// This example shows the typical use of `MatchKind::All`, which is to + /// report overlapping matches. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, OverlappingState}, + /// HalfMatch, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, and is therefore an + /// // earlier match that is reported first.
+ /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: reverse automaton to find start of match + /// + /// Another example for using `MatchKind::All` is for constructing a + /// reverse automaton to find the start of a match. `All` semantics are + /// used for this in order to find the longest possible match, which + /// corresponds to the leftmost starting position. + /// + /// Note that if you need the starting position then + /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this + /// for you, so it's usually not necessary to do this yourself. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchKind}; + /// + /// let haystack = "123foobar456".as_bytes(); + /// let pattern = r"[a-z]+"; + /// + /// let dfa_fwd = DFA::new(pattern)?; + /// let dfa_rev = DFA::builder() + /// .configure(DFA::config() + /// .anchored(true) + /// .match_kind(MatchKind::All) + /// ) + /// .build(pattern)?; + /// let mut cache_fwd = dfa_fwd.create_cache(); + /// let mut cache_rev = dfa_rev.create_cache(); + /// + /// let expected_fwd = HalfMatch::must(0, 9); + /// let expected_rev = HalfMatch::must(0, 3); + /// let got_fwd = dfa_fwd.find_leftmost_fwd( + /// &mut cache_fwd, haystack, + /// )?.unwrap(); + /// // Here we don't specify the pattern to search for since there's only + /// // one pattern and we're doing a leftmost search. But if this were an + /// // overlapping search, you'd need to specify the pattern that matched + /// // in the forward direction. (Otherwise, you might wind up finding the + /// // starting position of a match of some other pattern.) That in turn + /// // requires building the reverse automaton with starts_for_each_pattern + /// // enabled. Indeed, this is what Regex does internally. + /// let got_rev = dfa_rev.find_leftmost_rev_at( + /// &mut cache_rev, None, haystack, 0, got_fwd.offset(), + /// )?.unwrap(); + /// assert_eq!(expected_fwd, got_fwd); + /// assert_eq!(expected_rev, got_rev); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); + self + } + + /// Whether to compile a separate start state for each pattern in the + /// lazy DFA. + /// + /// When enabled, a separate **anchored** start state is added for each + /// pattern in the lazy DFA. When this start state is used, then the DFA + /// will only search for matches for the pattern specified, even if there + /// are other patterns in the DFA. + /// + /// The main downside of this option is that it can potentially increase + /// the size of the DFA and/or increase the time it takes to build the + /// DFA at search time. However, since this is configuration for a lazy + /// DFA, these states aren't actually built unless they're used. Enabling + /// this isn't necessarily free, however, as it may result in higher cache + /// usage. + /// + /// There are a few reasons one might want to enable this (it's disabled + /// by default): + /// + /// 1. When looking for the start of an overlapping match (using a reverse + /// DFA), doing it correctly requires starting the reverse search using the + /// starting state of the pattern that matched in the forward direction. 
+ /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it + /// will automatically enable this option when building the reverse DFA + /// internally. + /// 2. When you want to use a DFA with multiple patterns both to search + /// for matches of any pattern and to search for anchored matches of one + /// particular pattern while using the same DFA. (Otherwise, you would need + /// to compile a new DFA for each pattern.) + /// 3. Since the start states added for each pattern are anchored, if you + /// compile an unanchored DFA with one pattern while also enabling this + /// option, then you can use the same DFA to perform anchored or unanchored + /// searches. The latter you get with the standard search APIs. The former + /// you get from the various `_at` search methods that allow you to specify + /// a pattern ID to search for. + /// + /// By default this is disabled. + /// + /// # Example + /// + /// This example shows how to use this option to permit the same lazy DFA + /// to run both anchored and unanchored searches for a single pattern. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, PatternID}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build(r"foo[0-9]+")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = b"quux foo123"; + /// + /// // Here's a normal unanchored search. Notice that we use 'None' for the + /// // pattern ID. Since the DFA was built as an unanchored machine, it + /// // uses its default unanchored starting state. + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at( + /// &mut cache, None, None, haystack, 0, haystack.len(), + /// )?); + /// // But now if we explicitly specify the pattern to search ('0' being + /// // the only pattern in the DFA), then it will use the starting state + /// // for that specific pattern which is always anchored. Since the + /// // pattern doesn't have a match at the beginning of the haystack, we + /// // find nothing. + /// assert_eq!(None, dfa.find_leftmost_fwd_at( + /// &mut cache, None, Some(PatternID::must(0)), haystack, 0, haystack.len(), + /// )?); + /// // And finally, an anchored search is not the same as putting a '^' at + /// // the beginning of the pattern. An anchored search can only match at + /// // the beginning of the *search*, which we can change: + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at( + /// &mut cache, None, Some(PatternID::must(0)), haystack, 5, haystack.len(), + /// )?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { + self.starts_for_each_pattern = Some(yes); + self + } + + /// Whether to attempt to shrink the size of the lazy DFA's alphabet or + /// not. + /// + /// This option is enabled by default and should never be disabled unless + /// one is debugging the lazy DFA. + /// + /// When enabled, the lazy DFA will use a map from all possible bytes + /// to their corresponding equivalence class. Each equivalence class + /// represents a set of bytes that does not discriminate between a match + /// and a non-match in the DFA. For example, the pattern `[ab]+` has at + /// least two equivalence classes: a set containing `a` and `b` and a set + /// containing every byte except for `a` and `b`. `a` and `b` are in the + /// same equivalence class because they never discriminate between a + /// match and a non-match.
+ /// + /// The advantage of this map is that the size of the transition table + /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)` + /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of + /// equivalence classes (rounded up to the nearest power of 2). As a + /// result, total space usage can decrease substantially. Moreover, since a + /// smaller alphabet is used, DFA compilation during search becomes faster + /// as well since it will potentially be able to reuse a single transition + /// for multiple bytes. + /// + /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling + /// this does not yield any speed advantages. Namely, even when this is + /// disabled, a byte class map is still used while searching. The only + /// difference is that every byte will be forced into its own distinct + /// equivalence class. This is useful for debugging the actual generated + /// transitions because it lets one see the transitions defined on actual + /// bytes instead of the equivalence classes. + pub fn byte_classes(mut self, yes: bool) -> Config { + self.byte_classes = Some(yes); + self + } + + /// Heuristically enable Unicode word boundaries. + /// + /// When set, this will attempt to implement Unicode word boundaries as if + /// they were ASCII word boundaries. This only works when the search input + /// is ASCII only. If a non-ASCII byte is observed while searching, then a + /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned. + /// + /// A possible alternative to enabling this option is to simply use an + /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this + /// option is if you absolutely need Unicode support. This option lets one + /// use a fast search implementation (a DFA) for some potentially very + /// common cases, while providing the option to fall back to some other + /// regex engine to handle the general case when an error is returned. + /// + /// If the pattern provided has no Unicode word boundary in it, then this + /// option has no effect. (That is, quitting on a non-ASCII byte only + /// occurs when this option is enabled _and_ a Unicode word boundary is + /// present in the pattern.) + /// + /// This is almost equivalent to setting all non-ASCII bytes to be quit + /// bytes. The only difference is that this will cause non-ASCII bytes to + /// be quit bytes _only_ when a Unicode word boundary is present in the + /// pattern. + /// + /// When enabling this option, callers _must_ be prepared to handle + /// a [`MatchError`](crate::MatchError) error during search. + /// When using a [`Regex`](crate::hybrid::regex::Regex), this + /// corresponds to using the `try_` suite of methods. Alternatively, + /// if callers can guarantee that their input is ASCII only, then a + /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be + /// returned while searching. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows how to heuristically enable Unicode word boundaries + /// in a pattern. It also shows what happens when a search comes across a + /// non-ASCII byte. 
+ /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// HalfMatch, MatchError, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().unicode_word_boundary(true)) + /// .build(r"\b[0-9]+\b")?; + /// let mut cache = dfa.create_cache(); + /// + /// // The match occurs before the search ever observes the snowman + /// // character, so no error occurs. + /// let haystack = "foo 123 ☃".as_bytes(); + /// let expected = Some(HalfMatch::must(0, 7)); + /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?; + /// assert_eq!(expected, got); + /// + /// // Notice that this search fails, even though the snowman character + /// // occurs after the ending match offset. This is because search + /// // routines read one byte past the end of the search to account for + /// // look-around, and indeed, this is required here to determine whether + /// // the trailing \b matches. + /// let haystack = "foo 123☃".as_bytes(); + /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 }; + /// let got = dfa.find_leftmost_fwd(&mut cache, haystack); + /// assert_eq!(Err(expected), got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn unicode_word_boundary(mut self, yes: bool) -> Config { + // We have a separate option for this instead of just setting the + // appropriate quit bytes here because we don't want to set quit bytes + // for every regex. We only want to set them when the regex contains a + // Unicode word boundary. + self.unicode_word_boundary = Some(yes); + self + } + + /// Add a "quit" byte to the lazy DFA. + /// + /// When a quit byte is seen during search time, then search will return + /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the + /// offset at which the search stopped. + /// + /// A quit byte will always overrule any other aspects of a regex. For + /// example, if the `x` byte is added as a quit byte and the regex `\w` is + /// used, then observing `x` will cause the search to quit immediately + /// despite the fact that `x` is in the `\w` class. + /// + /// This mechanism is primarily useful for heuristically enabling certain + /// features like Unicode word boundaries in a DFA. Namely, if the input + /// to search is ASCII, then a Unicode word boundary can be implemented + /// via an ASCII word boundary with no change in semantics. Thus, a DFA + /// can attempt to match a Unicode word boundary but give up as soon as it + /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes + /// to be quit bytes, then Unicode word boundaries will be permitted when + /// building lazy DFAs. Of course, callers should enable + /// [`Config::unicode_word_boundary`] if they want this behavior instead. + /// (The advantage being that non-ASCII quit bytes will only be added if a + /// Unicode word boundary is in the pattern.) + /// + /// When enabling this option, callers _must_ be prepared to handle a + /// [`MatchError`](crate::MatchError) error during search. When using a + /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the + /// `try_` suite of methods. + /// + /// By default, there are no quit bytes set. + /// + /// # Panics + /// + /// This panics if heuristic Unicode word boundaries are enabled and any + /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling + /// Unicode word boundaries requires setting every non-ASCII byte to a quit + /// byte. So if the caller attempts to undo any of that, then this will + /// panic. 
+ /// + /// # Example + /// + /// This example shows how to cause a search to terminate if it sees a + /// `\n` byte. This could be useful if, for example, you wanted to prevent + /// a user supplied pattern from matching across a line boundary. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().quit(b'\n', true)) + /// .build(r"foo\p{any}+bar")?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "foo\nbar".as_bytes(); + /// // Normally this would produce a match, since \p{any} contains '\n'. + /// // But since we instructed the automaton to enter a quit state if a + /// // '\n' is observed, this produces a match error instead. + /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; + /// let got = dfa.find_leftmost_fwd(&mut cache, haystack).unwrap_err(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn quit(mut self, byte: u8, yes: bool) -> Config { + if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes { + panic!( + "cannot set non-ASCII byte to be non-quit when \ + Unicode word boundaries are enabled" + ); + } + if self.quitset.is_none() { + self.quitset = Some(ByteSet::empty()); + } + if yes { + self.quitset.as_mut().unwrap().add(byte); + } else { + self.quitset.as_mut().unwrap().remove(byte); + } + self + } + + /// Sets the maximum amount of heap memory, in bytes, to allocate to the + /// cache for use during a lazy DFA search. If the lazy DFA would otherwise + /// use more heap memory, then, depending on other configuration knobs, the + /// search will either stop and return an error or clear the cache and + /// continue the search. + /// + /// The default cache capacity is some "reasonable" number that will + /// accommodate most regular expressions. If you need to build a large + /// DFA, then it may be necessary to increase the cache capacity. + /// + /// Note that while building a lazy DFA will do a "minimum" check to ensure + /// the capacity is big enough, this is more or less about correctness. + /// If the cache is bigger than the minimum but still too small, then the + /// lazy DFA could wind up spending a lot of time clearing the cache and + /// recomputing transitions, thus negating the performance benefits of a + /// lazy DFA. Thus, setting the cache capacity is mostly an experimental + /// endeavor. For most common patterns, however, the default should be + /// sufficient. + /// + /// For more details on how the lazy DFA's cache is used, see the + /// documentation for [`Cache`]. + /// + /// # Example + /// + /// This example shows what happens if the configured cache capacity is + /// too small. In such cases, one can override the cache capacity to make + /// it bigger. Alternatively, one might want to use less memory by setting + /// a smaller cache capacity. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError}; + /// + /// let pattern = r"\p{L}{1000}"; + /// + /// // The default cache capacity is likely too small to deal with regexes + /// // that are very large. Large repetitions of large Unicode character + /// // classes are a common way to make very large regexes. + /// let _ = DFA::new(pattern).unwrap_err(); + /// // Bump up the capacity to something bigger.
+ /// let dfa = DFA::builder() + /// .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB + /// .build(pattern)?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); + /// let expected = Some(HalfMatch::must(0, 2000)); + /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn cache_capacity(mut self, bytes: usize) -> Config { + self.cache_capacity = Some(bytes); + self + } + + /// Configures construction of a lazy DFA to use the minimum cache capacity + /// if the configured capacity is otherwise too small for the provided NFA. + /// + /// This is useful if you never want lazy DFA construction to fail because + /// of a capacity that is too small. + /// + /// In general, this option is not a good idea. In particular, + /// while a minimum cache capacity does permit the lazy DFA to function + /// where it otherwise couldn't, it's plausible that it may not function + /// well if it's constantly running out of room. In that case, the speed + /// advantages of the lazy DFA may be negated. + /// + /// This is disabled by default. + /// + /// # Example + /// + /// This example shows what happens if the configured cache capacity is + /// too small. In such cases, one could override the capacity explicitly. + /// An alternative, demonstrated here, lets us force construction to use + /// the minimum cache capacity if the configured capacity is otherwise + /// too small. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError}; + /// + /// let pattern = r"\p{L}{1000}"; + /// + /// // The default cache capacity is likely too small to deal with regexes + /// // that are very large. Large repetitions of large Unicode character + /// // classes are a common way to make very large regexes. + /// let _ = DFA::new(pattern).unwrap_err(); + /// // Configure construction such that it automatically selects the + /// // minimum cache capacity if it would otherwise be too small. + /// let dfa = DFA::builder() + /// .configure(DFA::config().skip_cache_capacity_check(true)) + /// .build(pattern)?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); + /// let expected = Some(HalfMatch::must(0, 2000)); + /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config { + self.skip_cache_capacity_check = Some(yes); + self + } + + /// Configure a lazy DFA search to quit after a certain number of cache + /// clearings. + /// + /// When a minimum is set, then a lazy DFA search will "give up" after + /// the minimum number of cache clearings has occurred. This is typically + /// useful in scenarios where callers want to detect whether the lazy DFA + /// search is "efficient" or not. If the cache is cleared too many times, + /// this is a good indicator that it is not efficient, and thus, the caller + /// may wish to use some other regex engine. + /// + /// Note that the number of times a cache is cleared is a property of + /// the cache itself. Thus, if a cache is used in a subsequent search + /// with a similarly configured lazy DFA, then it would cause the + /// search to "give up" if the cache needed to be cleared.
The cache + /// clear count can only be reset to `0` via [`DFA::reset_cache`] (or + /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if + /// you're using the `Regex` API). + /// + /// By default, no minimum is configured. Thus, a lazy DFA search will + /// never give up due to cache clearings. + /// + /// # Example + /// + /// This example uses a somewhat pathological configuration to demonstrate + /// the _possible_ behavior of cache clearing and how it might result + /// in a search that returns an error. + /// + /// It is important to note that the precise mechanics of how and when + /// a cache gets cleared is an implementation detail. Thus, the asserts + /// in the tests below with respect to the particular offsets at which a + /// search gave up should be viewed strictly as a demonstration. They are + /// not part of any API guarantees offered by this crate. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, MatchError}; + /// + /// // This is a carefully chosen regex. The idea is to pick one + /// // that requires some decent number of states (hence the bounded + /// // repetition). But we specifically choose to create a class with an + /// // ASCII letter and a non-ASCII letter so that we can check that no new + /// // states are created once the cache is full. Namely, if we fill up the + /// // cache on a haystack of 'a's, then in order to match one 'β', a new + /// // state will need to be created since a 'β' is encoded with multiple + /// // bytes. Since there's no room for this state, the search should quit + /// // at the very first position. + /// let pattern = r"[aβ]{100}"; + /// let dfa = DFA::builder() + /// .configure( + /// // Configure it so that we have the minimum cache capacity + /// // possible. And that if any clearings occur, the search quits. + /// DFA::config() + /// .skip_cache_capacity_check(true) + /// .cache_capacity(0) + /// .minimum_cache_clear_count(Some(0)), + /// ) + /// .build(pattern)?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "a".repeat(101).into_bytes(); + /// assert_eq!( + /// dfa.find_leftmost_fwd(&mut cache, &haystack), + /// Err(MatchError::GaveUp { offset: 25 }), + /// ); + /// + /// // Now that we know the cache is full, if we search a haystack that we + /// // know will require creating at least one new state, it should not + /// // be able to make any progress. + /// let haystack = "β".repeat(101).into_bytes(); + /// assert_eq!( + /// dfa.find_leftmost_fwd(&mut cache, &haystack), + /// Err(MatchError::GaveUp { offset: 0 }), + /// ); + /// + /// // If we reset the cache, then we should be able to create more states + /// // and make more progress with searching for betas. + /// cache.reset(&dfa); + /// let haystack = "β".repeat(101).into_bytes(); + /// assert_eq!( + /// dfa.find_earliest_fwd(&mut cache, &haystack), + /// Err(MatchError::GaveUp { offset: 26 }), + /// ); + /// + /// // ... switching back to ASCII still makes progress since it just needs + /// // to set transitions on existing states! + /// let haystack = "a".repeat(101).into_bytes(); + /// assert_eq!( + /// dfa.find_earliest_fwd(&mut cache, &haystack), + /// Err(MatchError::GaveUp { offset: 13 }), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config { + self.minimum_cache_clear_count = Some(min); + self + } + + /// Returns whether this configuration has enabled anchored searches. 
+ pub fn get_anchored(&self) -> bool { + self.anchored.unwrap_or(false) + } + + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns whether this configuration has enabled anchored starting states + /// for every pattern in the DFA. + pub fn get_starts_for_each_pattern(&self) -> bool { + self.starts_for_each_pattern.unwrap_or(false) + } + + /// Returns whether this configuration has enabled byte classes or not. + /// This is typically a debugging-oriented option, as disabling it confers + /// no speed benefit. + pub fn get_byte_classes(&self) -> bool { + self.byte_classes.unwrap_or(true) + } + + /// Returns whether this configuration has enabled heuristic Unicode word + /// boundary support. When enabled, it is possible for a search to return + /// an error. + pub fn get_unicode_word_boundary(&self) -> bool { + self.unicode_word_boundary.unwrap_or(false) + } + + /// Returns whether this configuration will instruct the DFA to enter a + /// quit state whenever the given byte is seen during a search. When at + /// least one byte has this enabled, it is possible for a search to return + /// an error. + pub fn get_quit(&self, byte: u8) -> bool { + self.quitset.map_or(false, |q| q.contains(byte)) + } + + /// Returns the cache capacity set on this configuration. + pub fn get_cache_capacity(&self) -> usize { + self.cache_capacity.unwrap_or(2 * (1 << 20)) + } + + /// Returns whether the cache capacity check should be skipped. + pub fn get_skip_cache_capacity_check(&self) -> bool { + self.skip_cache_capacity_check.unwrap_or(false) + } + + /// Returns, if set, the minimum number of times the cache must be cleared + /// before a lazy DFA search can give up. When no minimum is set, then a + /// search will never quit and will always clear the cache whenever it + /// fills up. + pub fn get_minimum_cache_clear_count(&self) -> Option<usize> { + self.minimum_cache_clear_count.unwrap_or(None) + } + + /// Returns the minimum lazy DFA cache capacity required for the given NFA. + /// + /// The cache capacity required for a particular NFA may change without + /// notice. Callers should not rely on it being stable. + /// + /// This is useful for informational purposes, but can also be useful for + /// other reasons. For example, one might want to check the minimum cache + /// capacity themselves, or set the capacity based on the minimum. + /// + /// This may return an error if this configuration does not support all of + /// the instructions used in the given NFA. For example, this occurs if the + /// NFA has a Unicode word boundary but this configuration does not enable + /// heuristic support for Unicode word boundaries. + pub fn get_minimum_cache_capacity( + &self, + nfa: &thompson::NFA, + ) -> Result<usize, BuildError> { + let quitset = self.quit_set_from_nfa(nfa)?; + let classes = self.byte_classes_from_nfa(nfa, &quitset); + let starts = self.get_starts_for_each_pattern(); + Ok(minimum_cache_capacity(nfa, &classes, starts)) + } + + /// Returns the byte class map used during search from the given NFA. + /// + /// If byte classes are disabled on this configuration, then a map is + /// returned that puts each byte in its own equivalence class.
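The getters above all follow the same pattern: each knob is stored as an `Option` so that merging configurations can tell "explicitly set" apart from "left at default". A trimmed, hypothetical sketch of that pattern and the corresponding merge:

```
// Two example knobs; 'None' means "not set", so defaults are supplied by
// the getters rather than stored in the struct.
#[derive(Clone, Copy, Debug, Default)]
struct MiniConfig {
    anchored: Option<bool>,
    byte_classes: Option<bool>,
}

impl MiniConfig {
    fn get_anchored(&self) -> bool {
        self.anchored.unwrap_or(false) // default: unanchored
    }

    fn get_byte_classes(&self) -> bool {
        self.byte_classes.unwrap_or(true) // default: enabled
    }

    // Options set in 'o' win; unset options fall back to 'self'.
    fn overwrite(self, o: MiniConfig) -> MiniConfig {
        MiniConfig {
            anchored: o.anchored.or(self.anchored),
            byte_classes: o.byte_classes.or(self.byte_classes),
        }
    }
}
```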
+ fn byte_classes_from_nfa( + &self, + nfa: &thompson::NFA, + quit: &ByteSet, + ) -> ByteClasses { + if !self.get_byte_classes() { + // The lazy DFA will always use the equivalence class map, but + // disabling byte classes is useful for debugging. Namely, doing + // so will cause all transitions to be defined over their actual + // bytes instead of an opaque equivalence class identifier. The + // former is much easier to grok as a human. + ByteClasses::singletons() + } else { + let mut set = nfa.byte_class_set().clone(); + // It is important to distinguish any "quit" bytes from all other + // bytes. Otherwise, a non-quit byte may end up in the same class + // as a quit byte, and thus cause the DFA to stop when it + // shouldn't. + if !quit.is_empty() { + set.add_set(&quit); + } + set.byte_classes() + } + } + + /// Return the quit set for this configuration and the given NFA. + /// + /// This may return an error if the NFA is incompatible with this + /// configuration's quit set. For example, this occurs if the NFA has a + /// Unicode word boundary and the quit set doesn't include non-ASCII bytes. + fn quit_set_from_nfa( + &self, + nfa: &thompson::NFA, + ) -> Result<ByteSet, BuildError> { + let mut quit = self.quitset.unwrap_or(ByteSet::empty()); + if nfa.has_word_boundary_unicode() { + if self.get_unicode_word_boundary() { + for b in 0x80..=0xFF { + quit.add(b); + } + } else { + // If heuristic support for Unicode word boundaries wasn't + // enabled, then we can still check if our quit set is correct. + // If the caller set their quit bytes in a way that causes the + // DFA to quit on at least all non-ASCII bytes, then that's all + // we need for heuristic support to work. + if !quit.contains_range(0x80, 0xFF) { + return Err( + BuildError::unsupported_dfa_word_boundary_unicode(), + ); + } + } + } + Ok(quit) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + fn overwrite(self, o: Config) -> Config { + Config { + anchored: o.anchored.or(self.anchored), + match_kind: o.match_kind.or(self.match_kind), + starts_for_each_pattern: o + .starts_for_each_pattern + .or(self.starts_for_each_pattern), + byte_classes: o.byte_classes.or(self.byte_classes), + unicode_word_boundary: o + .unicode_word_boundary + .or(self.unicode_word_boundary), + quitset: o.quitset.or(self.quitset), + cache_capacity: o.cache_capacity.or(self.cache_capacity), + skip_cache_capacity_check: o + .skip_cache_capacity_check + .or(self.skip_cache_capacity_check), + minimum_cache_clear_count: o + .minimum_cache_clear_count + .or(self.minimum_cache_clear_count), + } + } +} + +/// A builder for constructing a lazy deterministic finite automaton from +/// regular expressions. +/// +/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The +/// advantage of the former is that it often lets you avoid importing the +/// `Builder` type directly. +/// +/// This builder provides two main things: +/// +/// 1. It provides a few different `build` routines for actually constructing +/// a DFA from different kinds of inputs. The most convenient is +/// [`Builder::build`], which builds a DFA directly from a pattern string. The +/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight +/// from an NFA. +/// 2. The builder permits configuring a number of things.
+/// [`Builder::configure`] is used with [`Config`] to configure aspects of +/// the DFA and the construction process itself. [`Builder::syntax`] and +/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA +/// construction, respectively. The syntax and thompson configurations only +/// apply when building from a pattern string. +/// +/// This builder always constructs a *single* lazy DFA. As such, this builder +/// can only be used to construct regexes that either detect the presence +/// of a match or find the end location of a match. A single DFA cannot +/// produce both the start and end of a match. For that information, use a +/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured +/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason +/// to use a DFA directly is if the end location of a match is enough for your +/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one, +/// since a second reverse DFA is needed to find the start of a match. +/// +/// # Example +/// +/// This example shows how to build a lazy DFA that uses a tiny cache capacity +/// and completely disables Unicode. That is: +/// +/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w` +/// and `\b` are ASCII-only while `.` matches any byte except for `\n` +/// (instead of any UTF-8 encoding of a Unicode scalar value except for +/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. +/// * The pattern itself is permitted to match invalid UTF-8. For example, +/// things like `[^a]` that match any byte except for `a` are permitted. +/// * Unanchored patterns can search through invalid UTF-8. That is, for +/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of +/// `(?s:.)*?`. +/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::DFA, +/// nfa::thompson, +/// HalfMatch, SyntaxConfig, +/// }; +/// +/// let dfa = DFA::builder() +/// .configure(DFA::config().cache_capacity(5_000)) +/// .syntax(SyntaxConfig::new().unicode(false).utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo[^b]ar.*")?; +/// let mut cache = dfa.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; +/// let expected = Some(HalfMatch::must(0, 10)); +/// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + thompson: thompson::Builder, +} + +impl Builder { + /// Create a new lazy DFA builder with the default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + thompson: thompson::Builder::new(), + } + } + + /// Build a lazy DFA from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a lazy DFA from the given patterns. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<DFA, BuildError> { + let nfa = + self.thompson.build_many(patterns).map_err(BuildError::nfa)?; + self.build_from_nfa(Arc::new(nfa)) + } + + /// Build a DFA from the given NFA. 
+ /// + /// Note that this requires an `Arc<thompson::NFA>` instead of a + /// `&thompson::NFA` because the lazy DFA builds itself from the NFA at + /// search time. This means that the lazy DFA must hold on to its source + /// NFA for the entirety of its lifetime. An `Arc` is used so that callers + /// aren't forced to clone the NFA if it is needed elsewhere. + /// + /// # Example + /// + /// This example shows how to build a lazy DFA if you already have an NFA + /// in hand. + /// + /// ``` + /// use std::sync::Arc; + /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch}; + /// + /// let haystack = "foo123bar".as_bytes(); + /// + /// // This shows how to set non-default options for building an NFA. + /// let nfa = thompson::Builder::new() + /// .configure(thompson::Config::new().shrink(false)) + /// .build(r"[0-9]+")?; + /// let dfa = DFA::builder().build_from_nfa(Arc::new(nfa))?; + /// let mut cache = dfa.create_cache(); + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_nfa( + &self, + nfa: Arc<thompson::NFA>, + ) -> Result<DFA, BuildError> { + let quitset = self.config.quit_set_from_nfa(&nfa)?; + let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); + // Check that we can fit at least a few states into our cache, + // otherwise it's pretty senseless to use the lazy DFA. This does have + // a possible failure mode though. This assumes the maximum size of a + // state in powerset space (so, the total number of NFA states), which + // may never actually materialize, and could be quite a bit larger + // than the actual biggest state. If this turns out to be a problem, + // we could expose a knob that disables this check. But if so, we have + // to be careful not to panic in other areas of the code (the cache + // clearing and init code) that tend to assume some minimum useful + // cache capacity. + let min_cache = minimum_cache_capacity( + &nfa, + &classes, + self.config.get_starts_for_each_pattern(), + ); + let mut cache_capacity = self.config.get_cache_capacity(); + if cache_capacity < min_cache { + // When the caller has asked us to skip the cache capacity check, + // then we simply force the cache capacity to its minimum amount + // and mush on. + if self.config.get_skip_cache_capacity_check() { + trace!( + "given capacity ({}) is too small, \ + since skip_cache_capacity_check is enabled, \ + setting cache capacity to minimum ({})", + cache_capacity, + min_cache, + ); + cache_capacity = min_cache; + } else { + return Err(BuildError::insufficient_cache_capacity( + min_cache, + cache_capacity, + )); + } + } + // We also need to check that we can fit at least some small number + // of states in our state ID space. This is unlikely to trigger in + // >=32-bit systems, but 16-bit systems have a pretty small state ID + // space since a number of bits are used up as sentinels. 
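+        // (As a rough sketch of the arithmetic on 16-bit targets: five of
+        // the sixteen ID bits are reserved as tag sentinels, leaving
+        // 2^11 = 2048 untagged ID values. Since state IDs are premultiplied
+        // by the stride, a large alphabet can exhaust the space after only a
+        // handful of states.)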
+        if let Err(err) = minimum_lazy_state_id(&nfa, &classes) {
+            return Err(BuildError::insufficient_state_id_capacity(err));
+        }
+        let stride2 = classes.stride2();
+        Ok(DFA {
+            nfa,
+            stride2,
+            classes,
+            quitset,
+            anchored: self.config.get_anchored(),
+            match_kind: self.config.get_match_kind(),
+            starts_for_each_pattern: self.config.get_starts_for_each_pattern(),
+            cache_capacity,
+            minimum_cache_clear_count: self
+                .config
+                .get_minimum_cache_clear_count(),
+        })
+    }
+
+    /// Apply the given lazy DFA configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](crate::SyntaxConfig).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and multi
+    /// line mode.
+    ///
+    /// These settings only apply when constructing a lazy DFA directly from a
+    /// pattern.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        self.thompson.syntax(config);
+        self
+    }
+
+    /// Set the Thompson NFA configuration for this builder using
+    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+    ///
+    /// This permits setting things like whether the DFA should match the regex
+    /// in reverse or if additional time should be spent shrinking the size of
+    /// the NFA.
+    ///
+    /// These settings only apply when constructing a DFA directly from a
+    /// pattern.
+    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+        self.thompson.configure(config);
+        self
+    }
+}
+
+/// Based on the minimum number of states required for a useful lazy DFA cache,
+/// this returns the minimum lazy state ID that must be representable.
+///
+/// This is unlikely to impose any constraints on 32-bit systems (or bigger),
+/// but on 16-bit systems, the lazy state ID space is quite constrained and
+/// thus may be insufficient for bigger regexes.
+fn minimum_lazy_state_id(
+    nfa: &thompson::NFA,
+    classes: &ByteClasses,
+) -> Result<LazyStateID, LazyStateIDError> {
+    let stride = 1 << classes.stride2();
+    let min_state_index = MIN_STATES.checked_sub(1).unwrap();
+    LazyStateID::new(min_state_index * stride)
+}
+
+/// Based on the minimum number of states required for a useful lazy DFA cache,
+/// this returns a heuristic minimum number of bytes of heap space required.
+///
+/// This is a "heuristic" because the minimum it returns is likely bigger than
+/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses
+/// the maximum number of NFA states (all of them). This is likely bigger
+/// than what is required in practice. Computing the true minimum effectively
+/// requires determinization, which is probably too much work to do for a
+/// simple check like this.
+fn minimum_cache_capacity(
+    nfa: &thompson::NFA,
+    classes: &ByteClasses,
+    starts_for_each_pattern: bool,
+) -> usize {
+    const ID_SIZE: usize = size_of::<LazyStateID>();
+    let stride = 1 << classes.stride2();
+
+    let sparses = 2 * nfa.len() * NFAStateID::SIZE;
+    let trans = MIN_STATES * stride * ID_SIZE;
+
+    let mut starts = Start::count() * ID_SIZE;
+    if starts_for_each_pattern {
+        starts += (Start::count() * nfa.pattern_len()) * ID_SIZE;
+    }
+
+    // Every `State` has three bytes for flags, 4 bytes (max) for the number
+    // of patterns, followed by 32-bit encodings of patterns and then delta
+    // varint encodings of NFA state IDs.
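+    // (A delta varint encodes each NFA state ID as the difference from the
+    // previous ID in the state, using a variable number of bytes per value;
+    // five bytes is the most a 32-bit value can need in such an encoding.)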
+    // We use the worst case (which isn't technically possible) of 5 bytes
+    // for each NFA state ID.
+    //
+    // HOWEVER, three of the states needed by a lazy DFA are just the sentinel
+    // unknown, dead and quit states. Those states have a known size and it is
+    // small.
+    assert!(MIN_STATES >= 3, "minimum number of states has to be at least 3");
+    let dead_state_size = State::dead().memory_usage();
+    let max_state_size = 3 + 4 + (nfa.pattern_len() * 4) + (nfa.len() * 5);
+    let states = (3 * (size_of::<State>() + dead_state_size))
+        + ((MIN_STATES - 3) * (size_of::<State>() + max_state_size));
+    let states_to_sid = states + (MIN_STATES * ID_SIZE);
+    let stack = nfa.len() * NFAStateID::SIZE;
+    let scratch_state_builder = max_state_size;
+
+    trans
+        + starts
+        + states
+        + states_to_sid
+        + sparses
+        + stack
+        + scratch_state_builder
+}
diff --git a/vendor/regex-automata-0.2.0/src/hybrid/error.rs b/vendor/regex-automata-0.2.0/src/hybrid/error.rs
new file mode 100644
index 000000000..715da39bd
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/hybrid/error.rs
@@ -0,0 +1,130 @@
+use crate::{hybrid::id::LazyStateIDError, nfa};
+
+/// An error that occurs when initial construction of a lazy DFA fails.
+///
+/// A build error can occur when insufficient cache capacity is configured or
+/// if something about the NFA is unsupported. (For example, if one attempts
+/// to build a lazy DFA without heuristic Unicode support but with an NFA that
+/// contains a Unicode word boundary.)
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct BuildError {
+    kind: BuildErrorKind,
+}
+
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+    NFA(nfa::thompson::Error),
+    InsufficientCacheCapacity { minimum: usize, given: usize },
+    InsufficientStateIDCapacity { err: LazyStateIDError },
+    Unsupported(&'static str),
+}
+
+impl BuildError {
+    fn kind(&self) -> &BuildErrorKind {
+        &self.kind
+    }
+
+    pub(crate) fn nfa(err: nfa::thompson::Error) -> BuildError {
+        BuildError { kind: BuildErrorKind::NFA(err) }
+    }
+
+    pub(crate) fn insufficient_cache_capacity(
+        minimum: usize,
+        given: usize,
+    ) -> BuildError {
+        BuildError {
+            kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given },
+        }
+    }
+
+    pub(crate) fn insufficient_state_id_capacity(
+        err: LazyStateIDError,
+    ) -> BuildError {
+        BuildError {
+            kind: BuildErrorKind::InsufficientStateIDCapacity { err },
+        }
+    }
+
+    pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
+        let msg = "cannot build lazy DFAs for regexes with Unicode word \
+                   boundaries; switch to ASCII word boundaries, or \
+                   heuristically enable Unicode word boundaries or use a \
+                   different regex engine";
+        BuildError { kind: BuildErrorKind::Unsupported(msg) }
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self.kind() {
+            BuildErrorKind::NFA(ref err) => Some(err),
+            BuildErrorKind::InsufficientCacheCapacity { .. } => None,
+            // LazyStateIDError is an implementation detail, don't expose it.
+            BuildErrorKind::InsufficientStateIDCapacity { .. } => None,
+            BuildErrorKind::Unsupported(_) => None,
+        }
+    }
+}
+
+impl core::fmt::Display for BuildError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self.kind() {
+            BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
+            BuildErrorKind::InsufficientCacheCapacity { minimum, given } => {
+                write!(
+                    f,
+                    "given cache capacity ({}) is smaller than \
+                     minimum required ({})",
+                    given, minimum,
+                )
+            }
+            BuildErrorKind::InsufficientStateIDCapacity { ref err } => {
+                err.fmt(f)
+            }
+            BuildErrorKind::Unsupported(ref msg) => {
+                write!(f, "unsupported regex feature for DFAs: {}", msg)
+            }
+        }
+    }
+}
+
+/// An error that occurs when cache usage has become inefficient.
+///
+/// One of the weaknesses of a lazy DFA is that it may need to clear its
+/// cache repeatedly if it's not big enough. If this happens too much, then it
+/// can slow searching down significantly. A mitigation to this is to use
+/// heuristics to detect whether the cache is being used efficiently or not.
+/// If not, then a lazy DFA can return a `CacheError`.
+///
+/// The default configuration of a lazy DFA in this crate is
+/// set such that a `CacheError` will never occur. Instead,
+/// callers must opt into this behavior with settings like
+/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count).
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct CacheError(());
+
+impl CacheError {
+    pub(crate) fn too_many_cache_clears() -> CacheError {
+        CacheError(())
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for CacheError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        None
+    }
+}
+
+impl core::fmt::Display for CacheError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "lazy DFA cache has been cleared too many times")
+    }
+}
diff --git a/vendor/regex-automata-0.2.0/src/hybrid/id.rs b/vendor/regex-automata-0.2.0/src/hybrid/id.rs
new file mode 100644
index 000000000..a6fcde52e
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/hybrid/id.rs
@@ -0,0 +1,415 @@
+/// A state identifier especially tailored for lazy DFAs.
+///
+/// A lazy state ID logically represents a pointer to a DFA state. In practice,
+/// by limiting the number of DFA states it can address, it reserves some
+/// bits of its representation to encode some additional information. That
+/// additional information is called a "tag." That tag is used to record
+/// whether the state it points to is an unknown, dead, quit, start or match
+/// state.
+///
+/// When implementing a low level search routine with a lazy DFA, it is
+/// necessary to query the type of the current state to know what to do:
+///
+/// * **Unknown** - The state has not yet been computed. The
+/// parameters used to get this state ID must be re-passed to
+/// [`DFA::next_state`](crate::hybrid::dfa::DFA), which will never return an
+/// unknown state ID.
+/// * **Dead** - A dead state only has transitions to itself. It indicates that
+/// the search cannot do anything else and should stop with whatever result it
+/// has.
+/// * **Quit** - A quit state indicates that the automaton could not answer
+/// whether a match exists or not. Correct search implementations must return a
+/// [`MatchError::Quit`](crate::MatchError::Quit).
+/// * **Start** - A start state is a state at which a search begins.
Branching on this isn't required for +/// correctness, but a common optimization is to use this to more quickly look +/// for a prefix. +/// * **Match** - A match state indicates that a match has been found. +/// Depending on the semantics of your search implementation, it may either +/// continue until the end of the haystack or a dead state, or it might quit +/// and return the match immediately. +/// +/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate +/// can be used to determine if a tag exists at all. This is useful to avoid +/// branching on all of the above types for every byte searched. +/// +/// # Example +/// +/// This example shows how `LazyStateID` can be used to implement a correct +/// search routine with minimal branching. In particular, this search routine +/// implements "leftmost" matching, which means that it doesn't immediately +/// stop once a match is found. Instead, it continues until it reaches a dead +/// state. +/// +/// Notice also how a correct search implementation deals with +/// [`CacheError`](crate::hybrid::CacheError)s returned by some of +/// the lazy DFA routines. When a `CacheError` occurs, it returns +/// [`MatchError::GaveUp`](crate::MatchError::GaveUp). +/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::{Cache, DFA}, +/// HalfMatch, MatchError, PatternID, +/// }; +/// +/// fn find_leftmost_first( +/// dfa: &DFA, +/// cache: &mut Cache, +/// haystack: &[u8], +/// ) -> Result<Option<HalfMatch>, MatchError> { +/// // The start state is determined by inspecting the position and the +/// // initial bytes of the haystack. Note that start states can never +/// // be match states (since DFAs in this crate delay matches by 1 +/// // byte), so we don't need to check if the start state is a match. +/// let mut sid = dfa.start_state_forward( +/// cache, None, haystack, 0, haystack.len(), +/// ).map_err(|_| MatchError::GaveUp { offset: 0 })?; +/// let mut last_match = None; +/// // Walk all the bytes in the haystack. We can quit early if we see +/// // a dead or a quit state. The former means the automaton will +/// // never transition to any other state. The latter means that the +/// // automaton entered a condition in which its search failed. +/// for (i, &b) in haystack.iter().enumerate() { +/// sid = dfa +/// .next_state(cache, sid, b) +/// .map_err(|_| MatchError::GaveUp { offset: i })?; +/// if sid.is_tagged() { +/// if sid.is_match() { +/// last_match = Some(HalfMatch::new( +/// dfa.match_pattern(cache, sid, 0), +/// i, +/// )); +/// } else if sid.is_dead() { +/// return Ok(last_match); +/// } else if sid.is_quit() { +/// // It is possible to enter into a quit state after +/// // observing a match has occurred. In that case, we +/// // should return the match instead of an error. +/// if last_match.is_some() { +/// return Ok(last_match); +/// } +/// return Err(MatchError::Quit { byte: b, offset: i }); +/// } +/// // Implementors may also want to check for start states and +/// // handle them differently for performance reasons. But it is +/// // not necessary for correctness. +/// } +/// } +/// // Matches are always delayed by 1 byte, so we must explicitly walk +/// // the special "EOI" transition at the end of the search. 
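+///     // (Roughly speaking, this delay is what lets the DFA resolve
+///     // end-of-input assertions such as `$`: a match that ends at the
+///     // final byte only becomes visible on this EOI transition.)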
+/// sid = dfa +/// .next_eoi_state(cache, sid) +/// .map_err(|_| MatchError::GaveUp { offset: haystack.len() })?; +/// if sid.is_match() { +/// last_match = Some(HalfMatch::new( +/// dfa.match_pattern(cache, sid, 0), +/// haystack.len(), +/// )); +/// } +/// Ok(last_match) +/// } +/// +/// // We use a greedy '+' operator to show how the search doesn't just stop +/// // once a match is detected. It continues extending the match. Using +/// // '[a-z]+?' would also work as expected and stop the search early. +/// // Greediness is built into the automaton. +/// let dfa = DFA::new(r"[a-z]+")?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 10); +/// +/// // Here's another example that tests our handling of the special +/// // EOI transition. This will fail to find a match if we don't call +/// // 'next_eoi_state' at the end of the search since the match isn't found +/// // until the final byte in the haystack. +/// let dfa = DFA::new(r"[0-9]{4}")?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 15); +/// +/// // And note that our search implementation above automatically works +/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects +/// // the appropriate pattern ID for us. +/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 1); +/// assert_eq!(mat.offset(), 3); +/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 7); +/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 1); +/// assert_eq!(mat.offset(), 5); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +pub struct LazyStateID(u32); + +impl LazyStateID { + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + const MAX_BIT: usize = 31; + + #[cfg(target_pointer_width = "16")] + const MAX_BIT: usize = 15; + + const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT); + const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1); + const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2); + const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3); + const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4); + const MAX: usize = LazyStateID::MASK_MATCH - 1; + + /// Create a new lazy state ID. + /// + /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns + /// an error. + #[inline] + pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> { + if id > LazyStateID::MAX { + return Err(LazyStateIDError { attempted: id as u64 }); + } + Ok(LazyStateID::new_unchecked(id)) + } + + /// Create a new lazy state ID without checking whether the given value + /// exceeds [`LazyStateID::MAX`]. + /// + /// While this is unchecked, providing an incorrect value must never + /// sacrifice memory safety. 
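+    ///
+    /// For instance (a sketch of the cast below, not part of this API), an
+    /// out-of-range value is merely truncated, which can yield a wrong state
+    /// ID but never undefined behavior:
+    ///
+    /// ```
+    /// // `usize::MAX` does not fit in a `u32`, but `as` simply truncates;
+    /// // correctness may suffer, memory safety does not.
+    /// let id: usize = usize::MAX;
+    /// assert_eq!(id as u32, u32::MAX);
+    /// ```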
+ #[inline] + const fn new_unchecked(id: usize) -> LazyStateID { + LazyStateID(id as u32) + } + + /// Return this lazy state ID as its raw value if and only if it is not + /// tagged (and thus not an unknown, dead, quit, start or match state ID). + #[inline] + pub(crate) fn as_usize(&self) -> Option<usize> { + if self.is_tagged() { + None + } else { + Some(self.as_usize_unchecked()) + } + } + + /// Return this lazy state ID as an untagged `usize`. + /// + /// If this lazy state ID is tagged, then the usize returned is the state + /// ID without the tag. If the ID was not tagged, then the usize returned + /// is equivalent to the state ID. + #[inline] + pub(crate) fn as_usize_untagged(&self) -> usize { + self.as_usize_unchecked() & LazyStateID::MAX + } + + /// Return this lazy state ID as its raw internal `usize` value, which may + /// be tagged (and thus greater than LazyStateID::MAX). + #[inline] + pub(crate) const fn as_usize_unchecked(&self) -> usize { + self.0 as usize + } + + #[inline] + pub(crate) const fn to_unknown(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN, + ) + } + + #[inline] + pub(crate) const fn to_dead(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_DEAD, + ) + } + + #[inline] + pub(crate) const fn to_quit(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_QUIT, + ) + } + + /// Return this lazy state ID as a state ID that is tagged as a start + /// state. + #[inline] + pub(crate) const fn to_start(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_START, + ) + } + + /// Return this lazy state ID as a lazy state ID that is tagged as a match + /// state. + #[inline] + pub(crate) const fn to_match(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_MATCH, + ) + } + + /// Return true if and only if this lazy state ID is tagged. + /// + /// When a lazy state ID is tagged, then one can conclude that it is one + /// of a match, start, dead, quit or unknown state. + #[inline] + pub const fn is_tagged(&self) -> bool { + self.as_usize_unchecked() > LazyStateID::MAX + } + + /// Return true if and only if this represents a lazy state ID that is + /// "unknown." That is, the state has not yet been created. When a caller + /// sees this state ID, it generally means that a state has to be computed + /// in order to proceed. + #[inline] + pub const fn is_unknown(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0 + } + + /// Return true if and only if this represents a dead state. A dead state + /// is a state that can never transition to any other state except the + /// dead state. When a dead state is seen, it generally indicates that a + /// search should stop. + #[inline] + pub const fn is_dead(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0 + } + + /// Return true if and only if this represents a quit state. A quit state + /// is a state that is representationally equivalent to a dead state, + /// except it indicates the automaton has reached a point at which it can + /// no longer determine whether a match exists or not. In general, this + /// indicates an error during search and the caller must either pass this + /// error up or use a different search technique. 
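+    ///
+    /// (As a sketch of how these predicates work, not a guarantee of the
+    /// exact layout: each tag is a single high bit of the underlying 32-bit
+    /// integer, so testing for a tag is one mask-and-compare.)
+    ///
+    /// ```
+    /// // Hypothetical layout: if the quit tag were bit 29, the test would
+    /// // look like this.
+    /// const MASK_QUIT: u32 = 1 << 29;
+    /// let id = 5u32 | MASK_QUIT;
+    /// assert!(id & MASK_QUIT > 0);
+    /// ```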
+    #[inline]
+    pub const fn is_quit(&self) -> bool {
+        self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
+    }
+
+    /// Return true if and only if this lazy state ID has been tagged as a
+    /// start state.
+    #[inline]
+    pub const fn is_start(&self) -> bool {
+        self.as_usize_unchecked() & LazyStateID::MASK_START > 0
+    }
+
+    /// Return true if and only if this lazy state ID has been tagged as a
+    /// match state.
+    #[inline]
+    pub const fn is_match(&self) -> bool {
+        self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
+    }
+}
+
+/// This error occurs when a lazy state ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum lazy state ID
+/// value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub(crate) struct LazyStateIDError {
+    attempted: u64,
+}
+
+impl LazyStateIDError {
+    /// Returns the value from which a lazy state ID could not be constructed.
+    pub(crate) fn attempted(&self) -> u64 {
+        self.attempted
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for LazyStateIDError {}
+
+impl core::fmt::Display for LazyStateIDError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "failed to create LazyStateID from {:?}, which exceeds {:?}",
+            self.attempted(),
+            LazyStateID::MAX,
+        )
+    }
+}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides no introspection capabilities. The only thing a caller
+/// can do is construct it and pass it around to permit search routines to use
+/// it to track state.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may produce incorrect results.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+    /// The ID of the state the search was in when the call terminated. When
+    /// this is a match state, `last_match` must be set to a non-None value.
+    ///
+    /// A `None` value indicates the start state of the corresponding
+    /// automaton. We cannot use the actual ID, since any one automaton may
+    /// have many start states, and which one is in use depends on several
+    /// search-time factors.
+    id: Option<LazyStateID>,
+    /// Information associated with a match when `id` corresponds to a match
+    /// state.
+    last_match: Option<StateMatch>,
+}
+
+/// Internal state about the last match that occurred. This records both the
+/// offset of the match and the match index.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) struct StateMatch {
+    /// The index into the matching patterns for the current match state.
+    pub(crate) match_index: usize,
+    /// The offset in the haystack at which the match occurred. This is used
+    /// when reporting multiple matches at the same offset. That is, when
+    /// an overlapping search runs, the first thing it checks is whether it's
+    /// already in a match state, and if so, whether there are more patterns
+    /// to report as matches in that state.
+    /// If so, it increments `match_index` and returns the pattern and this
+    /// offset. Once `match_index` exceeds the number of matching patterns in
+    /// the current state, the search continues.
+    pub(crate) offset: usize,
+}
+
+impl OverlappingState {
+    /// Create a new overlapping state that begins at the start state of any
+    /// automaton.
+    pub fn start() -> OverlappingState {
+        OverlappingState { id: None, last_match: None }
+    }
+
+    pub(crate) fn id(&self) -> Option<LazyStateID> {
+        self.id
+    }
+
+    pub(crate) fn set_id(&mut self, id: LazyStateID) {
+        self.id = Some(id);
+    }
+
+    pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
+        self.last_match.as_mut()
+    }
+
+    pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
+        self.last_match = Some(last_match);
+    }
+}
diff --git a/vendor/regex-automata-0.2.0/src/hybrid/mod.rs b/vendor/regex-automata-0.2.0/src/hybrid/mod.rs
new file mode 100644
index 000000000..4c8ca7ebe
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/hybrid/mod.rs
@@ -0,0 +1,179 @@
+/*!
+A module for building and searching with lazy deterministic finite automata
+(DFAs).
+
+Like other modules in this crate, lazy DFAs support a rich regex syntax with
+Unicode features. The key feature of a lazy DFA is that it builds itself
+incrementally during search, and never uses more than a configured capacity of
+memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache"
+in which the actual DFA's transition table is stored.
+
+If you're looking for fully compiled DFAs, then please see the top-level
+[`dfa` module](crate::dfa).
+
+# Overview
+
+This section gives a brief overview of the primary types in this module:
+
+* A [`regex::Regex`] provides a way to search for matches of a regular
+expression using lazy DFAs. This includes iterating over matches with both the
+start and end positions of each match.
+* A [`dfa::DFA`] provides direct low level access to a lazy DFA.
+
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::{hybrid::regex::Regex, MultiMatch};
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let mut cache = re.create_cache();
+
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> =
+    re.find_leftmost_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(0, 0, 10),
+    MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: searching with regex sets
+
+The lazy DFAs in this module all fully support searching with multiple regexes
+simultaneously. You can use this support with standard leftmost-first style
+searching to find non-overlapping matches:
+
+```
+use regex_automata::{hybrid::regex::Regex, MultiMatch};
+
+let re = Regex::new_many(&[r"\w+", r"\S+"])?;
+let mut cache = re.create_cache();
+
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> =
+    re.find_leftmost_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(1, 0, 4),
+    MultiMatch::must(0, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Or use overlapping style searches to find all possible occurrences:
+
+```
+use regex_automata::{hybrid::{dfa, regex::Regex}, MatchKind, MultiMatch};
+
+// N.B. For overlapping searches, we need the underlying lazy DFA to report all
+// possible matches.
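+// (With leftmost-first semantics, the DFA omits match states for matches
+// that its priority rules could never report, so an overlapping search
+// would silently miss them; `MatchKind::All` keeps every match state.)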
+let re = Regex::builder()
+    .dfa(dfa::Config::new().match_kind(MatchKind::All))
+    .build_many(&[r"\w{3}", r"\S{3}"])?;
+let mut cache = re.create_cache();
+
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> =
+    re.find_overlapping_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(1, 0, 3),
+    MultiMatch::must(0, 1, 4),
+    MultiMatch::must(1, 1, 4),
+    MultiMatch::must(0, 5, 8),
+    MultiMatch::must(1, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# When should I use this?
+
+Generally speaking, if you can abide the use of mutable state during search,
+and you don't need things like capturing groups or Unicode word boundary
+support in non-ASCII text, then a lazy DFA is likely a robust choice with
+respect to both search speed and memory usage. Note however that its speed
+may be worse than a general purpose regex engine if you don't select a good
+[prefilter](crate::util::prefilter).
+
+If you know ahead of time that your pattern would result in a very large DFA
+if it were fully compiled, it may be better to use an NFA simulation instead
+of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA
+to something that is big enough to hold the state machine (likely through
+experimentation). The issue here is that if the cache is too small, then it
+could wind up being reset too frequently and this might decrease searching
+speed significantly.
+
+# Differences with fully compiled DFAs
+
+A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a
+[`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities
+(and similarly for their underlying DFAs), but they achieve them through
+different means. The main difference is that a hybrid or "lazy" regex builds
+its DFA lazily during search, whereas a fully compiled regex will build its
+DFA at construction time. While building a DFA at search time might sound like
+it's slow, it tends to work out well in practice: most bytes seen during a
+search reuse pre-built parts of the DFA, so a lazy DFA can be almost as fast
+as a fully compiled DFA. The main downside is that searching requires mutable
+space to store the DFA, and, in the worst case, a search can result in a new
+state being created for each byte seen, which would make searching quite a bit
+slower.
+
+A fully compiled DFA never has to worry about searches being slower once
+it's built. (Aside from, say, the transition table being so large that it
+is subject to harsh CPU cache effects.) However, of course, building a full
+DFA can be quite time consuming and memory hungry, particularly since it is
+so easy to build large DFAs when Unicode mode is enabled.
+
+A lazy DFA strikes a nice balance _in practice_, particularly in the
+presence of Unicode mode, by only building what is needed. It avoids the
+worst case exponential time complexity of DFA compilation by guaranteeing that
+it will only build at most one state per byte searched. While the worst
+case here can lead to a very high constant, it will never be exponential.
+
+# Syntax
+
+This module supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
+
+There are two things that are not supported by the lazy DFAs in this module:
+
+* Capturing groups.
+The DFAs (and [`Regex`](regex::Regex)es built on top of them) can only find
+the offsets of an entire match, but cannot resolve the offsets of each
+capturing group. This is because DFAs do not have the expressive power
+necessary.
+* Unicode word boundaries. These present particularly difficult challenges for
+DFA construction and would result in an explosion in the number of states.
+One can enable [`dfa::Config::unicode_word_boundary`] though, which provides
+heuristic support for Unicode word boundaries that only works on ASCII text.
+Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
+on any input.
+
+There are no plans to lift either of these limitations.
+
+Note that these restrictions are identical to the restrictions on fully
+compiled DFAs.
+
+# Support for `alloc`-only
+
+This crate comes with `alloc` and `std` features that are enabled by default.
+One can disable the `std` feature and still use the full API of a lazy DFA.
+(You should use `std` when possible, since it permits providing implementations
+of the `std::error::Error` trait, and enables some minor internal
+optimizations.)
+
+This module does require at least the `alloc` feature though. It is not
+available in any capacity without `alloc`.
+*/
+
+pub use self::{
+    error::{BuildError, CacheError},
+    id::{LazyStateID, OverlappingState},
+};
+
+pub mod dfa;
+mod error;
+mod id;
+pub mod regex;
+mod search;
diff --git a/vendor/regex-automata-0.2.0/src/hybrid/regex.rs b/vendor/regex-automata-0.2.0/src/hybrid/regex.rs
new file mode 100644
index 000000000..7cc6b9064
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/hybrid/regex.rs
@@ -0,0 +1,2124 @@
+/*!
+A lazy DFA backed `Regex`.
+
+This module provides a [`Regex`] backed by lazy DFAs. A `Regex` implements
+convenience routines you might have come to expect, such as finding a match
+and iterating over all non-overlapping matches. This `Regex` type is limited
+in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
+capturing groups, for example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, whereas the other is a "reverse" DFA that
+finds the start offset of a match.
+
+See the [parent module](crate::hybrid) for examples.
+*/
+
+use core::borrow::Borrow;
+
+use alloc::boxed::Box;
+
+use crate::{
+    hybrid::{
+        dfa::{self, DFA},
+        error::BuildError,
+        OverlappingState,
+    },
+    nfa::thompson,
+    util::{
+        matchtypes::{MatchError, MatchKind, MultiMatch},
+        prefilter::{self, Prefilter},
+    },
+};
+
+/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
+/// for searching.
+///
+/// A regular expression is composed of two lazy DFAs, a "forward" DFA and a
+/// "reverse" DFA. The forward DFA is responsible for detecting the end of
+/// a match while the reverse DFA is responsible for detecting the start
+/// of a match. Thus, in order to find the bounds of any given match, a
+/// forward search must first be run followed by a reverse search. A match
+/// found by the forward DFA guarantees that the reverse DFA will also find
+/// a match.
+///
+/// A `Regex` can also have a prefilter set via the
+/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is
+/// enabled.
+///
+/// # Earliest vs Leftmost vs Overlapping
+///
+/// The search routines exposed on a `Regex` reflect three different ways
+/// of searching:
+///
+/// * "earliest" means to stop as soon as a match has been detected.
+/// * "leftmost" means to continue matching until the underlying +/// automaton cannot advance. This reflects "standard" searching you +/// might be used to in other regex engines. e.g., This permits +/// non-greedy and greedy searching to work as you would expect. +/// * "overlapping" means to find all possible matches, even if they +/// overlap. +/// +/// Generally speaking, when doing an overlapping search, you'll want to +/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using +/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is +/// likely to lead to odd behavior since `LeftmostFirst` specifically omits +/// some matches that can never be reported due to its semantics. +/// +/// The following example shows the differences between how these different +/// types of searches impact looking for matches of `[a-z]+` in the +/// haystack `abc`. +/// +/// ``` +/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch}; +/// +/// let pattern = r"[a-z]+"; +/// let haystack = "abc".as_bytes(); +/// +/// // With leftmost-first semantics, we test "earliest" and "leftmost". +/// let re = regex::Builder::new() +/// .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst)) +/// .build(pattern)?; +/// let mut cache = re.create_cache(); +/// +/// // "earliest" searching isn't impacted by greediness +/// let mut it = re.find_earliest_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// // "leftmost" searching supports greediness (and non-greediness) +/// let mut it = re.find_leftmost_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// // For overlapping, we want "all" match kind semantics. +/// let re = regex::Builder::new() +/// .dfa(dfa::Config::new().match_kind(MatchKind::All)) +/// .build(pattern)?; +/// let mut cache = re.create_cache(); +/// +/// // In the overlapping search, we find all three possible matches +/// // starting at the beginning of the haystack. +/// let mut it = re.find_overlapping_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Fallibility +/// +/// In non-default configurations, the lazy DFAs generated in this module may +/// return an error during a search. (Currently, the only way this happens is +/// if quit bytes are added, Unicode word boundaries are heuristically enabled, +/// or if the cache is configured to "give up" on a search if it has been +/// cleared too many times. All of these are turned off by default, which means +/// a search can never fail in the default configuration.) For convenience, +/// the main search routines, like [`find_leftmost`](Regex::find_leftmost), +/// will panic if an error occurs. However, if you need to use DFAs which may +/// produce an error at search time, then there are fallible equivalents of +/// all search routines. For example, for `find_leftmost`, its fallible analog +/// is [`try_find_leftmost`](Regex::try_find_leftmost). 
+/// The routines prefixed with `try_` return
+/// `Result<Option<MultiMatch>, MatchError>`, whereas the infallible routines
+/// simply return `Option<MultiMatch>`.
+///
+/// # Example
+///
+/// This example shows how to cause a search to terminate if it sees a
+/// `\n` byte, and handle the error returned. This could be useful if, for
+/// example, you wanted to prevent a user supplied pattern from matching
+/// across a line boundary.
+///
+/// ```
+/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError};
+///
+/// let re = Regex::builder()
+///     .dfa(dfa::Config::new().quit(b'\n', true))
+///     .build(r"foo\p{any}+bar")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = "foo\nbar".as_bytes();
+/// // Normally this would produce a match, since \p{any} contains '\n'.
+/// // But since we instructed the automaton to enter a quit state if a
+/// // '\n' is observed, this produces a match error instead.
+/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err();
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Debug)]
+pub struct Regex {
+    /// An optional prefilter that is passed down to the lazy DFA search
+    /// routines when present. By default, no prefilter is set.
+    pre: Option<Box<dyn Prefilter>>,
+    /// The forward lazy DFA. This can only find the end of a match.
+    forward: DFA,
+    /// The reverse lazy DFA. This can only find the start of a match.
+    ///
+    /// This is built with 'all' match semantics (instead of leftmost-first)
+    /// so that it always finds the longest possible match (which corresponds
+    /// to the leftmost starting position). It is also compiled as an anchored
+    /// matcher and has 'starts_for_each_pattern' enabled. Including starting
+    /// states for each pattern is necessary to ensure that we only look for
+    /// matches of a pattern that matched in the forward direction. Otherwise,
+    /// we might wind up finding the "leftmost" starting position of a totally
+    /// different pattern!
+    reverse: DFA,
+    /// Whether iterators on this type should advance by one codepoint or one
+    /// byte when an empty match is seen.
+    utf8: bool,
+}
+
+/// Convenience routines for regex and cache construction.
+impl Regex {
+    /// Parse the given regular expression using the default configuration and
+    /// return the corresponding regex.
+    ///
+    /// If you want a non-default configuration, then use the [`Builder`] to
+    /// set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 3, 14)),
+    ///     re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
+        Regex::builder().build(pattern)
+    }
+
+    /// Like `new`, but parses multiple patterns into a single "regex set."
+    /// This similarly uses the default regex configuration.
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.find_leftmost_iter( + /// &mut cache, + /// b"abc 1 foo 4567 0 quux", + /// ); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex, BuildError> { + Regex::builder().build_many(patterns) + } + + /// Return a default configuration for a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a regex. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode for `Regex` iteration. + /// When UTF-8 mode is disabled, the position immediately following an + /// empty match is where the next search begins, instead of the next + /// position of a UTF-8 encoded codepoint. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// nfa::thompson, + /// MultiMatch, SyntaxConfig, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .syntax(SyntaxConfig::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(MultiMatch::must(0, 1, 9)); + /// let got = re.find_leftmost(&mut cache, haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this `Regex`. + /// + /// The cache returned should only be used for searches for this + /// `Regex`. 
+    /// If you want to reuse the cache for another `Regex`, then you must
+    /// call [`Cache::reset`] with that `Regex` (or, equivalently,
+    /// [`Regex::reset_cache`]).
+    pub fn create_cache(&self) -> Cache {
+        Cache::new(self)
+    }
+
+    /// Reset the given cache such that it can be used for searching with
+    /// this `Regex` (and only this `Regex`).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different `Regex`.
+    ///
+    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+    /// `Regex` has been configured to "give up" after it has cleared the cache
+    /// a certain number of times.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different `Regex`.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re1 = Regex::new(r"\w")?;
+    /// let re2 = Regex::new(r"\W")?;
+    ///
+    /// let mut cache = re1.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 2)),
+    ///     re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+    /// );
+    ///
+    /// // Using 'cache' with re2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the Regex we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 're1' is also not
+    /// // allowed.
+    /// re2.reset_cache(&mut cache);
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re2.find_leftmost(&mut cache, "☃".as_bytes()),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset_cache(&self, cache: &mut Cache) {
+        self.forward().reset_cache(&mut cache.forward);
+        self.reverse().reset_cache(&mut cache.reverse);
+    }
+}
+
+/// Standard infallible search routines for finding and iterating over matches.
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short-circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_is_match`](Regex::try_is_match).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::hybrid::regex::Regex;
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar"));
+    /// assert_eq!(false, re.is_match(&mut cache, b"foobar"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool {
+        self.try_is_match(cache, haystack).unwrap()
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+ /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_earliest`](Regex::try_find_earliest). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// // Normally, the leftmost first match would greedily consume as many + /// // decimal digits as it could. But a match is detected as soon as one + /// // digit is seen. + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 4)), + /// re.find_earliest(&mut cache, b"foo12345"), + /// ); + /// + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the "earliest" match semantics detect a match earlier. + /// let re = Regex::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 1)), + /// re.find_earliest(&mut cache, b"abc"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_earliest( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Option<MultiMatch> { + self.try_find_earliest(cache, haystack).unwrap() + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost`](Regex::try_find_leftmost). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// // Greediness is applied appropriately when compared to find_earliest. + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 11)), + /// re.find_leftmost(&mut cache, b"zzzfoo12345zzz"), + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the default leftmost-first match semantics demand that we find the + /// // earliest match that prefers earlier parts of the pattern over latter + /// // parts. + /// let re = Regex::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 3)), + /// re.find_leftmost(&mut cache, b"abc"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_leftmost( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Option<MultiMatch> { + self.try_find_leftmost(cache, haystack).unwrap() + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. 
+ /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping`](Regex::try_find_overlapping). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex, OverlappingState}, + /// MatchKind, + /// MultiMatch, + /// }; + /// + /// let re = Regex::builder() + /// .dfa(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(MultiMatch::must(1, 0, 4)); + /// let got = re.find_overlapping(&mut cache, haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(MultiMatch::must(0, 1, 4)); + /// let got = re.find_overlapping(&mut cache, haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_overlapping( + &self, + cache: &mut Cache, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Option<MultiMatch> { + self.try_find_overlapping(cache, haystack, state).unwrap() + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). + /// + /// # Example + /// + /// This example shows how to run an "earliest" iterator. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::new("[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let haystack = "123".as_bytes(); + /// + /// // Normally, a standard leftmost iterator would return a single + /// // match, but since "earliest" detects matches earlier, we get + /// // three matches. 
+ /// let mut it = re.find_earliest_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_earliest_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindEarliestMatches<'r, 'c, 't> { + FindEarliestMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = b"foo1 foo12 foo123"; + /// let matches: Vec<MultiMatch> = re + /// .find_leftmost_iter(&mut cache, text) + /// .collect(); + /// assert_eq!(matches, vec![ + /// MultiMatch::must(0, 0, 4), + /// MultiMatch::must(0, 5, 10), + /// MultiMatch::must(0, 11, 17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_leftmost_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. 
+ /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex}, + /// MatchKind, + /// MultiMatch, + /// }; + /// + /// let re = Regex::builder() + /// .dfa(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = re.create_cache(); + /// let haystack = "@foo".as_bytes(); + /// + /// let mut it = re.find_overlapping_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_overlapping_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindOverlappingMatches<'r, 'c, 't> { + FindOverlappingMatches::new(self, cache, haystack) + } +} + +/// Lower level infallible search routines that permit controlling where +/// the search starts and ends in a particular sequence. This is useful for +/// executing searches that need to take surrounding context into account. This +/// is required for correctly implementing iteration because of look-around +/// operators (`^`, `$`, `\b`). +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_is_match_at`](Regex::try_is_match_at). + pub fn is_match_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> bool { + self.try_is_match_at(cache, haystack, start, end).unwrap() + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). 
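+ ///
+ /// For example, a quick sketch (using an ASCII word boundary, since the
+ /// lazy DFA supports that in its default configuration):
+ ///
+ /// ```
+ /// use regex_automata::hybrid::regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?-u:\b)foo(?-u:\b)")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // Searching the subslice b"foo" on its own would match, but the "at"
+ /// // search sees that position 3 falls between two word characters and
+ /// // correctly reports no match.
+ /// assert_eq!(None, re.find_earliest_at(&mut cache, b"barfoo", 3, 6));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```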
+ /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_at`](Regex::try_find_earliest_at). + pub fn find_earliest_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option<MultiMatch> { + self.try_find_earliest_at(cache, haystack, start, end).unwrap() + } + + /// Returns the same as `find_leftmost`, but starts the search at the given + /// offset. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches within the + /// same haystack, which cannot be done correctly by simply providing a + /// subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). + pub fn find_leftmost_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option<MultiMatch> { + self.try_find_leftmost_at(cache, haystack, start, end).unwrap() + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. 
+ /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). + pub fn find_overlapping_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Option<MultiMatch> { + self.try_find_overlapping_at(cache, haystack, start, end, state) + .unwrap() + } +} + +/// Fallible search routines. These may return an error when the underlying +/// lazy DFAs have been configured in a way that permits them to fail during a +/// search. +/// +/// Errors during search only occur when the lazy DFA has been explicitly +/// configured to do so, usually by specifying one or more "quit" bytes or by +/// heuristically enabling Unicode word boundaries. +/// +/// Errors will never be returned using the default configuration. So these +/// fallible routines are only needed for particular configurations. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match`](Regex::is_match). + pub fn try_is_match( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<bool, MatchError> { + self.try_is_match_at(cache, haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest`](Regex::find_earliest). + pub fn try_find_earliest( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_earliest_at(cache, haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. 
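+ ///
+ /// As a quick sketch (default configuration, so no search error is
+ /// expected; the offsets mirror the `find_leftmost` example above):
+ ///
+ /// ```
+ /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(
+ /// Some(MultiMatch::must(0, 3, 11)),
+ /// re.try_find_leftmost(&mut cache, b"zzzfoo12345zzz")?,
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```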
+ /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost`](Regex::find_leftmost). + pub fn try_find_leftmost( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_leftmost_at(cache, haystack, 0, haystack.len()) + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping`](Regex::find_overlapping). + pub fn try_find_overlapping( + &self, + cache: &mut Cache, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state) + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_iter`](Regex::find_earliest_iter). + pub fn try_find_earliest_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindEarliestMatches<'r, 'c, 't> { + TryFindEarliestMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. 
+ /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_iter`](Regex::find_leftmost_iter). + pub fn try_find_leftmost_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 'c, 't> { + TryFindLeftmostMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_iter`](Regex::find_overlapping_iter). + pub fn try_find_overlapping_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 'c, 't> { + TryFindOverlappingMatches::new(self, cache, haystack) + } +} + +/// Lower level fallible search routines that permit controlling where the +/// search starts and ends in a particular sequence. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match_at`](Regex::is_match_at). + pub fn try_is_match_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<bool, MatchError> { + self.forward() + .find_leftmost_fwd_at( + &mut cache.forward, + self.scanner().as_mut(), + None, + haystack, + start, + end, + ) + .map(|x| x.is_some()) + } + + /// Returns the first position at which a match is found. 
+ /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_at`](Regex::find_earliest_at). + pub fn try_find_earliest_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_earliest_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + ) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_at`](Regex::find_leftmost_at). + pub fn try_find_leftmost_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_leftmost_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + ) + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. 
+ /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_at`](Regex::find_overlapping_at). + pub fn try_find_overlapping_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_overlapping_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + state, + ) + } +} + +impl Regex { + #[inline(always)] + fn try_find_earliest_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa + .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // earliest case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = rdfa + .find_earliest_rev_at(rcache, None, haystack, start, end.offset())? 
+ .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + #[inline(always)] + fn try_find_leftmost_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa + .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = rdfa + .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + #[inline(always)] + fn try_find_overlapping_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa.find_overlapping_fwd_at( + fcache, pre, None, haystack, start, end, state, + )? { + None => return Ok(None), + Some(end) => end, + }; + // Unlike the leftmost cases, the reverse overlapping search may match + // a different pattern than the forward search. See test failures when + // using `None` instead of `Some(end.pattern())` below. Thus, we must + // run our reverse search using the pattern that matched in the forward + // direction. + let start = rdfa + .find_leftmost_rev_at( + rcache, + Some(end.pattern()), + haystack, + 0, + end.offset(), + )? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl Regex { + /// Return the underlying lazy DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn forward(&self) -> &DFA { + &self.forward + } + + /// Return the underlying lazy DFA responsible for reverse matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. 
+ pub fn reverse(&self) -> &DFA { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_count()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_count(&self) -> usize { + assert_eq!( + self.forward().pattern_count(), + self.reverse().pattern_count() + ); + self.forward().pattern_count() + } + + /// Convenience function for returning this regex's prefilter as a trait + /// object. + /// + /// If this regex doesn't have a prefilter, then `None` is returned. + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + self.pre.as_ref().map(|x| &**x) + } + + /// Attach the given prefilter to this regex. + pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) { + self.pre = pre; + } + + /// Convenience function for returning a prefilter scanner. + fn scanner(&self) -> Option<prefilter::Scanner> { + self.prefilter().map(prefilter::Scanner::new) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>); + +impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindEarliestMatches<'r, 'c, 't> { + FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>); + +impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all overlapping matches for a particular infallible +/// search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. 
+/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindOverlappingMatches<'r, 'c, 't>( + TryFindOverlappingMatches<'r, 'c, 't>, +); + +impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindOverlappingMatches<'r, 'c, 't> { + FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct TryFindEarliestMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindEarliestMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindEarliestMatches { + re, + cache, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_earliest_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. 
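+///
+/// # Example
+///
+/// A quick sketch of draining the fallible iterator into a `Result`; with
+/// the default configuration, no error is expected:
+///
+/// ```
+/// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+///
+/// let re = Regex::new("[a-z]+")?;
+/// let mut cache = re.create_cache();
+/// let matches = re
+/// .try_find_leftmost_iter(&mut cache, b"abc 123 xyz")
+/// .collect::<Result<Vec<MultiMatch>, _>>()?;
+/// assert_eq!(matches, vec![
+/// MultiMatch::must(0, 0, 3),
+/// MultiMatch::must(0, 8, 11),
+/// ]);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```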
+#[derive(Debug)] +pub struct TryFindLeftmostMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindLeftmostMatches { + re, + cache, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_leftmost_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all overlapping matches for a particular fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct TryFindOverlappingMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + state: OverlappingState, +} + +impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindOverlappingMatches { + re, + cache, + scanner, + text, + last_end: 0, + state: OverlappingState::start(), + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_overlapping_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + &mut self.state, + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + // Unlike the non-overlapping case, we're OK with empty matches at this + // level. In particular, the overlapping search algorithm is itself + // responsible for ensuring that progress is always made. + self.last_end = m.end(); + Some(Ok(m)) + } +} + +/// A cache represents a partially computed forward and reverse DFA. +/// +/// A cache is the key component that differentiates a classical DFA and a +/// hybrid NFA/DFA (also called a "lazy DFA"). 
Where a classical DFA builds a +/// complete transition table that can handle all possible inputs, a hybrid +/// NFA/DFA starts with an empty transition table and builds only the parts +/// required during search. The parts that are built are stored in a cache. For +/// this reason, a cache is a required parameter for nearly every operation on +/// a [`Regex`]. +/// +/// Caches can be created from their corresponding `Regex` via +/// [`Regex::create_cache`]. A cache can only be used with either the `Regex` +/// that created it, or the `Regex` that was most recently used to reset it +/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in +/// panics or incorrect results. +#[derive(Debug, Clone)] +pub struct Cache { + forward: dfa::Cache, + reverse: dfa::Cache, +} + +impl Cache { + /// Create a new cache for the given `Regex`. + /// + /// The cache returned should only be used for searches for the given + /// `Regex`. If you want to reuse the cache for another `Regex`, then you + /// must call [`Cache::reset`] with that `Regex`. + pub fn new(re: &Regex) -> Cache { + let forward = dfa::Cache::new(re.forward()); + let reverse = dfa::Cache::new(re.reverse()); + Cache { forward, reverse } + } + + /// Reset this cache such that it can be used for searching with the given + /// `Regex` (and only that `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 2)), + /// re1.find_leftmost(&mut cache, "Δ".as_bytes()), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 3)), + /// re2.find_leftmost(&mut cache, "☃".as_bytes()), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &Regex) { + self.forward.reset(re.forward()); + self.reverse.reset(re.reverse()); + } + + /// Returns the heap memory usage, in bytes, as a sum of the forward and + /// reverse lazy DFA caches. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.forward.memory_usage() + self.reverse.memory_usage() + } + + /// Return references to the forward and reverse caches, respectively. + pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) { + (&self.forward, &self.reverse) + } + + /// Return mutable references to the forward and reverse caches, + /// respectively. + pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) { + (&mut self.forward, &mut self.reverse) + } +} + +/// The configuration used for compiling a hybrid NFA/DFA regex. 
+/// +/// A regex configuration is a simple data object that is typically used with +/// [`Builder::configure`]. +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + utf8: Option<bool>, +} + +impl Config { + /// Return a new default regex compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Whether to enable UTF-8 mode or not. + /// + /// When UTF-8 mode is enabled (the default) and an empty match is seen, + /// the iterators on [`Regex`] will always start the next search at the + /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 + /// mode is disabled, such searches are begun at the next byte offset. + /// + /// If this mode is enabled and invalid UTF-8 is given to search, then + /// behavior is unspecified. + /// + /// Generally speaking, one should enable this when + /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) + /// and + /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) + /// are enabled, and disable it otherwise. + /// + /// # Example + /// + /// This example demonstrates the differences between when this option is + /// enabled and disabled. The differences only arise when the regex can + /// return matches of length zero. + /// + /// In this first snippet, we show the results when UTF-8 mode is disabled. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And in this snippet, we execute the same search on the same haystack, + /// but with UTF-8 mode enabled. Notice that byte offsets that would + /// otherwise split the encoding of `☃` are not returned. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(true)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + /// Returns true if and only if this configuration has UTF-8 mode enabled. + /// + /// When UTF-8 mode is enabled and an empty match is seen, the iterators on + /// [`Regex`] will always start the next search at the next UTF-8 encoded + /// codepoint. When UTF-8 mode is disabled, such searches are begun at the + /// next byte offset. 
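+ ///
+ /// # Example
+ ///
+ /// A small sketch of the default and an explicit override:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::regex::Regex;
+ ///
+ /// // UTF-8 mode is enabled by default.
+ /// assert!(Regex::config().get_utf8());
+ /// assert!(!Regex::config().utf8(false).get_utf8());
+ /// ```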
+ pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { utf8: o.utf8.or(self.utf8) } + } +} + +/// A builder for a regex based on a hybrid NFA/DFA. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the lazy DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder +/// in that it permits fine grain configuration of the construction process. +/// The trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are three +/// different UTF-8 modes: +/// +/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the +/// pattern itself can contain sub-expressions that match invalid UTF-8. +/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) +/// controls whether the implicit unanchored prefix added to the NFA can +/// match through invalid UTF-8 or not. +/// * [`Config::utf8`] controls how the regex iterators themselves advance +/// the starting position of the next search when a match with zero length is +/// found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// Internally, building a regex requires building two hybrid NFA/DFAs, +/// where one is responsible for finding the end of a match and the other is +/// responsible for finding the start of a match. If you only need to detect +/// whether something matched, or only the end of a match, then you should use +/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper +/// than building two of them. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax, the NFA and +/// the regex itself. This is generally what you want for matching on +/// arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// }; +/// +/// let re = Regex::builder() +/// .configure(Regex::config().utf8(false)) +/// .syntax(SyntaxConfig::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(MultiMatch::must(0, 1, 9)); +/// let got = re.find_leftmost(&mut cache, haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. Notice also that the +/// // search was unanchored and skipped over invalid UTF-8. +/// // Disabling UTF-8 on the Thompson NFA permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. 
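+/// //
+/// // Also note: the matched range below includes the raw \xFF byte, so
+/// // the match itself spans invalid UTF-8.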
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + dfa: dfa::Builder, +} + +impl Builder { + /// Create a new regex builder with the default configuration. + pub fn new() -> Builder { + Builder { config: Config::default(), dfa: DFA::builder() } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a regex from the given patterns. + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<Regex, BuildError> { + let forward = self.dfa.build_many(patterns)?; + let reverse = self + .dfa + .clone() + .configure( + DFA::config() + .anchored(true) + .match_kind(MatchKind::All) + .starts_for_each_pattern(true), + ) + .thompson(thompson::Config::new().reverse(true)) + .build_many(patterns)?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a regex from its component forward and reverse hybrid NFA/DFAs. + fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { + // The congruous method on DFA-backed regexes is exposed, but it's + // not clear this builder is useful here since lazy DFAs can't be + // serialized and there is only one type of them. + let utf8 = self.config.get_utf8(); + Regex { pre: None, forward, reverse, utf8 } + } + + /// Apply the given regex configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the lazy DFA compilation configuration for this builder using + /// [`dfa::Config`](dfa::Config). + /// + /// This permits setting things like whether Unicode word boundaries should + /// be heuristically supported or settings how the behavior of the cache. 
+ pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +#[inline(always)] +fn next_unwrap( + item: Option<Result<MultiMatch, MatchError>>, +) -> Option<MultiMatch> { + match item { + None => None, + Some(Ok(m)) => Some(m), + Some(Err(err)) => panic!( + "unexpected regex search error: {}\n\ + to handle search errors, use try_ methods", + err, + ), + } +} diff --git a/vendor/regex-automata-0.2.0/src/hybrid/search.rs b/vendor/regex-automata-0.2.0/src/hybrid/search.rs new file mode 100644 index 000000000..92760cee2 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/hybrid/search.rs @@ -0,0 +1,663 @@ +use crate::{ + hybrid::{ + dfa::{Cache, DFA}, + id::{LazyStateID, OverlappingState, StateMatch}, + }, + nfa::thompson, + util::{ + id::PatternID, + matchtypes::{HalfMatch, MatchError}, + prefilter, MATCH_OFFSET, + }, +}; + +#[inline(never)] +pub(crate) fn find_earliest_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. + if pre.is_some() && pattern_id.is_none() { + find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end) + } else { + find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end) + } +} + +#[inline(never)] +pub(crate) fn find_leftmost_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. + if pre.is_some() && pattern_id.is_none() { + find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end) + } else { + find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end) + } +} + +#[inline(always)] +fn find_fwd( + mut pre: Option<&mut prefilter::Scanner>, + earliest: bool, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + haystack: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + assert!(start <= end); + assert!(start <= haystack.len()); + assert!(end <= haystack.len()); + + // Why do this? This lets 'bytes[at]' work without bounds checks below. + // It seems the assert on 'end <= haystack.len()' above is otherwise + // not enough. Why not just make 'bytes' scoped this way anyway? Well, + // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' + // for resolving look-ahead. + let bytes = &haystack[..end]; + + let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?; + let mut last_match = None; + let mut at = start; + if let Some(ref mut pre) = pre { + // If a prefilter doesn't report false positives, then we don't need to + // touch the DFA at all. However, since all matches include the pattern + // ID, and the prefilter infrastructure doesn't report pattern IDs, we + // limit this optimization to cases where there is exactly one pattern. + // In that case, any match must be the 0th pattern. 
+        if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
+            return Ok(pre.next_candidate(bytes, at).into_option().map(
+                |offset| HalfMatch { pattern: PatternID::ZERO, offset },
+            ));
+        } else if pre.is_effective(at) {
+            match pre.next_candidate(bytes, at).into_option() {
+                None => return Ok(None),
+                Some(i) => {
+                    at = i;
+                }
+            }
+        }
+    }
+    while at < end {
+        if sid.is_tagged() {
+            sid = dfa
+                .next_state(cache, sid, bytes[at])
+                .map_err(|_| gave_up(at))?;
+            at += 1;
+        } else {
+            // SAFETY: There are two safety invariants we need to uphold
+            // here in the loop below: that 'sid' is a valid state ID for
+            // this DFA, and that 'at' is a valid index into 'bytes'. For
+            // the former, we rely on the invariant that next_state* and
+            // start_state_forward always return a valid state ID (given a
+            // valid state ID in the former case), and that we are only at
+            // this place in the code if 'sid' is untagged. Moreover, every
+            // call to next_state_untagged_unchecked below is guarded by a
+            // check that sid is untagged. For the latter safety invariant,
+            // we always guard unchecked access with a check that 'at' is
+            // less than 'end', where 'end == bytes.len()'.
+            //
+            // For justification, this gives us a ~10% bump in search time.
+            // This was used for a benchmark:
+            //
+            //     regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
+            //
+            // With bounds checked: ~881.4ms. Without: ~775ms. For input, I
+            // used OpenSubtitles2018.raw.sample.medium.en.
+            let mut prev_sid = sid;
+            while at < end {
+                prev_sid = sid;
+                sid = unsafe {
+                    dfa.next_state_untagged_unchecked(
+                        cache,
+                        sid,
+                        *bytes.get_unchecked(at),
+                    )
+                };
+                at += 1;
+                if sid.is_tagged() {
+                    break;
+                }
+                // SAFETY: we make four unguarded accesses to 'bytes[at]'
+                // below, and each is safe because we know that 'at + 4' is
+                // in bounds. Moreover, while we don't check whether 'sid' is
+                // untagged directly, we know it is because of the check
+                // above. And the unrolled loop below quits when the next
+                // state is not equal to the previous state.
+                //
+                // PERF: For justification for eliminating bounds checks,
+                // see above. For justification for the unrolling, we use
+                // two tests. The one above with regex '(?m)^.+$', and also
+                // '(?m)^.{40}$'. The former is kinda the best case for
+                // unrolling, and gives a 1.67x boost primarily because the
+                // DFA spends most of its time munching through the input in
+                // the same state. But the latter pattern rarely spends time
+                // in the same state through subsequent transitions, so
+                // unrolling is pretty much always ineffective in that it
+                // craps out on the first 'sid != next' check below. However,
+                // without unrolling, search is only 1.03 times faster than
+                // with unrolling on the latter pattern, which we deem to be
+                // an acceptable loss in favor of optimizing the more common
+                // case of having a "hot" state somewhere in the DFA.
+                while at + 4 < end {
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                }
+            }
+            if sid.is_unknown() {
+                sid = dfa
+                    .next_state(cache, prev_sid, bytes[at - 1])
+                    .map_err(|_| gave_up(at - 1))?;
+            }
+        }
+        if sid.is_tagged() {
+            if sid.is_start() {
+                if let Some(ref mut pre) = pre {
+                    if pre.is_effective(at) {
+                        match pre.next_candidate(bytes, at).into_option() {
+                            None => return Ok(None),
+                            Some(i) => {
+                                at = i;
+                            }
+                        }
+                    }
+                }
+            } else if sid.is_match() {
+                last_match = Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, sid, 0),
+                    offset: at - MATCH_OFFSET,
+                });
+                if earliest {
+                    return Ok(last_match);
+                }
+            } else if sid.is_dead() {
+                return Ok(last_match);
+            } else if sid.is_quit() {
+                if last_match.is_some() {
+                    return Ok(last_match);
+                }
+                let offset = at - 1;
+                return Err(MatchError::Quit { byte: bytes[offset], offset });
+            } else {
+                debug_assert!(sid.is_unknown());
+                unreachable!("sid being unknown is a bug");
+            }
+        }
+    }
+    // We are careful to use 'haystack' here, which contains the full context
+    // that we might want to inspect.
+    Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
+}
+
+#[inline(never)]
+pub(crate) fn find_earliest_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    find_rev(true, dfa, cache, pattern_id, bytes, start, end)
+}
+
+#[inline(never)]
+pub(crate) fn find_leftmost_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    find_rev(false, dfa, cache, pattern_id, bytes, start, end)
+}
+
+#[inline(always)]
+fn find_rev(
+    earliest: bool,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    haystack: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= haystack.len());
+    assert!(end <= haystack.len());
+
+    // Why do this? This lets 'bytes[at]' work without bounds checks below.
+    // It seems the asserts above are otherwise not enough. Why not just make
+    // 'bytes' scoped this way anyway? Well, 'eoi_rev' (below) might actually
+    // want to try to access the byte at 'start - 1' for resolving
+    // look-behind.
+    let bytes = &haystack[start..];
+
+    let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
+    let mut last_match = None;
+    let mut at = end - start;
+    while at > 0 {
+        if sid.is_tagged() {
+            at -= 1;
+            sid = dfa
+                .next_state(cache, sid, bytes[at])
+                .map_err(|_| gave_up(at))?;
+        } else {
+            // SAFETY: See comments in 'find_fwd' for both a safety argument
+            // and a justification from a performance perspective as to 1)
+            // why we elide bounds checks and 2) why we do a specialized
+            // version of unrolling below.
+ let mut prev_sid = sid; + while at > 0 && !sid.is_tagged() { + prev_sid = sid; + at -= 1; + while at > 3 { + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at -= 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at -= 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at -= 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at -= 1; + } + sid = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + } + if sid.is_unknown() { + sid = dfa + .next_state(cache, prev_sid, bytes[at]) + .map_err(|_| gave_up(at))?; + } + } + if sid.is_tagged() { + if sid.is_start() { + continue; + } else if sid.is_match() { + last_match = Some(HalfMatch { + pattern: dfa.match_pattern(cache, sid, 0), + offset: start + at + MATCH_OFFSET, + }); + if earliest { + return Ok(last_match); + } + } else if sid.is_dead() { + return Ok(last_match); + } else { + debug_assert!(sid.is_quit()); + if last_match.is_some() { + return Ok(last_match); + } + return Err(MatchError::Quit { byte: bytes[at], offset: at }); + } + } + } + Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match)) +} + +#[inline(never)] +pub(crate) fn find_overlapping_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + caller_state: &mut OverlappingState, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should only ever + // use a prefilter when no pattern ID is given. + if pre.is_some() && pattern_id.is_none() { + find_overlapping_fwd_imp( + pre, + dfa, + cache, + pattern_id, + bytes, + start, + end, + caller_state, + ) + } else { + find_overlapping_fwd_imp( + None, + dfa, + cache, + pattern_id, + bytes, + start, + end, + caller_state, + ) + } +} + +#[inline(always)] +fn find_overlapping_fwd_imp( + mut pre: Option<&mut prefilter::Scanner>, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + mut start: usize, + end: usize, + caller_state: &mut OverlappingState, +) -> Result<Option<HalfMatch>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let mut sid = match caller_state.id() { + None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?, + Some(sid) => { + if let Some(last) = caller_state.last_match() { + let match_count = dfa.match_count(cache, sid); + if last.match_index < match_count { + let m = HalfMatch { + pattern: dfa.match_pattern( + cache, + sid, + last.match_index, + ), + offset: last.offset, + }; + last.match_index += 1; + return Ok(Some(m)); + } + } + + // This is a subtle but critical detail. If the caller provides a + // non-None state ID, then it must be the case that the state ID + // corresponds to one set by this function. The state ID therefore + // corresponds to a match state, a dead state or some other state. + // However, "some other" state _only_ occurs when the input has + // been exhausted because the only way to stop before then is to + // see a match or a dead/quit state. 
+            //
+            // If the input is exhausted or if it's a dead state, then
+            // incrementing the starting position has no bearing on
+            // correctness, since the loop below will either not execute
+            // at all or will immediately stop due to being in a dead state.
+            // (Once in a dead state it is impossible to leave it.)
+            //
+            // Therefore, the only case we need to consider is when
+            // caller_state is a match state. In this case, since our machines
+            // support the ability to delay a match by a certain number of
+            // bytes (to support look-around), it follows that we actually
+            // consumed that many additional bytes on our previous search. When
+            // the caller resumes their search to find subsequent matches, they
+            // will use the ending location from the previous match as the next
+            // starting point, which is `MATCH_OFFSET` bytes PRIOR to where
+            // we scanned to on the previous search. Therefore, we need to
+            // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
+            //
+            // Incidentally, since MATCH_OFFSET is non-zero, this also makes
+            // dealing with empty matches convenient. Namely, callers needn't
+            // special case them when implementing an iterator. Instead, this
+            // ensures that forward progress is always made.
+            start += MATCH_OFFSET;
+            sid
+        }
+    };
+
+    let mut at = start;
+    while at < end {
+        let byte = bytes[at];
+        sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
+        at += 1;
+        if sid.is_tagged() {
+            caller_state.set_id(sid);
+            if sid.is_start() {
+                if let Some(ref mut pre) = pre {
+                    if pre.is_effective(at) {
+                        match pre.next_candidate(bytes, at).into_option() {
+                            None => return Ok(None),
+                            Some(i) => {
+                                at = i;
+                            }
+                        }
+                    }
+                }
+            } else if sid.is_match() {
+                let offset = at - MATCH_OFFSET;
+                caller_state
+                    .set_last_match(StateMatch { match_index: 1, offset });
+                return Ok(Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, sid, 0),
+                    offset,
+                }));
+            } else if sid.is_dead() {
+                return Ok(None);
+            } else {
+                debug_assert!(sid.is_quit());
+                return Err(MatchError::Quit { byte, offset: at - 1 });
+            }
+        }
+    }
+
+    let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
+    caller_state.set_id(sid);
+    if let Ok(Some(ref last_match)) = result {
+        caller_state.set_last_match(StateMatch {
+            // '1' is always correct here since if we get to this point, this
+            // always corresponds to the first (index '0') match discovered at
+            // this position. So the next match to report at this position (if
+            // it exists) is at index '1'.
+            match_index: 1,
+            offset: last_match.offset(),
+        });
+    }
+    result
+}
+
+#[inline(always)]
+fn init_fwd(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<LazyStateID, MatchError> {
+    let sid = dfa
+        .start_state_forward(cache, pattern_id, bytes, start, end)
+        .map_err(|_| gave_up(start))?;
+    // Start states can never be match states, since all matches are delayed
+    // by 1 byte.
+    assert!(!sid.is_match());
+    Ok(sid)
+}
+
+#[inline(always)]
+fn init_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<LazyStateID, MatchError> {
+    let sid = dfa
+        .start_state_reverse(cache, pattern_id, bytes, start, end)
+        .map_err(|_| gave_up(end))?;
+    // Start states can never be match states, since all matches are delayed
+    // by 1 byte.
+    assert!(!sid.is_match());
+    Ok(sid)
+}
+
+#[inline(always)]
+fn eoi_fwd(
+    dfa: &DFA,
+    cache: &mut Cache,
+    bytes: &[u8],
+    end: usize,
+    sid: &mut LazyStateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+    match bytes.get(end) {
+        Some(&b) => {
+            *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
+            if sid.is_match() {
+                Ok(Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, *sid, 0),
+                    offset: end,
+                }))
+            } else {
+                Ok(None)
+            }
+        }
+        None => {
+            *sid = dfa
+                .next_eoi_state(cache, *sid)
+                .map_err(|_| gave_up(bytes.len()))?;
+            if sid.is_match() {
+                Ok(Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, *sid, 0),
+                    offset: bytes.len(),
+                }))
+            } else {
+                Ok(None)
+            }
+        }
+    }
+}
+
+#[inline(always)]
+fn eoi_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    bytes: &[u8],
+    start: usize,
+    state: LazyStateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+    if start > 0 {
+        let sid = dfa
+            .next_state(cache, state, bytes[start - 1])
+            .map_err(|_| gave_up(start))?;
+        if sid.is_match() {
+            Ok(Some(HalfMatch {
+                pattern: dfa.match_pattern(cache, sid, 0),
+                offset: start,
+            }))
+        } else {
+            Ok(None)
+        }
+    } else {
+        let sid =
+            dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?;
+        if sid.is_match() {
+            Ok(Some(HalfMatch {
+                pattern: dfa.match_pattern(cache, sid, 0),
+                offset: 0,
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+/// A convenience routine for constructing a "gave up" match error.
+#[inline(always)]
+fn gave_up(offset: usize) -> MatchError {
+    MatchError::GaveUp { offset }
+}
diff --git a/vendor/regex-automata-0.2.0/src/lib.rs b/vendor/regex-automata-0.2.0/src/lib.rs
new file mode 100644
index 000000000..d9d7ada48
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/lib.rs
@@ -0,0 +1,47 @@
+/*!
+This crate provides an "expert" API for executing regular expressions using
+finite automata.
+
+**WARNING**: This `0.2` release of `regex-automata` was published before it
+was ready, in order to unblock work elsewhere that needed some of its new
+APIs. At the time of writing, it is strongly preferred that you continue
+using the
+[`regex-automata 0.1`](https://docs.rs/regex-automata/0.1/regex_automata/)
+release. Since this release represents an unfinished state, please do not
+create issues for this release unless it's for a critical bug.
+*/
+
+#![allow(warnings)]
+// #![deny(missing_docs)]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+#[cfg(not(any(
+    target_pointer_width = "16",
+    target_pointer_width = "32",
+    target_pointer_width = "64"
+)))]
+compile_error!("regex-automata is only supported on 16, 32 or 64 bit targets");
+
+#[cfg(feature = "alloc")]
+extern crate alloc;
+
+#[doc(inline)]
+pub use crate::util::id::PatternID;
+#[cfg(feature = "alloc")]
+pub use crate::util::syntax::SyntaxConfig;
+pub use crate::util::{
+    bytes::{DeserializeError, SerializeError},
+    matchtypes::{HalfMatch, Match, MatchError, MatchKind, MultiMatch},
+};
+
+#[macro_use]
+mod macros;
+
+pub mod dfa;
+#[cfg(feature = "alloc")]
+pub mod hybrid;
+#[doc(hidden)]
+#[cfg(feature = "alloc")]
+pub mod nfa;
+#[doc(hidden)]
+pub mod util;
diff --git a/vendor/regex-automata-0.2.0/src/macros.rs b/vendor/regex-automata-0.2.0/src/macros.rs
new file mode 100644
index 000000000..649ba17c5
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/macros.rs
@@ -0,0 +1,30 @@
+/// A simple macro for defining bitfield accessors/mutators.
+#[cfg(feature = "alloc")]
+macro_rules!
define_bool { + ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { + fn $is_fn_name(&self) -> bool { + self.bools & (0b1 << $bit) > 0 + } + + fn $set_fn_name(&mut self, yes: bool) { + if yes { + self.bools |= 1 << $bit; + } else { + self.bools &= !(1 << $bit); + } + } + }; +} + +macro_rules! log { + ($($tt:tt)*) => { + #[cfg(feature = "logging")] + { + $($tt)* + } + } +} + +macro_rules! trace { + ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } +} diff --git a/vendor/regex-automata-0.2.0/src/nfa/mod.rs b/vendor/regex-automata-0.2.0/src/nfa/mod.rs new file mode 100644 index 000000000..61ce5ef47 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/nfa/mod.rs @@ -0,0 +1 @@ +pub mod thompson; diff --git a/vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs b/vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs new file mode 100644 index 000000000..301194005 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs @@ -0,0 +1,1713 @@ +/* +This module provides an NFA compiler using Thompson's construction +algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA +graph as output. The NFA graph is structured in a way that permits it to be +executed by a virtual machine and also used to efficiently build a DFA. + +The compiler deals with a slightly expanded set of NFA states that notably +includes an empty node that has exactly one epsilon transition to the next +state. In other words, it's a "goto" instruction if one views Thompson's NFA +as a set of bytecode instructions. These goto instructions are removed in +a subsequent phase before returning the NFA to the caller. The purpose of +these empty nodes is that they make the construction algorithm substantially +simpler to implement. We remove them before returning to the caller because +they can represent substantial overhead when traversing the NFA graph +(either while searching using the NFA directly or while building a DFA). + +In the future, it would be nice to provide a Glushkov compiler as well, +as it would work well as a bit-parallel NFA for smaller regexes. But +the Thompson construction is one I'm more familiar with and seems more +straight-forward to deal with when it comes to large Unicode character +classes. + +Internally, the compiler uses interior mutability to improve composition +in the face of the borrow checker. In particular, we'd really like to be +able to write things like this: + + self.c_concat(exprs.iter().map(|e| self.c(e))) + +Which elegantly uses iterators to build up a sequence of compiled regex +sub-expressions and then hands it off to the concatenating compiler +routine. Without interior mutability, the borrow checker won't let us +borrow `self` mutably both inside and outside the closure at the same +time. +*/ + +use core::{ + borrow::Borrow, + cell::{Cell, RefCell}, + mem, +}; + +use alloc::{sync::Arc, vec, vec::Vec}; + +use regex_syntax::{ + hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary}, + utf8::{Utf8Range, Utf8Sequences}, + ParserBuilder, +}; + +use crate::{ + nfa::thompson::{ + error::Error, + map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}, + range_trie::RangeTrie, + Look, SparseTransitions, State, Transition, NFA, + }, + util::{ + alphabet::ByteClassSet, + id::{IteratorIDExt, PatternID, StateID}, + }, +}; + +/// The configuration used for compiling a Thompson NFA from a regex pattern. 
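+///
+/// # Example
+///
+/// An illustrative sketch that uses only the knobs defined below: compile a
+/// reverse NFA with extra shrinking disabled.
+///
+/// ```
+/// use regex_automata::nfa::thompson::NFA;
+///
+/// let nfa = NFA::builder()
+///     .configure(NFA::config().reverse(true).shrink(false))
+///     .build(r"[a-z]+")?;
+/// assert_eq!(nfa.pattern_len(), 1);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```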
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    reverse: Option<bool>,
+    utf8: Option<bool>,
+    nfa_size_limit: Option<Option<usize>>,
+    shrink: Option<bool>,
+    captures: Option<bool>,
+    #[cfg(test)]
+    unanchored_prefix: Option<bool>,
+}
+
+impl Config {
+    /// Return a new default Thompson NFA compiler configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Reverse the NFA.
+    ///
+    /// An NFA reversal is performed by reversing all of the concatenated
+    /// sub-expressions in the original pattern, recursively. The resulting
+    /// NFA can be used to match the pattern starting from the end of a string
+    /// instead of the beginning of a string.
+    ///
+    /// Reversing the NFA is useful for building a reverse DFA, which is most
+    /// useful for finding the start of a match after its ending position has
+    /// been found.
+    ///
+    /// This is disabled by default.
+    pub fn reverse(mut self, yes: bool) -> Config {
+        self.reverse = Some(yes);
+        self
+    }
+
+    /// Whether to enable UTF-8 mode or not.
+    ///
+    /// When UTF-8 mode is enabled (which is the default), unanchored searches
+    /// will only match through valid UTF-8. If invalid UTF-8 is seen, then
+    /// an unanchored search will stop at that point. This is equivalent to
+    /// putting a `(?s:.)*?` at the start of the regex.
+    ///
+    /// When UTF-8 mode is disabled, then unanchored searches will match
+    /// through any arbitrary byte. This is equivalent to putting a
+    /// `(?s-u:.)*?` at the start of the regex.
+    ///
+    /// Generally speaking, UTF-8 mode should only be used when you know you
+    /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used
+    /// on input that is not valid UTF-8, then the regex is not likely to work
+    /// as expected.
+    ///
+    /// This is enabled by default.
+    pub fn utf8(mut self, yes: bool) -> Config {
+        self.utf8 = Some(yes);
+        self
+    }
+
+    /// Sets an approximate size limit on the total heap used by the NFA being
+    /// compiled.
+    ///
+    /// This permits imposing constraints on the size of a compiled NFA. This
+    /// may be useful in contexts where the regex pattern is untrusted and one
+    /// wants to avoid using too much memory.
+    ///
+    /// This size limit does not apply to auxiliary heap used during
+    /// compilation that is not part of the built NFA.
+    ///
+    /// Note that this size limit is applied during compilation in order for
+    /// the limit to prevent too much heap from being used. However, the
+    /// implementation may use an intermediate NFA representation that is
+    /// otherwise slightly bigger than the final public form. Since the size
+    /// limit may be applied to an intermediate representation, there is not
+    /// necessarily a precise correspondence between the configured size limit
+    /// and the heap usage of the final NFA.
+    ///
+    /// There is no size limit by default.
+    ///
+    /// # Example
+    ///
+    /// This example demonstrates how Unicode mode can greatly increase the
+    /// size of the NFA.
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::NFA;
+    ///
+    /// // 300KB isn't enough!
+    /// NFA::builder()
+    ///     .configure(NFA::config().nfa_size_limit(Some(300_000)))
+    ///     .build(r"\w{20}")
+    ///     .unwrap_err();
+    ///
+    /// // ... but 400KB probably is.
+    /// let nfa = NFA::builder()
+    ///     .configure(NFA::config().nfa_size_limit(Some(400_000)))
+    ///     .build(r"\w{20}")?;
+    ///
+    /// assert_eq!(nfa.pattern_len(), 1);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
+        self.nfa_size_limit = Some(bytes);
+        self
+    }
+
+    /// Apply best effort heuristics to shrink the NFA at the expense of more
+    /// time/memory.
+    ///
+    /// Generally speaking, if one is using an NFA to compile a DFA, then the
+    /// extra time used to shrink the NFA will be more than made up for during
+    /// DFA construction (potentially by a lot). In other words, enabling this
+    /// can substantially decrease the overall amount of time it takes to
+    /// build a DFA.
+    ///
+    /// The only reason to disable this is if you want to compile an NFA and
+    /// start using it as quickly as possible without needing to build a DFA,
+    /// e.g., for an NFA simulation or for a lazy DFA.
+    ///
+    /// This is enabled by default.
+    pub fn shrink(mut self, yes: bool) -> Config {
+        self.shrink = Some(yes);
+        self
+    }
+
+    /// Whether to include 'Capture' states in the NFA.
+    ///
+    /// This can only be enabled when compiling a forward NFA. This is
+    /// always disabled---with no way to override it---when the `reverse`
+    /// configuration is enabled.
+    ///
+    /// This is enabled by default.
+    pub fn captures(mut self, yes: bool) -> Config {
+        self.captures = Some(yes);
+        self
+    }
+
+    /// Whether to compile an unanchored prefix into this NFA.
+    ///
+    /// This is enabled by default. It is made available for tests only to
+    /// make it easier to unit test the output of the compiler.
+    #[cfg(test)]
+    fn unanchored_prefix(mut self, yes: bool) -> Config {
+        self.unanchored_prefix = Some(yes);
+        self
+    }
+
+    /// Return whether this configuration will produce a reverse NFA.
+    pub fn get_reverse(&self) -> bool {
+        self.reverse.unwrap_or(false)
+    }
+
+    /// Return whether UTF-8 mode is enabled.
+    pub fn get_utf8(&self) -> bool {
+        self.utf8.unwrap_or(true)
+    }
+
+    /// Return the NFA size limit, if one was set.
+    pub fn get_nfa_size_limit(&self) -> Option<usize> {
+        self.nfa_size_limit.unwrap_or(None)
+    }
+
+    /// Return whether NFA shrinking is enabled.
+    pub fn get_shrink(&self) -> bool {
+        self.shrink.unwrap_or(true)
+    }
+
+    /// Return whether 'Capture' states will be included in the NFA. This is
+    /// only ever true for forward NFAs.
+    pub fn get_captures(&self) -> bool {
+        !self.get_reverse() && self.captures.unwrap_or(true)
+    }
+
+    fn get_unanchored_prefix(&self) -> bool {
+        #[cfg(test)]
+        {
+            self.unanchored_prefix.unwrap_or(true)
+        }
+        #[cfg(not(test))]
+        {
+            true
+        }
+    }
+
+    /// Overwrite this configuration with the options set in `o`, where the
+    /// options in `o` take precedence.
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config {
+            reverse: o.reverse.or(self.reverse),
+            utf8: o.utf8.or(self.utf8),
+            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
+            shrink: o.shrink.or(self.shrink),
+            captures: o.captures.or(self.captures),
+            #[cfg(test)]
+            unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
+        }
+    }
+}
+
+/// A builder for compiling an NFA.
+#[derive(Clone, Debug)]
+pub struct Builder {
+    config: Config,
+    parser: ParserBuilder,
+}
+
+impl Builder {
+    /// Create a new NFA builder with its default configuration.
+    pub fn new() -> Builder {
+        Builder { config: Config::default(), parser: ParserBuilder::new() }
+    }
+
+    /// Compile the given regular expression into an NFA.
+    ///
+    /// If there was a problem parsing the regex, then that error is returned.
+    ///
+    /// Otherwise, if there was a problem building the NFA, then an error is
+    /// returned. The only error that can occur is if the compiled regex would
+    /// exceed the size limits configured on this builder.
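+    ///
+    /// # Example
+    ///
+    /// A small usage sketch that relies only on APIs shown elsewhere in this
+    /// module:
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::NFA;
+    ///
+    /// let nfa = NFA::builder().build(r"(?m)^foo$")?;
+    /// assert_eq!(nfa.pattern_len(), 1);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```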
+    pub fn build(&self, pattern: &str) -> Result<NFA, Error> {
+        self.build_many(&[pattern])
+    }
+
+    /// Compile the given regular expressions into a single NFA that matches
+    /// each of the given patterns.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<NFA, Error> {
+        let mut hirs = vec![];
+        for p in patterns {
+            hirs.push(
+                self.parser
+                    .build()
+                    .parse(p.as_ref())
+                    .map_err(Error::syntax)?,
+            );
+            log!(log::trace!("parsed: {:?}", p.as_ref()));
+        }
+        self.build_many_from_hir(&hirs)
+    }
+
+    /// Compile the given high level intermediate representation of a regular
+    /// expression into an NFA.
+    ///
+    /// If there was a problem building the NFA, then an error is returned. The
+    /// only error that can occur is if the compiled regex would exceed the
+    /// size limits configured on this builder.
+    pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> {
+        self.build_from_hir_with(&mut Compiler::new(), expr)
+    }
+
+    /// Compile the given high level intermediate representations of regular
+    /// expressions into a single NFA.
+    pub fn build_many_from_hir<H: Borrow<Hir>>(
+        &self,
+        exprs: &[H],
+    ) -> Result<NFA, Error> {
+        self.build_many_from_hir_with(&mut Compiler::new(), exprs)
+    }
+
+    /// Compile the given high level intermediate representation of a regular
+    /// expression into an NFA using the given compiler. Callers may prefer
+    /// this over `build_from_hir` if they would like to reuse the compiler's
+    /// allocations while compiling many regular expressions.
+    ///
+    /// If there was a problem building the NFA, then an error is returned.
+    /// The only error that can occur is if the compiled regex would exceed
+    /// the size limits configured on this builder. When an error is returned,
+    /// the internal state of the compiler is unspecified, but the compiler
+    /// can still be reused in subsequent calls to this method.
+    fn build_from_hir_with(
+        &self,
+        compiler: &mut Compiler,
+        expr: &Hir,
+    ) -> Result<NFA, Error> {
+        self.build_many_from_hir_with(compiler, &[expr])
+    }
+
+    fn build_many_from_hir_with<H: Borrow<Hir>>(
+        &self,
+        compiler: &mut Compiler,
+        exprs: &[H],
+    ) -> Result<NFA, Error> {
+        compiler.configure(self.config);
+        compiler.compile(exprs)
+    }
+
+    /// Apply the given NFA configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](../../struct.SyntaxConfig.html).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and
+    /// multi-line mode.
+    ///
+    /// This syntax configuration generally only applies when an NFA is built
+    /// directly from a pattern string. If an NFA is built from an HIR, then
+    /// all syntax settings are ignored.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        config.apply(&mut self.parser);
+        self
+    }
+}
+
+/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
+/// construction. Namely, this compiler permits epsilon transitions between
+/// states.
+#[derive(Clone, Debug)]
+pub struct Compiler {
+    /// The configuration from the builder.
+    config: Config,
+    /// The final NFA that is built.
+    ///
+    /// Parts of this NFA are constructed during compilation, but the actual
+    /// states aren't added until a final "finish" step. This is because the
+    /// states constructed during compilation have unconditional epsilon
+    /// transitions, which makes the logic of compilation much simpler. The
+    /// "finish" step removes these unconditional epsilon transitions and must
+    /// therefore remap all of the transition state IDs.
+    nfa: RefCell<NFA>,
+    /// The set of compiled NFA states. Once a state is compiled, it is
+    /// assigned a state ID equivalent to its index in this list. Subsequent
+    /// compilation can modify previous states by adding new transitions.
+    states: RefCell<Vec<CState>>,
+    /// State used for compiling character classes to UTF-8 byte automata.
+    /// State is not retained between character class compilations. This just
+    /// serves to amortize allocation to the extent possible.
+    utf8_state: RefCell<Utf8State>,
+    /// State used for arranging character classes in reverse into a trie.
+    trie_state: RefCell<RangeTrie>,
+    /// State used for caching common suffixes when compiling reverse UTF-8
+    /// automata (for Unicode character classes).
+    utf8_suffix: RefCell<Utf8SuffixMap>,
+    /// A map used to re-map state IDs when translating the compiler's internal
+    /// NFA state representation to the external NFA representation.
+    remap: RefCell<Vec<StateID>>,
+    /// A set of compiler internal state IDs that correspond to states that are
+    /// exclusively epsilon transitions, i.e., goto instructions, combined with
+    /// the state that they point to. This is used to record said states while
+    /// transforming the compiler's internal NFA representation to the external
+    /// form.
+    empties: RefCell<Vec<(StateID, StateID)>>,
+    /// The total memory used by each of the 'CState's in 'states'. This only
+    /// includes heap usage by each state, and not the size of the state
+    /// itself.
+    memory_cstates: Cell<usize>,
+}
+
+/// A compiler intermediate state representation for an NFA that is only used
+/// during compilation. Once compilation is done, `CState`s are converted
+/// to `State`s (defined in the parent module), which have a much simpler
+/// representation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum CState {
+    /// An empty state whose only purpose is to forward the automaton to
+    /// another state via an epsilon transition. These are useful during
+    /// compilation but are otherwise removed at the end.
+    Empty {
+        next: StateID,
+    },
+    /// An empty state that records a capture location.
+    ///
+    /// From the perspective of finite automata, this is precisely equivalent
+    /// to 'Empty', but serves the purpose of instructing NFA simulations to
+    /// record additional state when the finite state machine passes through
+    /// this epsilon transition.
+    ///
+    /// These transitions are treated as epsilon transitions with no additional
+    /// effects in DFAs.
+    ///
+    /// 'slot' in this context refers to the specific capture group offset that
+    /// is being recorded. Each capturing group has two slots corresponding to
+    /// the start and end of the matching portion of that group.
+    CaptureStart {
+        next: StateID,
+        capture_index: u32,
+        name: Option<Arc<str>>,
+    },
+    CaptureEnd {
+        next: StateID,
+        capture_index: u32,
+    },
+    /// A state that only transitions to `next` if the current input byte is
+    /// in the range `[start, end]` (inclusive on both ends).
+    Range {
+        range: Transition,
+    },
+    /// A state with possibly many transitions, represented in a sparse
+    /// fashion. Transitions are ordered lexicographically by input range.
+    /// As such, this may only be used when every transition has equal
+    /// priority. (In practice, this is only used for encoding large UTF-8
+    /// automata.) In contrast, a `Union` state has each alternate in order
+    /// of priority.
+    /// Priority is used to implement greedy matching and also alternations
+    /// themselves, e.g., `abc|a` where `abc` has priority over `a`.
+    ///
+    /// To clarify, it is possible to remove `Sparse` and represent all things
+    /// that `Sparse` is used for via `Union`. But this creates a more bloated
+    /// NFA with more epsilon transitions than is necessary in the special case
+    /// of character classes.
+    Sparse {
+        ranges: Vec<Transition>,
+    },
+    /// A conditional epsilon transition satisfied via some sort of
+    /// look-around.
+    Look {
+        look: Look,
+        next: StateID,
+    },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via earlier transitions
+    /// are preferred over later transitions.
+    Union {
+        alternates: Vec<StateID>,
+    },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via later transitions are
+    /// preferred over earlier transitions.
+    ///
+    /// This "reverse" state exists as a convenience during compilation; it
+    /// permits easy construction of non-greedy combinations of NFA states. At
+    /// the end of compilation, Union and UnionReverse states are merged into
+    /// one Union type of state, where the latter has its epsilon transitions
+    /// reversed to reflect the priority inversion.
+    ///
+    /// The "convenience" here arises from the fact that as new states are
+    /// added to the list of `alternates`, we would like that add operation
+    /// to be amortized constant time. But if we used a `Union`, we'd need to
+    /// prepend the state, which takes O(n) time. There are other approaches we
+    /// could use to solve this, but this seems simple enough.
+    UnionReverse {
+        alternates: Vec<StateID>,
+    },
+    /// A match state. There is at most one such occurrence of this state in
+    /// an NFA for each pattern compiled into the NFA. At time of writing, a
+    /// match state is always produced for every pattern given, but in theory,
+    /// if a pattern can never lead to a match, then the match state could be
+    /// omitted.
+    ///
+    /// `id` refers to the ID of the pattern itself, which corresponds to the
+    /// pattern's index (starting at 0). `start_id` refers to the anchored
+    /// NFA starting state corresponding to this pattern.
+    Match {
+        pattern_id: PatternID,
+        start_id: StateID,
+    },
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub struct ThompsonRef {
+    start: StateID,
+    end: StateID,
+}
+
+impl Compiler {
+    /// Create a new compiler.
+    pub fn new() -> Compiler {
+        Compiler {
+            config: Config::default(),
+            nfa: RefCell::new(NFA::empty()),
+            states: RefCell::new(vec![]),
+            utf8_state: RefCell::new(Utf8State::new()),
+            trie_state: RefCell::new(RangeTrie::new()),
+            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+            remap: RefCell::new(vec![]),
+            empties: RefCell::new(vec![]),
+            memory_cstates: Cell::new(0),
+        }
+    }
+
+    /// Configure and prepare this compiler from the builder's knobs.
+    ///
+    /// The compiler must always be reconfigured by the builder before it is
+    /// used to build an NFA. Namely, this will also clear any latent state in
+    /// the compiler left over from previous compilations.
+ fn configure(&mut self, config: Config) { + self.config = config; + self.nfa.borrow_mut().clear(); + self.states.borrow_mut().clear(); + self.memory_cstates.set(0); + // We don't need to clear anything else since they are cleared on + // their own and only when they are used. + } + + /// Convert the current intermediate NFA to its final compiled form. + fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> { + if exprs.is_empty() { + return Ok(NFA::never_match()); + } + if exprs.len() > PatternID::LIMIT { + return Err(Error::too_many_patterns(exprs.len())); + } + + // We always add an unanchored prefix unless we were specifically told + // not to (for tests only), or if we know that the regex is anchored + // for all matches. When an unanchored prefix is not added, then the + // NFA's anchored and unanchored start states are equivalent. + let all_anchored = + exprs.iter().all(|e| e.borrow().is_anchored_start()); + let anchored = !self.config.get_unanchored_prefix() || all_anchored; + let unanchored_prefix = if anchored { + self.c_empty()? + } else { + if self.config.get_utf8() { + self.c_unanchored_prefix_valid_utf8()? + } else { + self.c_unanchored_prefix_invalid_utf8()? + } + }; + + let compiled = self.c_alternation( + exprs.iter().with_pattern_ids().map(|(pid, e)| { + let group_kind = hir::GroupKind::CaptureIndex(0); + let one = self.c_group(&group_kind, e.borrow())?; + let match_state_id = self.add_match(pid, one.start)?; + self.patch(one.end, match_state_id)?; + Ok(ThompsonRef { start: one.start, end: match_state_id }) + }), + )?; + self.patch(unanchored_prefix.end, compiled.start)?; + self.finish(compiled.start, unanchored_prefix.start)?; + Ok(self.nfa.replace(NFA::empty())) + } + + /// Finishes the compilation process and populates the NFA attached to this + /// compiler with the final graph. + fn finish( + &self, + start_anchored: StateID, + start_unanchored: StateID, + ) -> Result<(), Error> { + trace!( + "intermediate NFA compilation complete, \ + intermediate NFA size: {} states, {} bytes on heap", + self.states.borrow().len(), + self.nfa_memory_usage(), + ); + let mut nfa = self.nfa.borrow_mut(); + let mut bstates = self.states.borrow_mut(); + let mut remap = self.remap.borrow_mut(); + let mut empties = self.empties.borrow_mut(); + remap.resize(bstates.len(), StateID::ZERO); + empties.clear(); + + // The idea here is to convert our intermediate states to their final + // form. The only real complexity here is the process of converting + // transitions, which are expressed in terms of state IDs. The new + // set of states will be smaller because of partial epsilon removal, + // so the state IDs will not be the same. + for (sid, bstate) in bstates.iter_mut().with_state_ids() { + match *bstate { + CState::Empty { next } => { + // Since we're removing empty states, we need to handle + // them later since we don't yet know which new state this + // empty state will be mapped to. + empties.push((sid, next)); + } + CState::CaptureStart { next, capture_index, ref name } => { + // We can't remove this empty state because of the side + // effect of capturing an offset for this capture slot. + remap[sid] = nfa.add_capture_start( + next, + capture_index, + name.clone(), + )?; + } + CState::CaptureEnd { next, capture_index } => { + // We can't remove this empty state because of the side + // effect of capturing an offset for this capture slot. 
+ remap[sid] = nfa.add_capture_end(next, capture_index)?; + } + CState::Range { range } => { + remap[sid] = nfa.add_range(range)?; + } + CState::Sparse { ref mut ranges } => { + let ranges = + mem::replace(ranges, vec![]).into_boxed_slice(); + remap[sid] = + nfa.add_sparse(SparseTransitions { ranges })?; + } + CState::Look { look, next } => { + remap[sid] = nfa.add_look(next, look)?; + } + CState::Union { ref mut alternates } => { + let alternates = + mem::replace(alternates, vec![]).into_boxed_slice(); + remap[sid] = nfa.add_union(alternates)?; + } + CState::UnionReverse { ref mut alternates } => { + let mut alternates = + mem::replace(alternates, vec![]).into_boxed_slice(); + alternates.reverse(); + remap[sid] = nfa.add_union(alternates)?; + } + CState::Match { start_id, .. } => { + remap[sid] = nfa.add_match()?; + nfa.finish_pattern(start_id)?; + } + } + } + for &(empty_id, mut empty_next) in empties.iter() { + // empty states can point to other empty states, forming a chain. + // So we must follow the chain until the end, which must end at + // a non-empty state, and therefore, a state that is correctly + // remapped. We are guaranteed to terminate because our compiler + // never builds a loop among only empty states. + while let CState::Empty { next } = bstates[empty_next] { + empty_next = next; + } + remap[empty_id] = remap[empty_next]; + } + nfa.set_start_anchored(start_anchored); + nfa.set_start_unanchored(start_unanchored); + nfa.remap(&remap); + trace!( + "final NFA (reverse? {:?}) compilation complete, \ + final NFA size: {} states, {} bytes on heap", + self.config.get_reverse(), + nfa.states().len(), + nfa.memory_usage(), + ); + Ok(()) + } + + fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> { + match *expr.kind() { + HirKind::Empty => self.c_empty(), + HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch), + HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b), + HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c), + HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c), + HirKind::Anchor(ref anchor) => self.c_anchor(anchor), + HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb), + HirKind::Repetition(ref rep) => self.c_repetition(rep), + HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir), + HirKind::Concat(ref es) => { + self.c_concat(es.iter().map(|e| self.c(e))) + } + HirKind::Alternation(ref es) => { + self.c_alternation(es.iter().map(|e| self.c(e))) + } + } + } + + fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> + where + I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>, + { + let first = if self.is_reverse() { it.next_back() } else { it.next() }; + let ThompsonRef { start, mut end } = match first { + Some(result) => result?, + None => return self.c_empty(), + }; + loop { + let next = + if self.is_reverse() { it.next_back() } else { it.next() }; + let compiled = match next { + Some(result) => result?, + None => break, + }; + self.patch(end, compiled.start)?; + end = compiled.end; + } + Ok(ThompsonRef { start, end }) + } + + fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error> + where + I: Iterator<Item = Result<ThompsonRef, Error>>, + { + let first = it.next().expect("alternations must be non-empty")?; + let second = match it.next() { + None => return Ok(first), + Some(result) => result?, + }; + + let union = self.add_union()?; + let end = self.add_empty()?; + self.patch(union, first.start)?; + self.patch(first.end, end)?; + self.patch(union, second.start)?; + 
+        self.patch(second.end, end)?;
+        for result in it {
+            let compiled = result?;
+            self.patch(union, compiled.start)?;
+            self.patch(compiled.end, end)?;
+        }
+        Ok(ThompsonRef { start: union, end })
+    }
+
+    fn c_group(
+        &self,
+        kind: &hir::GroupKind,
+        expr: &Hir,
+    ) -> Result<ThompsonRef, Error> {
+        if !self.config.get_captures() {
+            return self.c(expr);
+        }
+        let (capi, name) = match *kind {
+            hir::GroupKind::NonCapturing => return self.c(expr),
+            hir::GroupKind::CaptureIndex(index) => (index, None),
+            hir::GroupKind::CaptureName { ref name, index } => {
+                (index, Some(Arc::from(&**name)))
+            }
+        };
+
+        let start = self.add_capture_start(capi, name)?;
+        let inner = self.c(expr)?;
+        let end = self.add_capture_end(capi)?;
+
+        self.patch(start, inner.start)?;
+        self.patch(inner.end, end)?;
+        Ok(ThompsonRef { start, end })
+    }
+
+    fn c_repetition(
+        &self,
+        rep: &hir::Repetition,
+    ) -> Result<ThompsonRef, Error> {
+        match rep.kind {
+            hir::RepetitionKind::ZeroOrOne => {
+                self.c_zero_or_one(&rep.hir, rep.greedy)
+            }
+            hir::RepetitionKind::ZeroOrMore => {
+                self.c_at_least(&rep.hir, rep.greedy, 0)
+            }
+            hir::RepetitionKind::OneOrMore => {
+                self.c_at_least(&rep.hir, rep.greedy, 1)
+            }
+            hir::RepetitionKind::Range(ref rng) => match *rng {
+                hir::RepetitionRange::Exactly(count) => {
+                    self.c_exactly(&rep.hir, count)
+                }
+                hir::RepetitionRange::AtLeast(m) => {
+                    self.c_at_least(&rep.hir, rep.greedy, m)
+                }
+                hir::RepetitionRange::Bounded(min, max) => {
+                    self.c_bounded(&rep.hir, rep.greedy, min, max)
+                }
+            },
+        }
+    }
+
+    fn c_bounded(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+        min: u32,
+        max: u32,
+    ) -> Result<ThompsonRef, Error> {
+        let prefix = self.c_exactly(expr, min)?;
+        if min == max {
+            return Ok(prefix);
+        }
+
+        // It is tempting to compile the rest here as a concatenation
+        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
+        // were `aaa?a?a?`. The problem here is that it leads to this program:
+        //
+        //     >000000: 61 => 01
+        //      000001: 61 => 02
+        //      000002: union(03, 04)
+        //      000003: 61 => 04
+        //      000004: union(05, 06)
+        //      000005: 61 => 06
+        //      000006: union(07, 08)
+        //      000007: 61 => 08
+        //      000008: MATCH
+        //
+        // And effectively, once you hit state 2, the epsilon closure will
+        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
+        // to instead compile it like so:
+        //
+        //     >000000: 61 => 01
+        //      000001: 61 => 02
+        //      000002: union(03, 08)
+        //      000003: 61 => 04
+        //      000004: union(05, 08)
+        //      000005: 61 => 06
+        //      000006: union(07, 08)
+        //      000007: 61 => 08
+        //      000008: MATCH
+        //
+        // So that the epsilon closure of state 2 is now just 3 and 8.
+        let empty = self.add_empty()?;
+        let mut prev_end = prefix.end;
+        for _ in min..max {
+            let union = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            let compiled = self.c(expr)?;
+            self.patch(prev_end, union)?;
+            self.patch(union, compiled.start)?;
+            self.patch(union, empty)?;
+            prev_end = compiled.end;
+        }
+        self.patch(prev_end, empty)?;
+        Ok(ThompsonRef { start: prefix.start, end: empty })
+    }
+
+    fn c_at_least(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+        n: u32,
+    ) -> Result<ThompsonRef, Error> {
+        if n == 0 {
+            // When the expression cannot match the empty string, then we
+            // can get away with something much simpler: just one 'alt'
+            // instruction that optionally repeats itself. But if the expr
+            // can match the empty string... see below.
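+            // As a concrete illustration (not actual compiler output), the
+            // simple case below turns a greedy 'a*' into a single union
+            // state that loops back on itself, with the exit alternate
+            // patched in later by the caller:
+            //
+            //     >000000: union(01, 02)
+            //      000001: 61 => 00
+            //      000002: MATCH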
+ if !expr.is_match_empty() { + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + let compiled = self.c(expr)?; + self.patch(union, compiled.start)?; + self.patch(compiled.end, union)?; + return Ok(ThompsonRef { start: union, end: union }); + } + + // What's going on here? Shouldn't x* be simpler than this? It + // turns out that when implementing leftmost-first (Perl-like) + // match semantics, x* results in an incorrect preference order + // when computing the transitive closure of states if and only if + // 'x' can match the empty string. So instead, we compile x* as + // (x+)?, which preserves the correct preference order. + // + // See: https://github.com/rust-lang/regex/issues/779 + let compiled = self.c(expr)?; + let plus = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + self.patch(compiled.end, plus)?; + self.patch(plus, compiled.start)?; + + let question = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + let empty = self.add_empty()?; + self.patch(question, compiled.start)?; + self.patch(question, empty)?; + self.patch(plus, empty)?; + Ok(ThompsonRef { start: question, end: empty }) + } else if n == 1 { + let compiled = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + self.patch(compiled.end, union)?; + self.patch(union, compiled.start)?; + Ok(ThompsonRef { start: compiled.start, end: union }) + } else { + let prefix = self.c_exactly(expr, n - 1)?; + let last = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }?; + self.patch(prefix.end, last.start)?; + self.patch(last.end, union)?; + self.patch(union, last.start)?; + Ok(ThompsonRef { start: prefix.start, end: union }) + } + } + + fn c_zero_or_one( + &self, + expr: &Hir, + greedy: bool, + ) -> Result<ThompsonRef, Error> { + let union = + if greedy { self.add_union() } else { self.add_reverse_union() }?; + let compiled = self.c(expr)?; + let empty = self.add_empty()?; + self.patch(union, compiled.start)?; + self.patch(union, empty)?; + self.patch(compiled.end, empty)?; + Ok(ThompsonRef { start: union, end: empty }) + } + + fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> { + let it = (0..n).map(|_| self.c(expr)); + self.c_concat(it) + } + + fn c_byte_class( + &self, + cls: &hir::ClassBytes, + ) -> Result<ThompsonRef, Error> { + let end = self.add_empty()?; + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + trans.push(Transition { + start: r.start(), + end: r.end(), + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) + } + + fn c_unicode_class( + &self, + cls: &hir::ClassUnicode, + ) -> Result<ThompsonRef, Error> { + // If all we have are ASCII ranges wrapped in a Unicode package, then + // there is zero reason to bring out the big guns. We can fit all ASCII + // ranges within a single sparse state. 
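+        // For example (illustrative only), '[0-9A-Fa-f]' fits in one sparse
+        // state with three byte ranges (in hex): 30-39, 41-46 and 61-66,
+        // each of which transitions to the same 'end' state.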
+ if cls.is_all_ascii() { + let end = self.add_empty()?; + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + assert!(r.start() <= '\x7F'); + assert!(r.end() <= '\x7F'); + trans.push(Transition { + start: r.start() as u8, + end: r.end() as u8, + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) + } else if self.is_reverse() { + if !self.config.get_shrink() { + // When we don't want to spend the extra time shrinking, we + // compile the UTF-8 automaton in reverse using something like + // the "naive" approach, but will attempt to re-use common + // suffixes. + self.c_unicode_class_reverse_with_suffix(cls) + } else { + // When we want to shrink our NFA for reverse UTF-8 automata, + // we cannot feed UTF-8 sequences directly to the UTF-8 + // compiler, since the UTF-8 compiler requires all sequences + // to be lexicographically sorted. Instead, we organize our + // sequences into a range trie, which can then output our + // sequences in the correct order. Unfortunately, building the + // range trie is fairly expensive (but not nearly as expensive + // as building a DFA). Hence the reason why the 'shrink' option + // exists, so that this path can be toggled off. For example, + // we might want to turn this off if we know we won't be + // compiling a DFA. + let mut trie = self.trie_state.borrow_mut(); + trie.clear(); + + for rng in cls.iter() { + for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { + seq.reverse(); + trie.insert(seq.as_slice()); + } + } + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?; + trie.iter(|seq| { + utf8c.add(&seq)?; + Ok(()) + })?; + utf8c.finish() + } + } else { + // In the forward direction, we always shrink our UTF-8 automata + // because we can stream it right into the UTF-8 compiler. There + // is almost no downside (in either memory or time) to using this + // approach. + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?; + for rng in cls.iter() { + for seq in Utf8Sequences::new(rng.start(), rng.end()) { + utf8c.add(seq.as_slice())?; + } + } + utf8c.finish() + } + + // For reference, the code below is the "naive" version of compiling a + // UTF-8 automaton. It is deliciously simple (and works for both the + // forward and reverse cases), but will unfortunately produce very + // large NFAs. When compiling a forward automaton, the size difference + // can sometimes be an order of magnitude. For example, the '\w' regex + // will generate about ~3000 NFA states using the naive approach below, + // but only 283 states when using the approach above. This is because + // the approach above actually compiles a *minimal* (or near minimal, + // because of the bounded hashmap for reusing equivalent states) UTF-8 + // automaton. + // + // The code below is kept as a reference point in order to make it + // easier to understand the higher level goal here. Although, it will + // almost certainly bit-rot, so keep that in mind. + /* + let it = cls + .iter() + .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) + .map(|seq| { + let it = seq + .as_slice() + .iter() + .map(|rng| self.c_range(rng.start, rng.end)); + self.c_concat(it) + }); + self.c_alternation(it) + */ + } + + fn c_unicode_class_reverse_with_suffix( + &self, + cls: &hir::ClassUnicode, + ) -> Result<ThompsonRef, Error> { + // N.B. 
It would likely be better to cache common *prefixes* in the + // reverse direction, but it's not quite clear how to do that. The + // advantage of caching suffixes is that it does give us a win, and + // has a very small additional overhead. + let mut cache = self.utf8_suffix.borrow_mut(); + cache.clear(); + + let union = self.add_union()?; + let alt_end = self.add_empty()?; + for urng in cls.iter() { + for seq in Utf8Sequences::new(urng.start(), urng.end()) { + let mut end = alt_end; + for brng in seq.as_slice() { + let key = Utf8SuffixKey { + from: end, + start: brng.start, + end: brng.end, + }; + let hash = cache.hash(&key); + if let Some(id) = cache.get(&key, hash) { + end = id; + continue; + } + + let compiled = self.c_range(brng.start, brng.end)?; + self.patch(compiled.end, end)?; + end = compiled.start; + cache.set(key, hash, end); + } + self.patch(union, end)?; + } + } + Ok(ThompsonRef { start: union, end: alt_end }) + } + + fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> { + let look = match *anchor { + Anchor::StartLine => Look::StartLine, + Anchor::EndLine => Look::EndLine, + Anchor::StartText => Look::StartText, + Anchor::EndText => Look::EndText, + }; + let id = self.add_look(look)?; + Ok(ThompsonRef { start: id, end: id }) + } + + fn c_word_boundary( + &self, + wb: &WordBoundary, + ) -> Result<ThompsonRef, Error> { + let look = match *wb { + WordBoundary::Unicode => Look::WordBoundaryUnicode, + WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate, + WordBoundary::Ascii => Look::WordBoundaryAscii, + WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate, + }; + let id = self.add_look(look)?; + Ok(ThompsonRef { start: id, end: id }) + } + + fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> { + let mut buf = [0; 4]; + let it = ch + .encode_utf8(&mut buf) + .as_bytes() + .iter() + .map(|&b| self.c_range(b, b)); + self.c_concat(it) + } + + fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> { + let id = self.add_range(start, end)?; + Ok(ThompsonRef { start: id, end: id }) + } + + fn c_empty(&self) -> Result<ThompsonRef, Error> { + let id = self.add_empty()?; + Ok(ThompsonRef { start: id, end: id }) + } + + fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> { + self.c_at_least(&Hir::any(false), false, 0) + } + + fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> { + self.c_at_least(&Hir::any(true), false, 0) + } + + fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { + let old_memory_cstates = self.memory_cstates.get(); + match self.states.borrow_mut()[from] { + CState::Empty { ref mut next } => { + *next = to; + } + CState::Range { ref mut range } => { + range.next = to; + } + CState::Sparse { .. } => { + panic!("cannot patch from a sparse NFA state") + } + CState::Look { ref mut next, .. } => { + *next = to; + } + CState::Union { ref mut alternates } => { + alternates.push(to); + self.memory_cstates + .set(old_memory_cstates + mem::size_of::<StateID>()); + } + CState::UnionReverse { ref mut alternates } => { + alternates.push(to); + self.memory_cstates + .set(old_memory_cstates + mem::size_of::<StateID>()); + } + CState::CaptureStart { ref mut next, .. } => { + *next = to; + } + CState::CaptureEnd { ref mut next, .. } => { + *next = to; + } + CState::Match { .. 
} => {} + } + if old_memory_cstates != self.memory_cstates.get() { + self.check_nfa_size_limit()?; + } + Ok(()) + } + + fn add_empty(&self) -> Result<StateID, Error> { + self.add_state(CState::Empty { next: StateID::ZERO }) + } + + fn add_capture_start( + &self, + capture_index: u32, + name: Option<Arc<str>>, + ) -> Result<StateID, Error> { + self.add_state(CState::CaptureStart { + next: StateID::ZERO, + capture_index, + name, + }) + } + + fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> { + self.add_state(CState::CaptureEnd { + next: StateID::ZERO, + capture_index, + }) + } + + fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> { + let trans = Transition { start, end, next: StateID::ZERO }; + self.add_state(CState::Range { range: trans }) + } + + fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> { + if ranges.len() == 1 { + self.add_state(CState::Range { range: ranges[0] }) + } else { + self.add_state(CState::Sparse { ranges }) + } + } + + fn add_look(&self, mut look: Look) -> Result<StateID, Error> { + if self.is_reverse() { + look = look.reversed(); + } + self.add_state(CState::Look { look, next: StateID::ZERO }) + } + + fn add_union(&self) -> Result<StateID, Error> { + self.add_state(CState::Union { alternates: vec![] }) + } + + fn add_reverse_union(&self) -> Result<StateID, Error> { + self.add_state(CState::UnionReverse { alternates: vec![] }) + } + + fn add_match( + &self, + pattern_id: PatternID, + start_id: StateID, + ) -> Result<StateID, Error> { + self.add_state(CState::Match { pattern_id, start_id }) + } + + fn add_state(&self, state: CState) -> Result<StateID, Error> { + let mut states = self.states.borrow_mut(); + let id = StateID::new(states.len()) + .map_err(|_| Error::too_many_states(states.len()))?; + self.memory_cstates + .set(self.memory_cstates.get() + state.memory_usage()); + states.push(state); + // If we don't explicitly drop this, then 'nfa_memory_usage' will also + // try to borrow it when we check the size limit and hit an error. + drop(states); + self.check_nfa_size_limit()?; + Ok(id) + } + + fn is_reverse(&self) -> bool { + self.config.get_reverse() + } + + /// If an NFA size limit was set, this checks that the NFA compiled so far + /// fits within that limit. If so, then nothing is returned. Otherwise, an + /// error is returned. + /// + /// This should be called after increasing the heap usage of the + /// intermediate NFA. + /// + /// Note that this borrows 'self.states', so callers should ensure there is + /// no mutable borrow of it outstanding. + fn check_nfa_size_limit(&self) -> Result<(), Error> { + if let Some(limit) = self.config.get_nfa_size_limit() { + if self.nfa_memory_usage() > limit { + return Err(Error::exceeded_size_limit(limit)); + } + } + Ok(()) + } + + /// Returns the heap memory usage, in bytes, of the NFA compiled so far. + /// + /// Note that this is an approximation of how big the final NFA will be. + /// In practice, the final NFA will likely be a bit smaller since it uses + /// things like `Box<[T]>` instead of `Vec<T>`. + fn nfa_memory_usage(&self) -> usize { + self.states.borrow().len() * mem::size_of::<CState>() + + self.memory_cstates.get() + } +} + +impl CState { + fn memory_usage(&self) -> usize { + match *self { + CState::Empty { .. } + | CState::Range { .. } + | CState::Look { .. } + | CState::CaptureStart { .. } + | CState::CaptureEnd { .. } + | CState::Match { .. 
} => 0, + CState::Sparse { ref ranges } => { + ranges.len() * mem::size_of::<Transition>() + } + CState::Union { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + CState::UnionReverse { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + } + } +} + +#[derive(Debug)] +struct Utf8Compiler<'a> { + nfac: &'a Compiler, + state: &'a mut Utf8State, + target: StateID, +} + +#[derive(Clone, Debug)] +struct Utf8State { + compiled: Utf8BoundedMap, + uncompiled: Vec<Utf8Node>, +} + +#[derive(Clone, Debug)] +struct Utf8Node { + trans: Vec<Transition>, + last: Option<Utf8LastTransition>, +} + +#[derive(Clone, Debug)] +struct Utf8LastTransition { + start: u8, + end: u8, +} + +impl Utf8State { + fn new() -> Utf8State { + Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] } + } + + fn clear(&mut self) { + self.compiled.clear(); + self.uncompiled.clear(); + } +} + +impl<'a> Utf8Compiler<'a> { + fn new( + nfac: &'a Compiler, + state: &'a mut Utf8State, + ) -> Result<Utf8Compiler<'a>, Error> { + let target = nfac.add_empty()?; + state.clear(); + let mut utf8c = Utf8Compiler { nfac, state, target }; + utf8c.add_empty(); + Ok(utf8c) + } + + fn finish(&mut self) -> Result<ThompsonRef, Error> { + self.compile_from(0)?; + let node = self.pop_root(); + let start = self.compile(node)?; + Ok(ThompsonRef { start, end: self.target }) + } + + fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> { + let prefix_len = ranges + .iter() + .zip(&self.state.uncompiled) + .take_while(|&(range, node)| { + node.last.as_ref().map_or(false, |t| { + (t.start, t.end) == (range.start, range.end) + }) + }) + .count(); + assert!(prefix_len < ranges.len()); + self.compile_from(prefix_len)?; + self.add_suffix(&ranges[prefix_len..]); + Ok(()) + } + + fn compile_from(&mut self, from: usize) -> Result<(), Error> { + let mut next = self.target; + while from + 1 < self.state.uncompiled.len() { + let node = self.pop_freeze(next); + next = self.compile(node)?; + } + self.top_last_freeze(next); + Ok(()) + } + + fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> { + let hash = self.state.compiled.hash(&node); + if let Some(id) = self.state.compiled.get(&node, hash) { + return Ok(id); + } + let id = self.nfac.add_sparse(node.clone())?; + self.state.compiled.set(node, hash, id); + Ok(id) + } + + fn add_suffix(&mut self, ranges: &[Utf8Range]) { + assert!(!ranges.is_empty()); + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + assert!(self.state.uncompiled[last].last.is_none()); + self.state.uncompiled[last].last = Some(Utf8LastTransition { + start: ranges[0].start, + end: ranges[0].end, + }); + for r in &ranges[1..] 
{ + self.state.uncompiled.push(Utf8Node { + trans: vec![], + last: Some(Utf8LastTransition { start: r.start, end: r.end }), + }); + } + } + + fn add_empty(&mut self) { + self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); + } + + fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { + let mut uncompiled = self.state.uncompiled.pop().unwrap(); + uncompiled.set_last_transition(next); + uncompiled.trans + } + + fn pop_root(&mut self) -> Vec<Transition> { + assert_eq!(self.state.uncompiled.len(), 1); + assert!(self.state.uncompiled[0].last.is_none()); + self.state.uncompiled.pop().expect("non-empty nodes").trans + } + + fn top_last_freeze(&mut self, next: StateID) { + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + self.state.uncompiled[last].set_last_transition(next); + } +} + +impl Utf8Node { + fn set_last_transition(&mut self, next: StateID) { + if let Some(last) = self.last.take() { + self.trans.push(Transition { + start: last.start, + end: last.end, + next, + }); + } + } +} + +#[cfg(test)] +mod tests { + use alloc::vec::Vec; + + use super::{ + Builder, Config, PatternID, SparseTransitions, State, StateID, + Transition, NFA, + }; + + fn build(pattern: &str) -> NFA { + Builder::new() + .configure(Config::new().captures(false).unanchored_prefix(false)) + .build(pattern) + .unwrap() + } + + fn pid(id: usize) -> PatternID { + PatternID::new(id).unwrap() + } + + fn sid(id: usize) -> StateID { + StateID::new(id).unwrap() + } + + fn s_byte(byte: u8, next: usize) -> State { + let next = sid(next); + let trans = Transition { start: byte, end: byte, next }; + State::Range { range: trans } + } + + fn s_range(start: u8, end: u8, next: usize) -> State { + let next = sid(next); + let trans = Transition { start, end, next }; + State::Range { range: trans } + } + + fn s_sparse(ranges: &[(u8, u8, usize)]) -> State { + let ranges = ranges + .iter() + .map(|&(start, end, next)| Transition { + start, + end, + next: sid(next), + }) + .collect(); + State::Sparse(SparseTransitions { ranges }) + } + + fn s_union(alts: &[usize]) -> State { + State::Union { + alternates: alts + .iter() + .map(|&id| sid(id)) + .collect::<Vec<StateID>>() + .into_boxed_slice(), + } + } + + fn s_match(id: usize) -> State { + State::Match { id: pid(id) } + } + + // Test that building an unanchored NFA has an appropriate `(?s:.)*?` + // prefix. + #[test] + fn compile_unanchored_prefix() { + // When the machine can only match valid UTF-8. + let nfa = Builder::new() + .configure(Config::new().captures(false)) + .build(r"a") + .unwrap(); + // There should be many states since the `.` in `(?s:.)*?` matches any + // Unicode scalar value. + assert_eq!(11, nfa.len()); + assert_eq!(nfa.states[10], s_match(0)); + assert_eq!(nfa.states[9], s_byte(b'a', 10)); + + // When the machine can match through invalid UTF-8. + let nfa = Builder::new() + .configure(Config::new().captures(false).utf8(false)) + .build(r"a") + .unwrap(); + assert_eq!( + nfa.states, + &[ + s_union(&[2, 1]), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_match(0), + ] + ); + } + + #[test] + fn compile_empty() { + assert_eq!(build("").states, &[s_match(0),]); + } + + #[test] + fn compile_literal() { + assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]); + assert_eq!( + build("ab").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),] + ); + assert_eq!( + build("☃").states, + &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)] + ); + + // Check that non-UTF-8 literals work. 
+        let nfa = Builder::new()
+            .configure(
+                Config::new()
+                    .captures(false)
+                    .utf8(false)
+                    .unanchored_prefix(false),
+            )
+            .syntax(crate::SyntaxConfig::new().utf8(false))
+            .build(r"(?-u)\xFF")
+            .unwrap();
+        assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]);
+    }
+
+    #[test]
+    fn compile_class() {
+        assert_eq!(
+            build(r"[a-z]").states,
+            &[s_range(b'a', b'z', 1), s_match(0),]
+        );
+        assert_eq!(
+            build(r"[x-za-c]").states,
+            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"[\u03B1-\u03B4]").states,
+            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
+        );
+        assert_eq!(
+            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
+            &[
+                s_range(0xB1, 0xB4, 5),
+                s_range(0x99, 0x9E, 5),
+                s_byte(0xA4, 1),
+                s_byte(0x9F, 2),
+                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
+                s_match(0),
+            ]
+        );
+        assert_eq!(
+            build(r"[a-z☃]").states,
+            &[
+                s_byte(0x83, 3),
+                s_byte(0x98, 0),
+                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
+                s_match(0),
+            ]
+        );
+    }
+
+    #[test]
+    fn compile_repetition() {
+        assert_eq!(
+            build(r"a?").states,
+            &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),]
+        );
+        assert_eq!(
+            build(r"a??").states,
+            &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),]
+        );
+    }
+
+    #[test]
+    fn compile_group() {
+        assert_eq!(
+            build(r"ab+").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"(ab)").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
+        );
+        assert_eq!(
+            build(r"(ab)+").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)]
+        );
+    }
+
+    #[test]
+    fn compile_alternation() {
+        assert_eq!(
+            build(r"a|b").states,
+            &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"|b").states,
+            &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"a|").states,
+            &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)]
+        );
+    }
+
+    #[test]
+    fn many_start_pattern() {
+        let nfa = Builder::new()
+            .configure(Config::new().captures(false).unanchored_prefix(false))
+            .build_many(&["a", "b"])
+            .unwrap();
+        assert_eq!(
+            nfa.states,
+            &[
+                s_byte(b'a', 1),
+                s_match(0),
+                s_byte(b'b', 3),
+                s_match(1),
+                s_union(&[0, 2]),
+            ]
+        );
+        assert_eq!(nfa.start_anchored().as_usize(), 4);
+        assert_eq!(nfa.start_unanchored().as_usize(), 4);
+        // Test that the start states for each individual pattern are correct.
+        assert_eq!(nfa.start_pattern(pid(0)), sid(0));
+        assert_eq!(nfa.start_pattern(pid(1)), sid(2));
+    }
+}
diff --git a/vendor/regex-automata-0.2.0/src/nfa/thompson/error.rs b/vendor/regex-automata-0.2.0/src/nfa/thompson/error.rs
new file mode 100644
index 000000000..52f02e888
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/nfa/thompson/error.rs
@@ -0,0 +1,145 @@
+use crate::util::id::{PatternID, StateID};
+
+/// An error that can occur during the construction of a Thompson NFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human-readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`regex_syntax::Error`] type from its `source`
+/// method via the `std::error::Error` trait. This error only occurs when using
+/// convenience routines for building an NFA directly from a pattern string.
+///
+/// Otherwise, errors typically occur when a limit has been breached. For
+/// example, if the total heap usage of the compiled NFA exceeds the limit
+/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
+/// building the NFA will fail.
+#[derive(Clone, Debug)]
+pub struct Error {
+    kind: ErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a Thompson NFA.
+#[derive(Clone, Debug)]
+enum ErrorKind {
+    /// An error that occurred while parsing a regular expression. Note that
+    /// this error may be printed over multiple lines, and is generally
+    /// intended to be end user readable on its own.
+    Syntax(regex_syntax::Error),
+    /// An error that occurs if too many patterns were given to the NFA
+    /// compiler.
+    TooManyPatterns {
+        /// The number of patterns given, which exceeds the limit.
+        given: usize,
+        /// The limit on the number of patterns.
+        limit: usize,
+    },
+    /// An error that occurs if too many states are produced while building
+    /// an NFA.
+    TooManyStates {
+        /// The minimum number of states that are desired, which exceeds the
+        /// limit.
+        given: usize,
+        /// The limit on the number of states.
+        limit: usize,
+    },
+    /// An error that occurs when NFA compilation exceeds a configured heap
+    /// limit.
+    ExceededSizeLimit {
+        /// The configured limit, in bytes.
+        limit: usize,
+    },
+    /// An error that occurs when an invalid capture group index is added to
+    /// the NFA. An "invalid" index can be one that is too big (e.g., results
+    /// in an integer overflow) or one that is discontinuous from previous
+    /// capture group indices added.
+    InvalidCaptureIndex {
+        /// The invalid index that was given.
+        index: usize,
+    },
+    /// An error that occurs when an NFA contains a Unicode word boundary, but
+    /// where the crate was compiled without the necessary data for dealing
+    /// with Unicode word boundaries.
+    UnicodeWordUnavailable,
+}
+
+impl Error {
+    fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
+        Error { kind: ErrorKind::Syntax(err) }
+    }
+
+    pub(crate) fn too_many_patterns(given: usize) -> Error {
+        let limit = PatternID::LIMIT;
+        Error { kind: ErrorKind::TooManyPatterns { given, limit } }
+    }
+
+    pub(crate) fn too_many_states(given: usize) -> Error {
+        let limit = StateID::LIMIT;
+        Error { kind: ErrorKind::TooManyStates { given, limit } }
+    }
+
+    pub(crate) fn exceeded_size_limit(limit: usize) -> Error {
+        Error { kind: ErrorKind::ExceededSizeLimit { limit } }
+    }
+
+    pub(crate) fn invalid_capture_index(index: usize) -> Error {
+        Error { kind: ErrorKind::InvalidCaptureIndex { index } }
+    }
+
+    pub(crate) fn unicode_word_unavailable() -> Error {
+        Error { kind: ErrorKind::UnicodeWordUnavailable }
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self.kind() {
+            ErrorKind::Syntax(ref err) => Some(err),
+            ErrorKind::TooManyPatterns { .. } => None,
+            ErrorKind::TooManyStates { .. } => None,
+            ErrorKind::ExceededSizeLimit { .. } => None,
+            ErrorKind::InvalidCaptureIndex { .. } => None,
+            ErrorKind::UnicodeWordUnavailable => None,
+        }
+    }
+}
+
+impl core::fmt::Display for Error {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self.kind() {
+            ErrorKind::Syntax(_) => write!(f, "error parsing regex"),
+            ErrorKind::TooManyPatterns { given, limit } => write!(
+                f,
+                "attempted to compile {} patterns, \
+                 which exceeds the limit of {}",
+                given, limit,
+            ),
+            ErrorKind::TooManyStates { given, limit } => write!(
+                f,
+                "attempted to compile {} NFA states, \
+                 which exceeds the limit of {}",
+                given, limit,
+            ),
+            ErrorKind::ExceededSizeLimit { limit } => write!(
+                f,
+                "heap usage during NFA compilation exceeded limit of {}",
+                limit,
+            ),
+            ErrorKind::InvalidCaptureIndex { index } => write!(
+                f,
+                "capture group index {} is invalid (too big or discontinuous)",
+                index,
+            ),
+            ErrorKind::UnicodeWordUnavailable => write!(
+                f,
+                "crate has been compiled without Unicode word boundary \
+                 support, but the NFA contains Unicode word boundary \
+                 assertions",
+            ),
+        }
+    }
+}
diff --git a/vendor/regex-automata-0.2.0/src/nfa/thompson/map.rs b/vendor/regex-automata-0.2.0/src/nfa/thompson/map.rs
new file mode 100644
index 000000000..79ff63ca3
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/nfa/thompson/map.rs
@@ -0,0 +1,290 @@
+// This module contains a couple of simple, purpose-built hash maps. The key
+// trade-off they make is that they serve as caches rather than true maps.
+// That is, inserting a new entry may cause eviction of another entry. This
+// gives us two things. First, there's less overhead associated with inserts
+// and lookups. Second, it lets us control our memory usage.
+//
+// These maps are used in some fairly hot code when generating NFA states for
+// large Unicode character classes.
+//
+// Instead of exposing a rich hashmap entry API, we just permit the caller to
+// produce a hash of the key directly. The hash can then be reused for both
+// lookups and insertions at the cost of leaking abstraction a bit. But these
+// are for internal use only, so it's fine.
+//
+// The Utf8BoundedMap is used for Daciuk's algorithm for constructing an
+// (almost) minimal DFA for large Unicode character classes in linear time.
+// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
+// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
+// since there's a bit more expense in the reverse direction.)
+//
+// The Utf8SuffixMap is used when compiling large Unicode character classes
+// for reverse NFAs when 'shrink' is disabled. Specifically, it augments the
+// naive construction of UTF-8 automata by caching common suffixes. This
+// doesn't get the same space savings as Daciuk's algorithm, but it's
+// basically as fast as the naive approach and typically winds up using less
+// memory (since it generates smaller NFAs) despite the presence of the cache.
+//
+// These maps effectively represent caching mechanisms for CState::Sparse and
+// CState::Range, respectively. The former represents a single NFA state with
+// many transitions of equivalent priority while the latter represents a
+// single NFA state with a single transition. (Neither state ever has or is an
+// epsilon transition.) Thus, they have different key types. It's likely we
+// could make one generic map, but the machinery didn't seem worth it. They
+// are simple enough.
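+//
+// As a rough sketch of the usage contract both maps share (where
+// 'sorted_transitions' and 'build_new_state' are hypothetical helpers, not
+// functions in this crate), every caller follows the same hash/get/set
+// dance, and a 'get' miss is always acceptable:
+//
+//     let mut map = Utf8BoundedMap::new(10_000);
+//     map.clear(); // must be called before first use
+//     let key: Vec<Transition> = sorted_transitions();
+//     let hash = map.hash(&key);
+//     let id = match map.get(&key, hash) {
+//         Some(id) => id, // reuse an equivalent, previously built state
+//         None => {
+//             let id = build_new_state(&key);
+//             map.set(key, hash, id);
+//             id
+//         }
+//     };
+//
+// An entry evicted between 'set' and a later 'get' only costs us a
+// duplicated state (a bigger NFA), never an incorrect one.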
+
+use alloc::{vec, vec::Vec};
+
+use crate::{nfa::thompson::Transition, util::id::StateID};
+
+// Basic FNV-1a hash constants as described in:
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+const PRIME: u64 = 1099511628211;
+const INIT: u64 = 14695981039346656037;
+
+/// A bounded hash map where the key is a sequence of NFA transitions and the
+/// value is a pre-existing NFA state ID.
+///
+/// std's hashmap can be used for this. However, this map has two important
+/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
+/// control our memory usage by limiting the number of slots. In general, the
+/// cost here is that this map acts as a cache. That is, inserting a new entry
+/// may remove an old entry. We are okay with this, since it does not impact
+/// correctness in the cases where it is used. The only effect that dropping
+/// states from the cache has is that the resulting NFA generated may be
+/// bigger than it otherwise would be.
+///
+/// This improves benchmarks that compile large Unicode character classes,
+/// since it makes the generation of (almost) minimal UTF-8 automata faster.
+/// Specifically, one could observe the difference with std's hashmap via
+/// something like the following benchmark:
+///
+///   hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+///
+/// But to observe that difference, you'd have to modify the code to use
+/// std's hashmap.
+///
+/// It is quite possible that there is a better way to approach this problem.
+/// For example, if there happens to be a very common state that collides with
+/// a lot of less frequent states, then we could wind up with very poor
+/// caching behavior. Alas, the effectiveness of this cache has not been
+/// measured. Instead, ad hoc experiments suggest that it is "good enough."
+/// Additional smarts (such as an LRU eviction policy) have to be weighed
+/// against the amount of extra time they cost.
+#[derive(Clone, Debug)]
+pub struct Utf8BoundedMap {
+    /// The current version of this map. Only entries with matching versions
+    /// are considered during lookups. If an entry is found with a mismatched
+    /// version, then the map behaves as if the entry does not exist.
+    ///
+    /// This makes it possible to clear the map by simply incrementing the
+    /// version number instead of actually deallocating any storage.
+    version: u16,
+    /// The total number of entries this map can store.
+    capacity: usize,
+    /// The actual entries, keyed by hash. Collisions between different states
+    /// result in the old state being dropped.
+    map: Vec<Utf8BoundedEntry>,
+}
+
+/// An entry in this map.
+#[derive(Clone, Debug, Default)]
+struct Utf8BoundedEntry {
+    /// The version of the map used to produce this entry. If this entry's
+    /// version does not match the current version of the map, then the map
+    /// should behave as if this entry does not exist.
+    version: u16,
+    /// The key, which is a sorted sequence of non-overlapping NFA
+    /// transitions.
+    key: Vec<Transition>,
+    /// The state ID corresponding to the state containing the transitions in
+    /// this entry.
+    val: StateID,
+}
+
+impl Utf8BoundedMap {
+    /// Create a new bounded map with the given capacity. The map will never
+    /// grow beyond the given size.
+    ///
+    /// Note that this does not allocate. Instead, callers must call `clear`
+    /// before using this map. `clear` will allocate space if necessary.
+ /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. + pub fn new(capacity: usize) -> Utf8BoundedMap { + assert!(capacity > 0); + Utf8BoundedMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + // If we loop back to version 0, then we forcefully clear the + // entire map. Otherwise, it might be possible to incorrectly + // match entries used to generate other NFAs. + if self.version == 0 { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transitions. + pub fn hash(&self, key: &[Transition]) -> usize { + let mut h = INIT; + for t in key { + h = (h ^ (t.start as u64)).wrapping_mul(PRIME); + h = (h ^ (t.end as u64)).wrapping_mul(PRIME); + h = (h ^ (t.next.as_usize() as u64)).wrapping_mul(PRIME); + } + (h as usize) % self.map.len() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given transitions, then None is + /// returned. + pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + // There may be a hash collision, so we need to confirm real equality. + if entry.key != key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transitions given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. + pub fn set( + &mut self, + key: Vec<Transition>, + hash: usize, + state_id: StateID, + ) { + self.map[hash] = + Utf8BoundedEntry { version: self.version, key, val: state_id }; + } +} + +/// A cache of suffixes used to modestly compress UTF-8 automata for large +/// Unicode character classes. +#[derive(Clone, Debug)] +pub struct Utf8SuffixMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8SuffixEntry>, +} + +/// A key that uniquely identifies an NFA state. It is a triple that represents +/// a transition from one state for a particular byte range. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Utf8SuffixKey { + pub from: StateID, + pub start: u8, + pub end: u8, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8SuffixEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which consists of a transition in a particular state. 
+ key: Utf8SuffixKey, + /// The identifier that the transition in the key maps to. + val: StateID, +} + +impl Utf8SuffixMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. + pub fn new(capacity: usize) -> Utf8SuffixMap { + assert!(capacity > 0); + Utf8SuffixMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + if self.version == 0 { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transition. + pub fn hash(&self, key: &Utf8SuffixKey) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + const PRIME: u64 = 1099511628211; + const INIT: u64 = 14695981039346656037; + + let mut h = INIT; + h = (h ^ (key.from.as_usize() as u64)).wrapping_mul(PRIME); + h = (h ^ (key.start as u64)).wrapping_mul(PRIME); + h = (h ^ (key.end as u64)).wrapping_mul(PRIME); + (h as usize) % self.map.len() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given key, then None is returned. + pub fn get( + &mut self, + key: &Utf8SuffixKey, + hash: usize, + ) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + if key != &entry.key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transition given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. + pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) { + self.map[hash] = + Utf8SuffixEntry { version: self.version, key, val: state_id }; + } +} diff --git a/vendor/regex-automata-0.2.0/src/nfa/thompson/mod.rs b/vendor/regex-automata-0.2.0/src/nfa/thompson/mod.rs new file mode 100644 index 000000000..88a438e8e --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/nfa/thompson/mod.rs @@ -0,0 +1,1555 @@ +use core::{convert::TryFrom, fmt, mem, ops::Range}; + +use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; + +use crate::util::{ + alphabet::{self, ByteClassSet}, + decode_last_utf8, decode_utf8, + id::{IteratorIDExt, PatternID, PatternIDIter, StateID}, + is_word_byte, is_word_char_fwd, is_word_char_rev, +}; + +pub use self::{ + compiler::{Builder, Config}, + error::Error, +}; + +mod compiler; +mod error; +mod map; +pub mod pikevm; +mod range_trie; + +/// A map from capture group name to its corresponding capture index. +/// +/// Since there are always two slots for each capture index, the pair of slots +/// corresponding to the capture index for a pattern ID of 0 are indexed at +/// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`. 
+///
+/// This type is actually wrapped inside a Vec indexed by pattern ID on the
+/// NFA, since multiple patterns may have the same capture group name.
+///
+/// Note that this is somewhat of a sub-optimal representation, since it
+/// requires a hashmap for each pattern. A better representation would be
+/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look
+/// up a capture index by name without producing an `Arc<str>`, which requires
+/// an allocation. To fix this, I think we'd need to define our own unsized
+/// type or something?
+#[cfg(feature = "std")]
+type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>;
+#[cfg(not(feature = "std"))]
+type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>;
+
+// The NFA API below is not something I'm terribly proud of at the moment. In
+// particular, it supports both mutating the NFA and actually using the NFA to
+// perform a search. I think combining these two things muddies the waters a
+// bit too much.
+//
+// I think the issue is that I saw the compiler as the 'builder,' where the
+// compiler had the ability to manipulate the internal state of the NFA.
+// However, one of my goals was to make it possible for others to build their
+// own NFAs in a way that is *not* coupled to the regex-syntax crate.
+//
+// So I think really, there should be an NFA, an NFABuilder and then the
+// internal compiler which uses the NFABuilder API to build an NFA. Alas, at
+// the time of writing, I kind of ran out of steam.
+
+/// A fully compiled Thompson NFA.
+///
+/// The states of the NFA are indexed by state IDs, which are how transitions
+/// are expressed.
+#[derive(Clone)]
+pub struct NFA {
+    /// The state list. This list is guaranteed to be indexable by all
+    /// starting state IDs, and it is also guaranteed to contain at most one
+    /// `Match` state for each pattern compiled into this NFA. (A pattern may
+    /// not have a corresponding `Match` state if a `Match` state is
+    /// impossible to reach.)
+    states: Vec<State>,
+    /// The anchored starting state of this NFA.
+    start_anchored: StateID,
+    /// The unanchored starting state of this NFA.
+    start_unanchored: StateID,
+    /// The starting states for each individual pattern. Starting at any
+    /// of these states will result in only an anchored search for the
+    /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+    /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+    /// are always equivalent.
+    start_pattern: Vec<StateID>,
+    /// A map from PatternID to its corresponding range of capture slots. Each
+    /// range is guaranteed to be contiguous with the previous range. The
+    /// end of the last range corresponds to the total number of slots needed
+    /// for this NFA.
+    patterns_to_slots: Vec<Range<usize>>,
+    /// A map from capture name to its corresponding index. So e.g., given
+    /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name
+    /// 'word' for pattern ID=0 would correspond to the index '3'. Its
+    /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'.
+    capture_name_to_index: Vec<CaptureNameMap>,
+    /// A map from pattern ID to capture group index to name, if one exists.
+    /// This is effectively the inverse of 'capture_name_to_index'. The outer
+    /// vec is indexed by pattern ID, while the inner vec is indexed by
+    /// capture index offset for the corresponding pattern.
+    ///
+    /// The first capture group for each pattern is always unnamed and is
+    /// thus always None.
+    capture_index_to_name: Vec<Vec<Option<Arc<str>>>>,
+    /// A representation of equivalence classes over the transitions in this
+    /// NFA. Two bytes in the same equivalence class must not discriminate
+    /// between a match and a non-match. This map can be used to shrink the
+    /// total size of a DFA's transition table with a small match-time cost.
+    ///
+    /// Note that the NFA's transitions are *not* defined in terms of these
+    /// equivalence classes. The NFA's transitions are defined on the original
+    /// byte values. For the most part, this is because they wouldn't really
+    /// help the NFA much since the NFA already uses a sparse representation
+    /// to represent transitions. Byte classes are most effective in a dense
+    /// representation.
+    byte_class_set: ByteClassSet,
+    /// Various facts about this NFA, which can be used to improve failure
+    /// modes (e.g., rejecting DFA construction if an NFA has Unicode word
+    /// boundaries) or for performing optimizations (avoiding an increase in
+    /// states if there are no look-around states).
+    facts: Facts,
+    /// Heap memory used indirectly by NFA states. Since each state might use
+    /// a different amount of heap, we need to keep track of this
+    /// incrementally.
+    memory_states: usize,
+}
+
+impl NFA {
+    pub fn config() -> Config {
+        Config::new()
+    }
+
+    pub fn builder() -> Builder {
+        Builder::new()
+    }
+
+    /// Returns an NFA with no states. Its match semantics are unspecified.
+    ///
+    /// An empty NFA is useful as a starting point for building one. It is
+    /// itself not intended to be used for matching. For example, its starting
+    /// state identifiers are configured to be `0`, but since it has no
+    /// states, the identifiers are invalid.
+    ///
+    /// If you need an NFA that never matches anything and can be correctly
+    /// used for matching, use [`NFA::never_match`].
+    #[inline]
+    pub fn empty() -> NFA {
+        NFA {
+            states: vec![],
+            start_anchored: StateID::ZERO,
+            start_unanchored: StateID::ZERO,
+            start_pattern: vec![],
+            patterns_to_slots: vec![],
+            capture_name_to_index: vec![],
+            capture_index_to_name: vec![],
+            byte_class_set: ByteClassSet::empty(),
+            facts: Facts::default(),
+            memory_states: 0,
+        }
+    }
+
+    /// Returns an NFA with a single regex that always matches at every
+    /// position.
+    #[inline]
+    pub fn always_match() -> NFA {
+        let mut nfa = NFA::empty();
+        // Since we're only adding one pattern, these are guaranteed to work.
+        let start = nfa.add_match().unwrap();
+        assert_eq!(start.as_usize(), 0);
+        let pid = nfa.finish_pattern(start).unwrap();
+        assert_eq!(pid.as_usize(), 0);
+        nfa
+    }
+
+    /// Returns an NFA that never matches at any position. It contains no
+    /// regexes.
+    #[inline]
+    pub fn never_match() -> NFA {
+        let mut nfa = NFA::empty();
+        // Since we're only adding one state, this can never fail.
+        nfa.add_fail().unwrap();
+        nfa
+    }
+
+    /// Return the number of states in this NFA.
+    ///
+    /// This is guaranteed to be no bigger than [`StateID::LIMIT`].
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.states.len()
+    }
+
+    /// Returns the total number of distinct match states in this NFA.
+    /// Stated differently, this returns the total number of regex patterns
+    /// used to build this NFA.
+    ///
+    /// This may return zero if the NFA was constructed with no patterns. In
+    /// this case, and only this case, the NFA can never produce a match for
+    /// any input.
+    ///
+    /// This is guaranteed to be no bigger than [`PatternID::LIMIT`].
+ #[inline] + pub fn pattern_len(&self) -> usize { + self.start_pattern.len() + } + + /// Returns the pattern ID of the pattern currently being compiled by this + /// NFA. + fn current_pattern_id(&self) -> PatternID { + // This always works because we never permit more patterns in + // 'start_pattern' than can be addressed by PatternID. Also, we only + // add a new entry to 'start_pattern' once we finish compiling a + // pattern. Thus, the length refers to the ID of the current pattern + // being compiled. + PatternID::new(self.start_pattern.len()).unwrap() + } + + /// Returns the total number of capturing groups in this NFA. + /// + /// This includes the special 0th capture group that is always present and + /// captures the start and end offset of the entire match. + /// + /// This is a convenience routine for `nfa.capture_slot_len() / 2`. + #[inline] + pub fn capture_len(&self) -> usize { + let slots = self.capture_slot_len(); + // This assert is guaranteed to pass since the NFA construction process + // guarantees that it is always true. + assert_eq!(slots % 2, 0, "capture slots must be divisible by 2"); + slots / 2 + } + + /// Returns the total number of capturing slots in this NFA. + /// + /// This value is guaranteed to be a multiple of 2. (Where each capturing + /// group has precisely two capturing slots in the NFA.) + #[inline] + pub fn capture_slot_len(&self) -> usize { + self.patterns_to_slots.last().map_or(0, |r| r.end) + } + + /// Return a range of capture slots for the given pattern. + /// + /// The range returned is guaranteed to be contiguous with ranges for + /// adjacent patterns. + /// + /// This panics if the given pattern ID is greater than or equal to the + /// number of patterns in this NFA. + #[inline] + pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> { + self.patterns_to_slots[pid].clone() + } + + /// Return the capture group index corresponding to the given name in the + /// given pattern. If no such capture group name exists in the given + /// pattern, then this returns `None`. + /// + /// If the given pattern ID is invalid, then this panics. + #[inline] + pub fn capture_name_to_index( + &self, + pid: PatternID, + name: &str, + ) -> Option<usize> { + assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID"); + self.capture_name_to_index[pid].get(name).cloned() + } + + // TODO: add iterators over capture group names. + // Do we also permit indexing? + + /// Returns an iterator over all pattern IDs in this NFA. + #[inline] + pub fn patterns(&self) -> PatternIter { + PatternIter { + it: PatternID::iter(self.pattern_len()), + _marker: core::marker::PhantomData, + } + } + + /// Return the ID of the initial anchored state of this NFA. + #[inline] + pub fn start_anchored(&self) -> StateID { + self.start_anchored + } + + /// Set the anchored starting state ID for this NFA. + #[inline] + pub fn set_start_anchored(&mut self, id: StateID) { + self.start_anchored = id; + } + + /// Return the ID of the initial unanchored state of this NFA. + #[inline] + pub fn start_unanchored(&self) -> StateID { + self.start_unanchored + } + + /// Set the unanchored starting state ID for this NFA. + #[inline] + pub fn set_start_unanchored(&mut self, id: StateID) { + self.start_unanchored = id; + } + + /// Return the ID of the initial anchored state for the given pattern. + /// + /// If the pattern doesn't exist in this NFA, then this panics. 
+ #[inline] + pub fn start_pattern(&self, pid: PatternID) -> StateID { + self.start_pattern[pid] + } + + /// Get the byte class set for this NFA. + #[inline] + pub fn byte_class_set(&self) -> &ByteClassSet { + &self.byte_class_set + } + + /// Return a reference to the NFA state corresponding to the given ID. + #[inline] + pub fn state(&self, id: StateID) -> &State { + &self.states[id] + } + + /// Returns a slice of all states in this NFA. + /// + /// The slice returned may be indexed by a `StateID` generated by `add`. + #[inline] + pub fn states(&self) -> &[State] { + &self.states + } + + #[inline] + pub fn is_always_start_anchored(&self) -> bool { + self.start_anchored() == self.start_unanchored() + } + + #[inline] + pub fn has_any_look(&self) -> bool { + self.facts.has_any_look() + } + + #[inline] + pub fn has_any_anchor(&self) -> bool { + self.facts.has_any_anchor() + } + + #[inline] + pub fn has_word_boundary(&self) -> bool { + self.has_word_boundary_unicode() || self.has_word_boundary_ascii() + } + + #[inline] + pub fn has_word_boundary_unicode(&self) -> bool { + self.facts.has_word_boundary_unicode() + } + + #[inline] + pub fn has_word_boundary_ascii(&self) -> bool { + self.facts.has_word_boundary_ascii() + } + + /// Returns the memory usage, in bytes, of this NFA. + /// + /// This does **not** include the stack size used up by this NFA. To + /// compute that, use `std::mem::size_of::<NFA>()`. + #[inline] + pub fn memory_usage(&self) -> usize { + self.states.len() * mem::size_of::<State>() + + self.memory_states + + self.start_pattern.len() * mem::size_of::<StateID>() + } + + // Why do we define a bunch of 'add_*' routines below instead of just + // defining a single 'add' routine that accepts a 'State'? Indeed, for most + // of the 'add_*' routines below, such a simple API would be more than + // appropriate. Unfortunately, adding capture states and, to a lesser + // extent, match states, is a bit more complex. Namely, when we add a + // capture state, we *really* want to know the corresponding capture + // group's name and index and what not, so that we can update other state + // inside this NFA. But, e.g., the capture group name is not and should + // not be included in 'State::Capture'. So what are our choices? + // + // 1) Define one 'add' and require some additional optional parameters. + // This feels quite ugly, and adds unnecessary complexity to more common + // and simpler cases. + // + // 2) Do what we do below. The sad thing is that our API is bigger with + // more methods. But each method is very specific and hopefully simple. + // + // 3) Define a new enum, say, 'StateWithInfo', or something that permits + // providing both a State and some extra ancillary info in some cases. This + // doesn't seem too bad to me, but seems slightly worse than (2) because of + // the additional type required. + // + // 4) Abandon the idea that we have to specify things like the capture + // group name when we add the Capture state to the NFA. We would then need + // to add other methods that permit the caller to add this additional state + // "out of band." Other than it introducing some additional complexity, I + // decided against this because I wanted the NFA builder API to make it + // as hard as possible to build a bad or invalid NFA. Using the approach + // below, as you'll see, permits us to do a lot of strict checking of our + // inputs and return an error if we see something we don't expect. 
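+    //
+    // As a rough sketch of how these routines compose (a hypothetical usage
+    // mirroring 'always_match' above, not code from this crate), a
+    // single-pattern NFA matching the byte 'a' could be assembled like so.
+    // Note that 'add_state' hands out IDs sequentially, so the match state
+    // added second is known to get ID 1:
+    //
+    //     let mut nfa = NFA::empty();
+    //     let match_id = StateID::new(1).unwrap();
+    //     let start = nfa.add_range(Transition {
+    //         start: b'a',
+    //         end: b'a',
+    //         next: match_id,
+    //     }).unwrap();
+    //     assert_eq!(nfa.add_match().unwrap(), match_id);
+    //     let pid = nfa.finish_pattern(start).unwrap();
+    //     assert_eq!(pid.as_usize(), 0);
+    //     nfa.set_start_anchored(start);
+    //     nfa.set_start_unanchored(start);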
+ + pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> { + self.byte_class_set.set_range(range.start, range.end); + self.add_state(State::Range { range }) + } + + pub fn add_sparse( + &mut self, + sparse: SparseTransitions, + ) -> Result<StateID, Error> { + for range in sparse.ranges.iter() { + self.byte_class_set.set_range(range.start, range.end); + } + self.add_state(State::Sparse(sparse)) + } + + pub fn add_look( + &mut self, + next: StateID, + look: Look, + ) -> Result<StateID, Error> { + self.facts.set_has_any_look(true); + look.add_to_byteset(&mut self.byte_class_set); + match look { + Look::StartLine + | Look::EndLine + | Look::StartText + | Look::EndText => { + self.facts.set_has_any_anchor(true); + } + Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => { + self.facts.set_has_word_boundary_unicode(true); + } + Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => { + self.facts.set_has_word_boundary_ascii(true); + } + } + self.add_state(State::Look { look, next }) + } + + pub fn add_union( + &mut self, + alternates: Box<[StateID]>, + ) -> Result<StateID, Error> { + self.add_state(State::Union { alternates }) + } + + pub fn add_capture_start( + &mut self, + next_id: StateID, + capture_index: u32, + name: Option<Arc<str>>, + ) -> Result<StateID, Error> { + let pid = self.current_pattern_id(); + let capture_index = match usize::try_from(capture_index) { + Err(_) => { + return Err(Error::invalid_capture_index(core::usize::MAX)) + } + Ok(capture_index) => capture_index, + }; + // Do arithmetic to find our absolute slot index first, to make sure + // the index is at least possibly valid (doesn't overflow). + let relative_slot = match capture_index.checked_mul(2) { + Some(relative_slot) => relative_slot, + None => return Err(Error::invalid_capture_index(capture_index)), + }; + let slot = match relative_slot.checked_add(self.capture_slot_len()) { + Some(slot) => slot, + None => return Err(Error::invalid_capture_index(capture_index)), + }; + // Make sure we have space to insert our (pid,index)|-->name mapping. + if pid.as_usize() >= self.capture_index_to_name.len() { + // Note that we require that if you're adding capturing groups, + // then there must be at least one capturing group per pattern. + // Moreover, whenever we expand our space here, it should always + // first be for the first capture group (at index==0). + if pid.as_usize() > self.capture_index_to_name.len() + || capture_index > 0 + { + return Err(Error::invalid_capture_index(capture_index)); + } + self.capture_name_to_index.push(CaptureNameMap::new()); + self.capture_index_to_name.push(vec![]); + } + if capture_index >= self.capture_index_to_name[pid].len() { + // We require that capturing groups are added in correspondence + // to their index. So no discontinuous indices. This is likely + // overly strict, but also makes it simpler to provide guarantees + // about our capturing group data. 
+            if capture_index > self.capture_index_to_name[pid].len() {
+                return Err(Error::invalid_capture_index(capture_index));
+            }
+            self.capture_index_to_name[pid].push(None);
+        }
+        if let Some(ref name) = name {
+            self.capture_name_to_index[pid]
+                .insert(Arc::clone(name), capture_index);
+        }
+        self.capture_index_to_name[pid][capture_index] = name;
+        self.add_state(State::Capture { next: next_id, slot })
+    }
+
+    pub fn add_capture_end(
+        &mut self,
+        next_id: StateID,
+        capture_index: u32,
+    ) -> Result<StateID, Error> {
+        let pid = self.current_pattern_id();
+        let capture_index = match usize::try_from(capture_index) {
+            Err(_) => {
+                return Err(Error::invalid_capture_index(core::usize::MAX))
+            }
+            Ok(capture_index) => capture_index,
+        };
+        // If we haven't already added this capture group via a corresponding
+        // 'add_capture_start' call, then we consider the index given to be
+        // invalid.
+        if pid.as_usize() >= self.capture_index_to_name.len()
+            || capture_index >= self.capture_index_to_name[pid].len()
+        {
+            return Err(Error::invalid_capture_index(capture_index));
+        }
+        // Since we've already confirmed that this capture index is valid
+        // and has a corresponding starting slot, we know the multiplication
+        // has already been done and succeeded.
+        let relative_slot_start = capture_index.checked_mul(2).unwrap();
+        let relative_slot = match relative_slot_start.checked_add(1) {
+            Some(relative_slot) => relative_slot,
+            None => return Err(Error::invalid_capture_index(capture_index)),
+        };
+        let slot = match relative_slot.checked_add(self.capture_slot_len()) {
+            Some(slot) => slot,
+            None => return Err(Error::invalid_capture_index(capture_index)),
+        };
+        self.add_state(State::Capture { next: next_id, slot })
+    }
+
+    pub fn add_fail(&mut self) -> Result<StateID, Error> {
+        self.add_state(State::Fail)
+    }
+
+    /// Add a new match state to this NFA and return its state ID.
+    pub fn add_match(&mut self) -> Result<StateID, Error> {
+        let pattern_id = self.current_pattern_id();
+        let sid = self.add_state(State::Match { id: pattern_id })?;
+        Ok(sid)
+    }
+
+    /// Finish compiling the current pattern and return its identifier. The
+    /// given ID should be the state ID corresponding to the anchored starting
+    /// state for matching this pattern.
+    pub fn finish_pattern(
+        &mut self,
+        start_id: StateID,
+    ) -> Result<PatternID, Error> {
+        // We've gotta make sure that we never permit the user to add more
+        // patterns than we can identify. So if we're already at the limit,
+        // then return an error. This is somewhat non-ideal since this won't
+        // result in an error until trying to complete the compilation of a
+        // pattern instead of starting it.
+        if self.start_pattern.len() >= PatternID::LIMIT {
+            return Err(Error::too_many_patterns(
+                self.start_pattern.len().saturating_add(1),
+            ));
+        }
+        let pid = self.current_pattern_id();
+        self.start_pattern.push(start_id);
+        // Add the number of new slots created by this pattern. This is always
+        // equivalent to '2 * caps.len()', where 'caps.len()' is the number of
+        // new capturing groups introduced by the pattern we're finishing.
+        let new_cap_groups = self
+            .capture_index_to_name
+            .get(pid.as_usize())
+            .map_or(0, |caps| caps.len());
+        let new_slots = match new_cap_groups.checked_mul(2) {
+            Some(new_slots) => new_slots,
+            None => {
+                // Just return the biggest index that we know exists.
+ let index = new_cap_groups.saturating_sub(1); + return Err(Error::invalid_capture_index(index)); + } + }; + let slot_start = self.capture_slot_len(); + self.patterns_to_slots.push(slot_start..(slot_start + new_slots)); + Ok(pid) + } + + fn add_state(&mut self, state: State) -> Result<StateID, Error> { + let id = StateID::new(self.states.len()) + .map_err(|_| Error::too_many_states(self.states.len()))?; + self.memory_states += state.memory_usage(); + self.states.push(state); + Ok(id) + } + + /// Remap the transitions in every state of this NFA using the given map. + /// The given map should be indexed according to state ID namespace used by + /// the transitions of the states currently in this NFA. + /// + /// This may be used during the final phases of an NFA compiler, which + /// turns its intermediate NFA into the final NFA. Remapping may be + /// required to bring the state pointers from the intermediate NFA to the + /// final NFA. + pub fn remap(&mut self, old_to_new: &[StateID]) { + for state in &mut self.states { + state.remap(old_to_new); + } + self.start_anchored = old_to_new[self.start_anchored]; + self.start_unanchored = old_to_new[self.start_unanchored]; + for (pid, id) in self.start_pattern.iter_mut().with_pattern_ids() { + *id = old_to_new[*id]; + } + } + + /// Clear this NFA such that it has zero states and is otherwise "empty." + /// + /// An empty NFA is useful as a starting point for building one. It is + /// itself not intended to be used for matching. For example, its starting + /// state identifiers are configured to be `0`, but since it has no states, + /// the identifiers are invalid. + pub fn clear(&mut self) { + self.states.clear(); + self.start_anchored = StateID::ZERO; + self.start_unanchored = StateID::ZERO; + self.start_pattern.clear(); + self.patterns_to_slots.clear(); + self.capture_name_to_index.clear(); + self.capture_index_to_name.clear(); + self.byte_class_set = ByteClassSet::empty(); + self.facts = Facts::default(); + self.memory_states = 0; + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "thompson::NFA(")?; + for (sid, state) in self.states.iter().with_state_ids() { + let status = if sid == self.start_anchored { + '^' + } else if sid == self.start_unanchored { + '>' + } else { + ' ' + }; + writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; + } + if self.pattern_len() > 1 { + writeln!(f, "")?; + for pid in self.patterns() { + let sid = self.start_pattern(pid); + writeln!( + f, + "START({:06?}): {:?}", + pid.as_usize(), + sid.as_usize() + )?; + } + } + writeln!(f, "")?; + writeln!( + f, + "transition equivalence classes: {:?}", + self.byte_class_set().byte_classes() + )?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// A state in a final compiled NFA. +#[derive(Clone, Eq, PartialEq)] +pub enum State { + /// A state that transitions to `next` if and only if the current input + /// byte is in the range `[start, end]` (inclusive). + /// + /// This is a special case of Sparse in that it encodes only one transition + /// (and therefore avoids the allocation). + Range { range: Transition }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. Transitions are ordered lexicographically by input range. As + /// such, this may only be used when every transition has equal priority. + /// (In practice, this is only used for encoding UTF-8 automata.) + Sparse(SparseTransitions), + /// A conditional epsilon transition satisfied via some sort of + /// look-around. 
+    Look { look: Look, next: StateID },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via earlier transitions
+    /// are preferred over later transitions.
+    Union { alternates: Box<[StateID]> },
+    /// An empty state that records a capture location.
+    ///
+    /// From the perspective of finite automata, this is precisely equivalent
+    /// to an epsilon transition, but serves the purpose of instructing NFA
+    /// simulations to record additional state when the finite state machine
+    /// passes through this epsilon transition.
+    ///
+    /// These transitions are treated as epsilon transitions with no
+    /// additional effects in DFAs.
+    ///
+    /// 'slot' in this context refers to the specific capture group offset
+    /// that is being recorded. Each capturing group has two slots
+    /// corresponding to the start and end of the matching portion of that
+    /// group.
+    Capture { next: StateID, slot: usize },
+    /// A state that cannot be transitioned out of. If a search reaches this
+    /// state, then no match is possible and the search should terminate.
+    Fail,
+    /// A match state. There is exactly one such occurrence of this state for
+    /// each regex compiled into the NFA.
+    Match { id: PatternID },
+}
+
+impl State {
+    /// Returns true if and only if this state contains one or more epsilon
+    /// transitions.
+    #[inline]
+    pub fn is_epsilon(&self) -> bool {
+        match *self {
+            State::Range { .. }
+            | State::Sparse { .. }
+            | State::Fail
+            | State::Match { .. } => false,
+            State::Look { .. }
+            | State::Union { .. }
+            | State::Capture { .. } => true,
+        }
+    }
+
+    /// Returns the heap memory usage of this NFA state in bytes.
+    fn memory_usage(&self) -> usize {
+        match *self {
+            State::Range { .. }
+            | State::Look { .. }
+            | State::Capture { .. }
+            | State::Match { .. }
+            | State::Fail => 0,
+            State::Sparse(SparseTransitions { ref ranges }) => {
+                ranges.len() * mem::size_of::<Transition>()
+            }
+            State::Union { ref alternates } => {
+                alternates.len() * mem::size_of::<StateID>()
+            }
+        }
+    }
+
+    /// Remap the transitions in this state using the given map. Namely, the
+    /// given map should be indexed according to the transitions currently
+    /// in this state.
+    ///
+    /// This is used during the final phase of the NFA compiler, which turns
+    /// its intermediate NFA into the final NFA.
+    fn remap(&mut self, remap: &[StateID]) {
+        match *self {
+            State::Range { ref mut range } => range.next = remap[range.next],
+            State::Sparse(SparseTransitions { ref mut ranges }) => {
+                for r in ranges.iter_mut() {
+                    r.next = remap[r.next];
+                }
+            }
+            State::Look { ref mut next, .. } => *next = remap[*next],
+            State::Union { ref mut alternates } => {
+                for alt in alternates.iter_mut() {
+                    *alt = remap[*alt];
+                }
+            }
+            State::Capture { ref mut next, .. } => *next = remap[*next],
+            State::Fail => {}
+            State::Match { ..
} => {} + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + State::Range { ref range } => range.fmt(f), + State::Sparse(SparseTransitions { ref ranges }) => { + let rs = ranges + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "sparse({})", rs) + } + State::Look { ref look, next } => { + write!(f, "{:?} => {:?}", look, next.as_usize()) + } + State::Union { ref alternates } => { + let alts = alternates + .iter() + .map(|id| format!("{:?}", id.as_usize())) + .collect::<Vec<String>>() + .join(", "); + write!(f, "alt({})", alts) + } + State::Capture { next, slot } => { + write!(f, "capture({:?}) => {:?}", slot, next.as_usize()) + } + State::Fail => write!(f, "FAIL"), + State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()), + } + } +} + +/// A collection of facts about an NFA. +/// +/// There are no real cohesive principles behind what gets put in here. For +/// the most part, it is implementation driven. +#[derive(Clone, Copy, Debug, Default)] +struct Facts { + /// Various yes/no facts about this NFA. + bools: u16, +} + +impl Facts { + define_bool!(0, has_any_look, set_has_any_look); + define_bool!(1, has_any_anchor, set_has_any_anchor); + define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode); + define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii); +} + +/// A sequence of transitions used to represent a sparse state. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SparseTransitions { + pub ranges: Box<[Transition]>, +} + +impl SparseTransitions { + pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { + haystack.get(at).and_then(|&b| self.matches_byte(b)) + } + + pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> { + unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) + } + + pub fn matches_byte(&self, byte: u8) -> Option<StateID> { + for t in self.ranges.iter() { + if t.start > byte { + break; + } else if t.matches_byte(byte) { + return Some(t.next); + } + } + None + + /* + // This is an alternative implementation that uses binary search. In + // some ad hoc experiments, like + // + // smallishru=OpenSubtitles2018.raw.sample.smallish.ru + // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' + // + // I could not observe any improvement, and in fact, things seemed to + // be a bit slower. + self.ranges + .binary_search_by(|t| { + if t.end < byte { + core::cmp::Ordering::Less + } else if t.start > byte { + core::cmp::Ordering::Greater + } else { + core::cmp::Ordering::Equal + } + }) + .ok() + .map(|i| self.ranges[i].next) + */ + } +} + +/// A transition to another state, only if the given byte falls in the +/// inclusive range specified. 
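+///
+/// For example, a transition with `start: b'0'` and `end: b'9'` is followed
+/// exactly when the current input byte is an ASCII digit, per the
+/// `start <= byte && byte <= end` check in `matches_byte` below.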
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+    pub start: u8,
+    pub end: u8,
+    pub next: StateID,
+}
+
+impl Transition {
+    pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
+        haystack.get(at).map_or(false, |&b| self.matches_byte(b))
+    }
+
+    pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
+        unit.as_u8().map_or(false, |byte| self.matches_byte(byte))
+    }
+
+    pub fn matches_byte(&self, byte: u8) -> bool {
+        self.start <= byte && byte <= self.end
+    }
+}
+
+impl fmt::Debug for Transition {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use crate::util::DebugByte;
+
+        let Transition { start, end, next } = *self;
+        if start == end {
+            write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
+        } else {
+            write!(
+                f,
+                "{:?}-{:?} => {:?}",
+                DebugByte(start),
+                DebugByte(end),
+                next.as_usize(),
+            )
+        }
+    }
+}
+
+/// A conditional NFA epsilon transition.
+///
+/// A simulation of the NFA can only move through this epsilon transition if
+/// the current position satisfies some look-around property. Some assertions
+/// are look-behind (StartLine, StartText), some assertions are look-ahead
+/// (EndLine, EndText) while other assertions are both look-behind and
+/// look-ahead (WordBoundary*).
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Look {
+    /// The previous position is either `\n` or the current position is the
+    /// beginning of the haystack (i.e., at position `0`).
+    StartLine = 1 << 0,
+    /// The next position is either `\n` or the current position is the end of
+    /// the haystack (i.e., at position `haystack.len()`).
+    EndLine = 1 << 1,
+    /// The current position is the beginning of the haystack (i.e., at
+    /// position `0`).
+    StartText = 1 << 2,
+    /// The current position is the end of the haystack (i.e., at position
+    /// `haystack.len()`).
+    EndText = 1 << 3,
+    /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])`
+    /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only
+    /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if
+    /// `i=haystack.len()`, then `is_word(n)=false`.
+    WordBoundaryUnicode = 1 << 4,
+    /// Same as for `WordBoundaryUnicode`, but requires that
+    /// `is_word(p) == is_word(n)`.
+    WordBoundaryUnicodeNegate = 1 << 5,
+    /// When tested at position `i`, where `p=haystack[i-1]` and
+    /// `n=haystack[i]`, this assertion passes if and only if `is_word(p)
+    /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if
+    /// `i=haystack.len()`, then `is_word(n)=false`.
+    WordBoundaryAscii = 1 << 6,
+    /// Same as for `WordBoundaryAscii`, but requires that
+    /// `is_word(p) == is_word(n)`.
+    ///
+    /// Note that it is possible for this assertion to match at positions that
+    /// split the UTF-8 encoding of a codepoint. For this reason, this may only
+    /// be used when UTF-8 mode is disabled in the regex syntax.
+    WordBoundaryAsciiNegate = 1 << 7,
+}
+
+impl Look {
+    #[inline(always)]
+    pub fn matches(&self, bytes: &[u8], at: usize) -> bool {
+        match *self {
+            Look::StartLine => at == 0 || bytes[at - 1] == b'\n',
+            Look::EndLine => at == bytes.len() || bytes[at] == b'\n',
+            Look::StartText => at == 0,
+            Look::EndText => at == bytes.len(),
+            Look::WordBoundaryUnicode => {
+                let word_before = is_word_char_rev(bytes, at);
+                let word_after = is_word_char_fwd(bytes, at);
+                word_before != word_after
+            }
+            Look::WordBoundaryUnicodeNegate => {
+                // This is pretty subtle. Why do we need to do UTF-8 decoding
+                // here? Well...
at time of writing, the is_word_char_{fwd,rev} + // routines will only return true if there is a valid UTF-8 + // encoding of a "word" codepoint, and false in every other + // case (including invalid UTF-8). This means that in regions + // of invalid UTF-8 (which might be a subset of valid UTF-8!), + // it would result in \B matching. While this would be + // questionable in the context of truly invalid UTF-8, it is + // *certainly* wrong to report match boundaries that split the + // encoding of a codepoint. So to work around this, we ensure + // that we can decode a codepoint on either side of `at`. If + // either direction fails, then we don't permit \B to match at + // all. + // + // Now, this isn't exactly optimal from a perf perspective. We + // could try and detect this in is_word_char_{fwd,rev}, but + // it's not clear if it's worth it. \B is, after all, rarely + // used. + // + // And in particular, we do *not* have to do this with \b, + // because \b *requires* that at least one side of `at` be a + // "word" codepoint, which in turn implies one side of `at` + // must be valid UTF-8. This in turn implies that \b can never + // split a valid UTF-8 encoding of a codepoint. In the case + // where one side of `at` is truly invalid UTF-8 and the other + // side IS a word codepoint, then we want \b to match since it + // represents a valid UTF-8 boundary. It also makes sense. For + // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. + let word_before = at > 0 + && match decode_last_utf8(&bytes[..at]) { + None | Some(Err(_)) => return false, + Some(Ok(_)) => is_word_char_rev(bytes, at), + }; + let word_after = at < bytes.len() + && match decode_utf8(&bytes[at..]) { + None | Some(Err(_)) => return false, + Some(Ok(_)) => is_word_char_fwd(bytes, at), + }; + word_before == word_after + } + Look::WordBoundaryAscii => { + let word_before = at > 0 && is_word_byte(bytes[at - 1]); + let word_after = at < bytes.len() && is_word_byte(bytes[at]); + word_before != word_after + } + Look::WordBoundaryAsciiNegate => { + let word_before = at > 0 && is_word_byte(bytes[at - 1]); + let word_after = at < bytes.len() && is_word_byte(bytes[at]); + word_before == word_after + } + } + } + + /// Create a look-around assertion from its corresponding integer (as + /// defined in `Look`). If the given integer does not correspond to any + /// assertion, then None is returned. + fn from_int(n: u8) -> Option<Look> { + match n { + 0b0000_0001 => Some(Look::StartLine), + 0b0000_0010 => Some(Look::EndLine), + 0b0000_0100 => Some(Look::StartText), + 0b0000_1000 => Some(Look::EndText), + 0b0001_0000 => Some(Look::WordBoundaryUnicode), + 0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate), + 0b0100_0000 => Some(Look::WordBoundaryAscii), + 0b1000_0000 => Some(Look::WordBoundaryAsciiNegate), + _ => None, + } + } + + /// Flip the look-around assertion to its equivalent for reverse searches. + fn reversed(&self) -> Look { + match *self { + Look::StartLine => Look::EndLine, + Look::EndLine => Look::StartLine, + Look::StartText => Look::EndText, + Look::EndText => Look::StartText, + Look::WordBoundaryUnicode => Look::WordBoundaryUnicode, + Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate, + Look::WordBoundaryAscii => Look::WordBoundaryAscii, + Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate, + } + } + + /// Split up the given byte classes into equivalence classes in a way that + /// is consistent with this look-around assertion. 
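+    ///
+    /// For example, `StartLine` and `EndLine` are sensitive only to `\n`, so
+    /// they add the range `\n-\n` below, which forces `\n` into its own
+    /// equivalence class.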
+ fn add_to_byteset(&self, set: &mut ByteClassSet) { + match *self { + Look::StartText | Look::EndText => {} + Look::StartLine | Look::EndLine => { + set.set_range(b'\n', b'\n'); + } + Look::WordBoundaryUnicode + | Look::WordBoundaryUnicodeNegate + | Look::WordBoundaryAscii + | Look::WordBoundaryAsciiNegate => { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. This isn't technically correct + // for Unicode word boundaries, but DFAs can't handle those + // anyway, and thus, the byte classes don't need to either + // since they are themselves only used in DFAs. + let iswb = regex_syntax::is_word_byte; + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { + b2 += 1; + } + set.set_range(b1 as u8, (b2 - 1) as u8); + b1 = b2; + } + } + } + } +} + +/// LookSet is a memory-efficient set of look-around assertions. Callers may +/// idempotently insert or remove any look-around assertion from a set. +#[repr(transparent)] +#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub(crate) struct LookSet { + set: u8, +} + +impl LookSet { + /// Return a LookSet from its representation. + pub(crate) fn from_repr(repr: u8) -> LookSet { + LookSet { set: repr } + } + + /// Return a mutable LookSet from a mutable pointer to its representation. + pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet { + // SAFETY: This is safe since a LookSet is repr(transparent) where its + // repr is a u8. + unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) } + } + + /// Return true if and only if this set is empty. + pub(crate) fn is_empty(&self) -> bool { + self.set == 0 + } + + /// Clears this set such that it has no assertions in it. + pub(crate) fn clear(&mut self) { + self.set = 0; + } + + /// Insert the given look-around assertion into this set. If the assertion + /// already exists, then this is a no-op. + pub(crate) fn insert(&mut self, look: Look) { + self.set |= look as u8; + } + + /// Remove the given look-around assertion from this set. If the assertion + /// is not in this set, then this is a no-op. + #[cfg(test)] + pub(crate) fn remove(&mut self, look: Look) { + self.set &= !(look as u8); + } + + /// Return true if and only if the given assertion is in this set. + pub(crate) fn contains(&self, look: Look) -> bool { + (look as u8) & self.set != 0 + } + + /// Subtract the given `other` set from the `self` set and return a new + /// set. + pub(crate) fn subtract(&self, other: LookSet) -> LookSet { + LookSet { set: self.set & !other.set } + } + + /// Return the intersection of the given `other` set with the `self` set + /// and return the resulting set. + pub(crate) fn intersect(&self, other: LookSet) -> LookSet { + LookSet { set: self.set & other.set } + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut members = vec![]; + for i in 0..8 { + let look = match Look::from_int(1 << i) { + None => continue, + Some(look) => look, + }; + if self.contains(look) { + members.push(look); + } + } + f.debug_tuple("LookSet").field(&members).finish() + } +} + +/// An iterator over all pattern IDs in an NFA. +pub struct PatternIter<'a> { + it: PatternIDIter, + /// We explicitly associate a lifetime with this iterator even though we + /// don't actually borrow anything from the NFA. We do this for backward + /// compatibility purposes. 
If we ever do need to borrow something from + /// the NFA, then we can and just get rid of this marker without breaking + /// the public API. + _marker: core::marker::PhantomData<&'a ()>, +} + +impl<'a> Iterator for PatternIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option<PatternID> { + self.it.next() + } +} + +#[cfg(test)] +mod tests { + use super::*; + // TODO: Replace tests using DFA with NFA matching engine once implemented. + use crate::dfa::{dense, Automaton}; + + #[test] + fn always_match() { + let nfa = NFA::always_match(); + let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); + let find = |input, start, end| { + dfa.find_leftmost_fwd_at(None, None, input, start, end) + .unwrap() + .map(|m| m.offset()) + }; + + assert_eq!(Some(0), find(b"", 0, 0)); + assert_eq!(Some(0), find(b"a", 0, 1)); + assert_eq!(Some(1), find(b"a", 1, 1)); + assert_eq!(Some(0), find(b"ab", 0, 2)); + assert_eq!(Some(1), find(b"ab", 1, 2)); + assert_eq!(Some(2), find(b"ab", 2, 2)); + } + + #[test] + fn never_match() { + let nfa = NFA::never_match(); + let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); + let find = |input, start, end| { + dfa.find_leftmost_fwd_at(None, None, input, start, end) + .unwrap() + .map(|m| m.offset()) + }; + + assert_eq!(None, find(b"", 0, 0)); + assert_eq!(None, find(b"a", 0, 1)); + assert_eq!(None, find(b"a", 1, 1)); + assert_eq!(None, find(b"ab", 0, 2)); + assert_eq!(None, find(b"ab", 1, 2)); + assert_eq!(None, find(b"ab", 2, 2)); + } + + #[test] + fn look_set() { + let mut f = LookSet::default(); + assert!(!f.contains(Look::StartText)); + assert!(!f.contains(Look::EndText)); + assert!(!f.contains(Look::StartLine)); + assert!(!f.contains(Look::EndLine)); + assert!(!f.contains(Look::WordBoundaryUnicode)); + assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); + assert!(!f.contains(Look::WordBoundaryAscii)); + assert!(!f.contains(Look::WordBoundaryAsciiNegate)); + + f.insert(Look::StartText); + assert!(f.contains(Look::StartText)); + f.remove(Look::StartText); + assert!(!f.contains(Look::StartText)); + + f.insert(Look::EndText); + assert!(f.contains(Look::EndText)); + f.remove(Look::EndText); + assert!(!f.contains(Look::EndText)); + + f.insert(Look::StartLine); + assert!(f.contains(Look::StartLine)); + f.remove(Look::StartLine); + assert!(!f.contains(Look::StartLine)); + + f.insert(Look::EndLine); + assert!(f.contains(Look::EndLine)); + f.remove(Look::EndLine); + assert!(!f.contains(Look::EndLine)); + + f.insert(Look::WordBoundaryUnicode); + assert!(f.contains(Look::WordBoundaryUnicode)); + f.remove(Look::WordBoundaryUnicode); + assert!(!f.contains(Look::WordBoundaryUnicode)); + + f.insert(Look::WordBoundaryUnicodeNegate); + assert!(f.contains(Look::WordBoundaryUnicodeNegate)); + f.remove(Look::WordBoundaryUnicodeNegate); + assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); + + f.insert(Look::WordBoundaryAscii); + assert!(f.contains(Look::WordBoundaryAscii)); + f.remove(Look::WordBoundaryAscii); + assert!(!f.contains(Look::WordBoundaryAscii)); + + f.insert(Look::WordBoundaryAsciiNegate); + assert!(f.contains(Look::WordBoundaryAsciiNegate)); + f.remove(Look::WordBoundaryAsciiNegate); + assert!(!f.contains(Look::WordBoundaryAsciiNegate)); + } + + #[test] + fn look_matches_start_line() { + let look = Look::StartLine; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 0)); + assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("\na"), 1)); + + assert!(!look.matches(B("a"), 1)); + 
assert!(!look.matches(B("a\na"), 1)); + } + + #[test] + fn look_matches_end_line() { + let look = Look::EndLine; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("\na"), 0)); + assert!(look.matches(B("\na"), 2)); + assert!(look.matches(B("a\na"), 1)); + + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a\na"), 0)); + assert!(!look.matches(B("a\na"), 2)); + } + + #[test] + fn look_matches_start_text() { + let look = Look::StartText; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 0)); + assert!(look.matches(B("a"), 0)); + + assert!(!look.matches(B("\n"), 1)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a\na"), 1)); + } + + #[test] + fn look_matches_end_text() { + let look = Look::EndText; + + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("\n"), 1)); + assert!(look.matches(B("\na"), 2)); + + assert!(!look.matches(B("\na"), 0)); + assert!(!look.matches(B("a\na"), 1)); + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("\na"), 1)); + assert!(!look.matches(B("a\na"), 0)); + assert!(!look.matches(B("a\na"), 2)); + } + + #[test] + fn look_matches_word_unicode() { + let look = Look::WordBoundaryUnicode; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("a"), 1)); + assert!(look.matches(B("a "), 1)); + assert!(look.matches(B(" a "), 1)); + assert!(look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(look.matches(B("𝛃"), 0)); + assert!(look.matches(B("𝛃"), 4)); + assert!(look.matches(B("𝛃 "), 4)); + assert!(look.matches(B(" 𝛃 "), 1)); + assert!(look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(look.matches(B("𝛃𐆀"), 0)); + assert!(look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. + assert!(!look.matches(B(""), 0)); + assert!(!look.matches(B("ab"), 1)); + assert!(!look.matches(B("a "), 2)); + assert!(!look.matches(B(" a "), 0)); + assert!(!look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(!look.matches(B("𝛃b"), 4)); + assert!(!look.matches(B("𝛃 "), 5)); + assert!(!look.matches(B(" 𝛃 "), 0)); + assert!(!look.matches(B(" 𝛃 "), 6)); + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + assert!(!look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_ascii() { + let look = Look::WordBoundaryAscii; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(look.matches(B("a"), 0)); + assert!(look.matches(B("a"), 1)); + assert!(look.matches(B("a "), 1)); + assert!(look.matches(B(" a "), 1)); + assert!(look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. Since this is + // an ASCII word boundary, none of these match. 
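+        // (All four UTF-8 bytes of 𝛃 are >= 0x80 and ASCII \w contains no
+        // such bytes, so no ASCII word boundary can exist at these offsets.)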
+ assert!(!look.matches(B("𝛃"), 0)); + assert!(!look.matches(B("𝛃"), 4)); + assert!(!look.matches(B("𝛃 "), 4)); + assert!(!look.matches(B(" 𝛃 "), 1)); + assert!(!look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. Again, since + // this is an ASCII word boundary, none of these match. + assert!(!look.matches(B("𝛃𐆀"), 0)); + assert!(!look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. + assert!(!look.matches(B(""), 0)); + assert!(!look.matches(B("ab"), 1)); + assert!(!look.matches(B("a "), 2)); + assert!(!look.matches(B(" a "), 0)); + assert!(!look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(look.matches(B("𝛃b"), 4)); + assert!(!look.matches(B("𝛃 "), 5)); + assert!(!look.matches(B(" 𝛃 "), 0)); + assert!(!look.matches(B(" 𝛃 "), 6)); + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + assert!(!look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_unicode_negate() { + let look = Look::WordBoundaryUnicodeNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. + assert!(!look.matches(B("a"), 0)); + assert!(!look.matches(B("a"), 1)); + assert!(!look.matches(B("a "), 1)); + assert!(!look.matches(B(" a "), 1)); + assert!(!look.matches(B(" a "), 2)); + + // Unicode word boundaries with a non-ASCII codepoint. + assert!(!look.matches(B("𝛃"), 0)); + assert!(!look.matches(B("𝛃"), 4)); + assert!(!look.matches(B("𝛃 "), 4)); + assert!(!look.matches(B(" 𝛃 "), 1)); + assert!(!look.matches(B(" 𝛃 "), 5)); + + // Unicode word boundaries between non-ASCII codepoints. + assert!(!look.matches(B("𝛃𐆀"), 0)); + assert!(!look.matches(B("𝛃𐆀"), 4)); + + // Non word boundaries for ASCII. + assert!(look.matches(B(""), 0)); + assert!(look.matches(B("ab"), 1)); + assert!(look.matches(B("a "), 2)); + assert!(look.matches(B(" a "), 0)); + assert!(look.matches(B(" a "), 3)); + + // Non word boundaries with a non-ASCII codepoint. + assert!(look.matches(B("𝛃b"), 4)); + assert!(look.matches(B("𝛃 "), 5)); + assert!(look.matches(B(" 𝛃 "), 0)); + assert!(look.matches(B(" 𝛃 "), 6)); + // These don't match because they could otherwise return an offset that + // splits the UTF-8 encoding of a codepoint. + assert!(!look.matches(B("𝛃"), 1)); + assert!(!look.matches(B("𝛃"), 2)); + assert!(!look.matches(B("𝛃"), 3)); + + // Non word boundaries with non-ASCII codepoints. These also don't + // match because they could otherwise return an offset that splits the + // UTF-8 encoding of a codepoint. + assert!(!look.matches(B("𝛃𐆀"), 1)); + assert!(!look.matches(B("𝛃𐆀"), 2)); + assert!(!look.matches(B("𝛃𐆀"), 3)); + assert!(!look.matches(B("𝛃𐆀"), 5)); + assert!(!look.matches(B("𝛃𐆀"), 6)); + assert!(!look.matches(B("𝛃𐆀"), 7)); + // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end + // of the haystack. So the "end" of the haystack isn't a word and 𐆀 + // isn't a word, thus, \B matches. + assert!(look.matches(B("𝛃𐆀"), 8)); + } + + #[test] + fn look_matches_word_ascii_negate() { + let look = Look::WordBoundaryAsciiNegate; + + // \xF0\x9D\x9B\x83 = 𝛃 (in \w) + // \xF0\x90\x86\x80 = 𐆀 (not in \w) + + // Simple ASCII word boundaries. 
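+        // (Each of these positions is an ASCII word boundary, so the negated
+        // assertion must not match at them.)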
+        assert!(!look.matches(B("a"), 0));
+        assert!(!look.matches(B("a"), 1));
+        assert!(!look.matches(B("a "), 1));
+        assert!(!look.matches(B(" a "), 1));
+        assert!(!look.matches(B(" a "), 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint. Since ASCII
+        // \w can never match a non-ASCII byte, these positions are not word
+        // boundaries, and so the negated assertion matches at all of them.
+        assert!(look.matches(B("𝛃"), 0));
+        assert!(look.matches(B("𝛃"), 4));
+        assert!(look.matches(B("𝛃 "), 4));
+        assert!(look.matches(B(" 𝛃 "), 1));
+        assert!(look.matches(B(" 𝛃 "), 5));
+
+        // Unicode word boundaries between non-ASCII codepoints. Again, these
+        // are not ASCII word boundaries, so the negated assertion matches.
+        assert!(look.matches(B("𝛃𐆀"), 0));
+        assert!(look.matches(B("𝛃𐆀"), 4));
+
+        // Non word boundaries for ASCII.
+        assert!(look.matches(B(""), 0));
+        assert!(look.matches(B("ab"), 1));
+        assert!(look.matches(B("a "), 2));
+        assert!(look.matches(B(" a "), 0));
+        assert!(look.matches(B(" a "), 3));
+
+        // Non word boundaries with a non-ASCII codepoint. (The first case
+        // below is an ASCII word boundary, since \x83 is not a word byte
+        // while b'b' is, so the negated assertion does not match there.)
+        assert!(!look.matches(B("𝛃b"), 4));
+        assert!(look.matches(B("𝛃 "), 5));
+        assert!(look.matches(B(" 𝛃 "), 0));
+        assert!(look.matches(B(" 𝛃 "), 6));
+        assert!(look.matches(B("𝛃"), 1));
+        assert!(look.matches(B("𝛃"), 2));
+        assert!(look.matches(B("𝛃"), 3));
+
+        // Non word boundaries with non-ASCII codepoints.
+        assert!(look.matches(B("𝛃𐆀"), 1));
+        assert!(look.matches(B("𝛃𐆀"), 2));
+        assert!(look.matches(B("𝛃𐆀"), 3));
+        assert!(look.matches(B("𝛃𐆀"), 5));
+        assert!(look.matches(B("𝛃𐆀"), 6));
+        assert!(look.matches(B("𝛃𐆀"), 7));
+        assert!(look.matches(B("𝛃𐆀"), 8));
+    }
+
+    fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] {
+        string.as_ref()
+    }
+}
diff --git a/vendor/regex-automata-0.2.0/src/nfa/thompson/pikevm.rs b/vendor/regex-automata-0.2.0/src/nfa/thompson/pikevm.rs new file mode 100644 index 000000000..7572f9f10 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/nfa/thompson/pikevm.rs @@ -0,0 +1,554 @@
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{
+    nfa::thompson::{self, Error, State, NFA},
+    util::{
+        id::{PatternID, StateID},
+        matchtypes::MultiMatch,
+        sparse_set::SparseSet,
+    },
+};
+
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    anchored: Option<bool>,
+    utf8: Option<bool>,
+}
+
+impl Config {
+    /// Return a new default PikeVM configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    pub fn anchored(mut self, yes: bool) -> Config {
+        self.anchored = Some(yes);
+        self
+    }
+
+    pub fn utf8(mut self, yes: bool) -> Config {
+        self.utf8 = Some(yes);
+        self
+    }
+
+    pub fn get_anchored(&self) -> bool {
+        self.anchored.unwrap_or(false)
+    }
+
+    pub fn get_utf8(&self) -> bool {
+        self.utf8.unwrap_or(true)
+    }
+
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config {
+            anchored: o.anchored.or(self.anchored),
+            utf8: o.utf8.or(self.utf8),
+        }
+    }
+}
+
+/// A builder for a PikeVM.
+#[derive(Clone, Debug)]
+pub struct Builder {
+    config: Config,
+    thompson: thompson::Builder,
+}
+
+impl Builder {
+    /// Create a new PikeVM builder with its default configuration.
+    pub fn new() -> Builder {
+        Builder {
+            config: Config::default(),
+            thompson: thompson::Builder::new(),
+        }
+    }
+
+    pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> {
+        self.build_many(&[pattern])
+    }
+
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<PikeVM, Error> {
+        let nfa = self.thompson.build_many(patterns)?;
+        self.build_from_nfa(Arc::new(nfa))
+    }
+
+    pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> {
+        // TODO: Check that this is correct.
+ // if !cfg!(all( + // feature = "dfa", + // feature = "syntax", + // feature = "unicode-perl" + // )) { + if !cfg!(feature = "syntax") { + if nfa.has_word_boundary_unicode() { + return Err(Error::unicode_word_unavailable()); + } + } + Ok(PikeVM { config: self.config, nfa }) + } + + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a PikeVM directly from a + /// pattern. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like if additional time should be spent + /// shrinking the size of the NFA. + /// + /// These settings only apply when constructing a PikeVM directly from a + /// pattern. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +#[derive(Clone, Debug)] +pub struct PikeVM { + config: Config, + nfa: Arc<NFA>, +} + +impl PikeVM { + pub fn new(pattern: &str) -> Result<PikeVM, Error> { + PikeVM::builder().build(pattern) + } + + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> { + PikeVM::builder().build_many(patterns) + } + + pub fn config() -> Config { + Config::new() + } + + pub fn builder() -> Builder { + Builder::new() + } + + pub fn create_cache(&self) -> Cache { + Cache::new(self.nfa()) + } + + pub fn create_captures(&self) -> Captures { + Captures::new(self.nfa()) + } + + pub fn nfa(&self) -> &Arc<NFA> { + &self.nfa + } + + pub fn find_leftmost_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches::new(self, cache, haystack) + } + + // BREADCRUMBS: + // + // 1) Don't forget about prefilters. + // + // 2) Consider the case of using a PikeVM with an NFA that has Capture + // states, but where we don't want to track capturing groups (other than + // group 0). This potentially saves a lot of copying around and what not. I + // believe the current regex crate does this, for example. The interesting + // bit here is how to handle the case of multiple patterns... + // + // 3) Permit the caller to specify a pattern ID to run an anchored-only + // search on. + // + // 4) How to do overlapping? The way multi-regex support works in the regex + // crate currently is to run the PikeVM until either we reach the end of + // the haystack or when we know all regexes have matched. The latter case + // is probably quite rare, so the common case is likely that we're always + // searching the entire input. The question is: can we emulate that with + // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then + // all we need to do is provide an overlapping API on the PikeVM that + // roughly matches the ones we provide on DFAs. For those APIs, the only + // thing they need over non-overlapping APIs is "caller state." For DFAs, + // the caller state is simple: it contains the last state visited and the + // last match reported. For the PikeVM (and NFAs in general), the "last + // state" is actually a *set* of NFA states. 
So I think what happens here + // is that we can just force the `Cache` to subsume this role. We'll still + // need some additional state to track the last match reported though. + // Because when two or more patterns match at the same location, we need a + // way to know to iterate over them. Although maybe it's not match index we + // need, but the state index of the last NFA state processed in the cache. + // Then we just pick up where we left off. There might be another match + // state, in which case, we report it. + + pub fn find_leftmost_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + caps: &mut Captures, + ) -> Option<MultiMatch> { + let anchored = + self.config.get_anchored() || self.nfa.is_always_start_anchored(); + let mut at = start; + let mut matched_pid = None; + cache.clear(); + 'LOOP: loop { + if cache.clist.set.is_empty() { + if matched_pid.is_some() || (anchored && at > start) { + break 'LOOP; + } + // TODO: prefilter + } + if (!anchored && matched_pid.is_none()) + || cache.clist.set.is_empty() + { + self.epsilon_closure( + &mut cache.clist, + &mut caps.slots, + &mut cache.stack, + self.nfa.start_anchored(), + haystack, + at, + ); + } + for i in 0..cache.clist.set.len() { + let sid = cache.clist.set.get(i); + let pid = match self.step( + &mut cache.nlist, + &mut caps.slots, + cache.clist.caps(sid), + &mut cache.stack, + sid, + haystack, + at, + ) { + None => continue, + Some(pid) => pid, + }; + matched_pid = Some(pid); + break; + } + if at >= end { + break; + } + at += 1; + cache.swap(); + cache.nlist.set.clear(); + } + matched_pid.map(|pid| { + let slots = self.nfa.pattern_slots(pid); + let (start, end) = (slots.start, slots.start + 1); + MultiMatch::new( + pid, + caps.slots[start].unwrap(), + caps.slots[end].unwrap(), + ) + }) + } + + #[inline(always)] + fn step( + &self, + nlist: &mut Threads, + slots: &mut [Slot], + thread_caps: &mut [Slot], + stack: &mut Vec<FollowEpsilon>, + sid: StateID, + haystack: &[u8], + at: usize, + ) -> Option<PatternID> { + match *self.nfa.state(sid) { + State::Fail + | State::Look { .. } + | State::Union { .. } + | State::Capture { .. } => None, + State::Range { ref range } => { + if range.matches(haystack, at) { + self.epsilon_closure( + nlist, + thread_caps, + stack, + range.next, + haystack, + at + 1, + ); + } + None + } + State::Sparse(ref sparse) => { + if let Some(next) = sparse.matches(haystack, at) { + self.epsilon_closure( + nlist, + thread_caps, + stack, + next, + haystack, + at + 1, + ); + } + None + } + State::Match { id } => { + slots.copy_from_slice(thread_caps); + Some(id) + } + } + } + + #[inline(always)] + fn epsilon_closure( + &self, + nlist: &mut Threads, + thread_caps: &mut [Slot], + stack: &mut Vec<FollowEpsilon>, + sid: StateID, + haystack: &[u8], + at: usize, + ) { + stack.push(FollowEpsilon::StateID(sid)); + while let Some(frame) = stack.pop() { + match frame { + FollowEpsilon::StateID(sid) => { + self.epsilon_closure_step( + nlist, + thread_caps, + stack, + sid, + haystack, + at, + ); + } + FollowEpsilon::Capture { slot, pos } => { + thread_caps[slot] = pos; + } + } + } + } + + #[inline(always)] + fn epsilon_closure_step( + &self, + nlist: &mut Threads, + thread_caps: &mut [Slot], + stack: &mut Vec<FollowEpsilon>, + mut sid: StateID, + haystack: &[u8], + at: usize, + ) { + loop { + if !nlist.set.insert(sid) { + return; + } + match *self.nfa.state(sid) { + State::Fail + | State::Range { .. } + | State::Sparse { .. } + | State::Match { .. 
} => { + let t = &mut nlist.caps(sid); + t.copy_from_slice(thread_caps); + return; + } + State::Look { look, next } => { + if !look.matches(haystack, at) { + return; + } + sid = next; + } + State::Union { ref alternates } => { + sid = match alternates.get(0) { + None => return, + Some(&sid) => sid, + }; + stack.extend( + alternates[1..] + .iter() + .copied() + .rev() + .map(FollowEpsilon::StateID), + ); + } + State::Capture { next, slot } => { + if slot < thread_caps.len() { + stack.push(FollowEpsilon::Capture { + slot, + pos: thread_caps[slot], + }); + thread_caps[slot] = Some(at); + } + sid = next; + } + } + } + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindLeftmostMatches<'r, 'c, 't> { + vm: &'r PikeVM, + cache: &'c mut Cache, + // scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> { + fn new( + vm: &'r PikeVM, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None } + } +} + +impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> { + // type Item = Captures; + type Item = MultiMatch; + + // fn next(&mut self) -> Option<Captures> { + fn next(&mut self) -> Option<MultiMatch> { + if self.last_end > self.text.len() { + return None; + } + let mut caps = self.vm.create_captures(); + let m = self.vm.find_leftmost_at( + self.cache, + self.text, + self.last_end, + self.text.len(), + &mut caps, + )?; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.vm.config.get_utf8() { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. 
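+            // (Worked example of the logic below: for the pattern `a*` over
+            // the haystack "aba", the first call yields [0, 1). The empty
+            // match [1, 1) found next has `end` equal to `last_match`, so it
+            // is skipped and the search resumes at offset 2, yielding [2, 3).
+            // The trailing empty match [3, 3) is skipped for the same
+            // reason.)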
+            if Some(m.end()) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = m.end();
+        }
+        self.last_match = Some(m.end());
+        Some(m)
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Captures {
+    slots: Vec<Slot>,
+}
+
+impl Captures {
+    pub fn new(nfa: &NFA) -> Captures {
+        Captures { slots: vec![None; nfa.capture_slot_len()] }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Cache {
+    stack: Vec<FollowEpsilon>,
+    clist: Threads,
+    nlist: Threads,
+}
+
+type Slot = Option<usize>;
+
+#[derive(Clone, Debug)]
+struct Threads {
+    set: SparseSet,
+    caps: Vec<Slot>,
+    slots_per_thread: usize,
+}
+
+#[derive(Clone, Debug)]
+enum FollowEpsilon {
+    StateID(StateID),
+    Capture { slot: usize, pos: Slot },
+}
+
+impl Cache {
+    pub fn new(nfa: &NFA) -> Cache {
+        Cache {
+            stack: vec![],
+            clist: Threads::new(nfa),
+            nlist: Threads::new(nfa),
+        }
+    }
+
+    fn clear(&mut self) {
+        self.stack.clear();
+        self.clist.set.clear();
+        self.nlist.set.clear();
+    }
+
+    fn swap(&mut self) {
+        core::mem::swap(&mut self.clist, &mut self.nlist);
+    }
+}
+
+impl Threads {
+    fn new(nfa: &NFA) -> Threads {
+        let mut threads = Threads {
+            set: SparseSet::new(0),
+            caps: vec![],
+            slots_per_thread: 0,
+        };
+        threads.resize(nfa);
+        threads
+    }
+
+    fn resize(&mut self, nfa: &NFA) {
+        if nfa.states().len() == self.set.capacity() {
+            return;
+        }
+        self.slots_per_thread = nfa.capture_slot_len();
+        self.set.resize(nfa.states().len());
+        self.caps.resize(self.slots_per_thread * nfa.states().len(), None);
+    }
+
+    fn caps(&mut self, sid: StateID) -> &mut [Slot] {
+        let i = sid.as_usize() * self.slots_per_thread;
+        &mut self.caps[i..i + self.slots_per_thread]
+    }
+}
diff --git a/vendor/regex-automata-0.2.0/src/nfa/thompson/range_trie.rs b/vendor/regex-automata-0.2.0/src/nfa/thompson/range_trie.rs new file mode 100644 index 000000000..92f36ce3a --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/nfa/thompson/range_trie.rs @@ -0,0 +1,1051 @@
+// I've called the primary data structure in this module a "range trie." As far
+// as I can tell, there is no prior art on a data structure like this, however,
+// it's likely someone somewhere has built something like it. Searching for
+// "range trie" turns up the paper "Range Tries for Scalable Address Lookup,"
+// but it does not appear relevant.
+//
+// The range trie is just like a trie in that it is a special case of a
+// deterministic finite state machine. It has states and each state has a set
+// of transitions to other states. It is acyclic, and, like a normal trie,
+// it makes no attempt to reuse common suffixes among its elements. The key
+// difference between a normal trie and a range trie below is that a range trie
+// operates on *contiguous sequences* of bytes instead of singleton bytes.
+// One could say that our alphabet is ranges of bytes instead of bytes
+// themselves, except a key part of range trie construction is splitting ranges
+// apart to ensure there is at most one transition that can be taken for any
+// byte in a given state.
+//
+// I've tried to explain the details of how the range trie works below, so
+// for now, we are left with trying to understand what problem we're trying to
+// solve. Which is itself fairly involved!
+//
+// At the highest level, here's what we want to do. We want to convert a
+// sequence of Unicode codepoints into a finite state machine whose transitions
+// are over *bytes* and *not* Unicode codepoints. We want this because it makes
+// said finite state machines much smaller and much faster to execute.
+// As a simple example, consider a byte oriented automaton for all Unicode
+// scalar values (0x00 through 0x10FFFF, not including surrogate codepoints):
+//
+//     [00-7F]
+//     [C2-DF][80-BF]
+//     [E0-E0][A0-BF][80-BF]
+//     [E1-EC][80-BF][80-BF]
+//     [ED-ED][80-9F][80-BF]
+//     [EE-EF][80-BF][80-BF]
+//     [F0-F0][90-BF][80-BF][80-BF]
+//     [F1-F3][80-BF][80-BF][80-BF]
+//     [F4-F4][80-8F][80-BF][80-BF]
+//
+// (These byte ranges are generated via the regex-syntax::utf8 module, which
+// was based on Russ Cox's code in RE2, which was in turn based on Ken
+// Thompson's implementation of the same idea in his Plan9 implementation of
+// grep.)
+//
+// It should be fairly straightforward to see how one could compile this into
+// a DFA. The sequences are sorted and non-overlapping. Essentially, you could
+// build a trie from this fairly easily. The problem comes when your initial
+// range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class
+// represented by '\w' contains only a tenth of the codepoints that
+// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges
+// as we did above, the list would stretch to 892 entries! This turns into
+// quite a large NFA with a few thousand states. Turning this beast into a DFA
+// takes quite a bit of time. We are thus left with trying to trim down the
+// number of states we produce as early as possible.
+//
+// One approach (used by RE2 and still by the regex crate, at time of writing)
+// is to try to find common suffixes while building NFA states for the above
+// and reuse them. This is very cheap to do and one can control precisely how
+// much extra memory to use for the cache.
+//
+// Another approach, however, is to reuse an algorithm for constructing a
+// *minimal* DFA from a sorted sequence of inputs. I don't want to go into
+// the full details here, but I explain it in more depth in my blog post on
+// FSTs[1]. Note that the algorithm was not invented by me, but was published
+// in a paper by Daciuk et al. in 2000 called "Incremental Construction of
+// Minimal Acyclic Finite-State Automata."[2] Like the suffix cache approach
+// above, it is also possible to control the amount of extra memory one uses,
+// although this usually comes with the cost of sacrificing true minimality.
+// (But it's typically close enough with a reasonably sized cache of states.)
+//
+// The catch is that Daciuk's algorithm only works if you add your keys in
+// lexicographic ascending order. In our case, since we're dealing with ranges,
+// we also need the additional requirement that ranges are either equivalent
+// or do not overlap at all. For example, if one were given the following byte
+// ranges:
+//
+//     [BC-BF][80-BF]
+//     [BC-BF][90-BF]
+//
+// Then Daciuk's algorithm would not work, since there is nothing to handle the
+// fact that the ranges overlap. They would need to be split apart. Thankfully,
+// Thompson's algorithm for producing byte ranges for Unicode codepoint ranges
+// meets both of our requirements. (A proof for this eludes me, but it appears
+// true.)
+//
+// ... however, we would also like to be able to compile UTF-8 automata in
+// reverse. We want this because in order to find the starting location of a
+// match using a DFA, we need to run a second DFA---a reversed version of the
+// forward DFA---backwards to discover the match location.
+// Unfortunately, if we reverse our byte sequences for 0x00-0x10FFFF, we get
+// sequences that can overlap, even if they are sorted:
+//
+//     [00-7F]
+//     [80-BF][80-9F][ED-ED]
+//     [80-BF][80-BF][80-8F][F4-F4]
+//     [80-BF][80-BF][80-BF][F1-F3]
+//     [80-BF][80-BF][90-BF][F0-F0]
+//     [80-BF][80-BF][E1-EC]
+//     [80-BF][80-BF][EE-EF]
+//     [80-BF][A0-BF][E0-E0]
+//     [80-BF][C2-DF]
+//
+// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
+// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
+// simple way to apply Daciuk's algorithm.
+//
+// And thus, the range trie was born. The range trie's only purpose is to take
+// sequences of byte ranges like the ones above, collect them into a trie and
+// then spit them out in a sorted fashion with no overlapping ranges. For
+// example, 0x00-0x10FFFF gets translated to:
+//
+//     [0-7F]
+//     [80-BF][80-9F][80-8F][F1-F3]
+//     [80-BF][80-9F][80-8F][F4]
+//     [80-BF][80-9F][90-BF][F0]
+//     [80-BF][80-9F][90-BF][F1-F3]
+//     [80-BF][80-9F][E1-EC]
+//     [80-BF][80-9F][ED]
+//     [80-BF][80-9F][EE-EF]
+//     [80-BF][A0-BF][80-8F][F1-F3]
+//     [80-BF][A0-BF][80-8F][F4]
+//     [80-BF][A0-BF][90-BF][F0]
+//     [80-BF][A0-BF][90-BF][F1-F3]
+//     [80-BF][A0-BF][E0]
+//     [80-BF][A0-BF][E1-EC]
+//     [80-BF][A0-BF][EE-EF]
+//     [80-BF][C2-DF]
+//
+// We've thus satisfied our requirements for running Daciuk's algorithm. All
+// sequences of ranges are sorted, and any corresponding ranges are either
+// exactly equivalent or non-overlapping.
+//
+// In effect, a range trie is building a DFA from a sequence of arbitrary
+// byte ranges. But it uses an algorithm custom tailored to its input, so it
+// is not as costly as traditional DFA construction. While it is still quite
+// a bit more costly than the forward case (which only needs Daciuk's
+// algorithm), it winds up saving a substantial amount of time if one is doing
+// a full DFA powerset construction later by virtue of producing a much, much
+// smaller NFA.
+//
+// [1] - https://blog.burntsushi.net/transducers/
+// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
+
+use core::{cell::RefCell, fmt, mem, ops::RangeInclusive, u32};
+
+use alloc::{format, string::String, vec, vec::Vec};
+
+use regex_syntax::utf8::Utf8Range;
+
+/// A smaller state ID means more effective use of the CPU cache and less
+/// time spent copying. The implementation below will panic if the state ID
+/// space is exhausted, but in order for that to happen, the range trie itself
+/// would use well over 100GB of memory. Moreover, it's likely impossible
+/// for the state ID space to get that big. In fact, it's likely that even a
+/// u16 would be good enough here. But it's not quite clear how to prove this.
+type StateID = u32;
+
+/// There is only one final state in this trie. Every sequence of byte ranges
+/// added shares the same final state.
+const FINAL: StateID = 0;
+
+/// The root state of the trie.
+const ROOT: StateID = 1;
+
+/// A range trie represents an ordered set of sequences of bytes.
+///
+/// A range trie accepts as input a sequence of byte ranges and merges
+/// them into the existing set such that the trie can produce a sorted
+/// non-overlapping sequence of byte ranges. The sequence emitted corresponds
+/// precisely to the sequence of bytes matched by the given keys, although the
+/// byte ranges themselves may be split at different boundaries.
+///
+/// The order complexity of this data structure seems difficult to analyze.
+/// If the size of a byte is held as a constant, then insertion is clearly
+/// O(n) where n is the number of byte ranges in the input key. However, if
+/// k=256 is our alphabet size, then insertion could be O(k^2 * n). In
+/// particular it seems possible for pathological inputs to cause insertion
+/// to do a lot of work. However, for what we use this data structure for,
+/// there should be no pathological inputs since the ultimate source is always
+/// a sorted set of Unicode scalar value ranges.
+///
+/// Internally, this trie is set up like a finite state machine. Note though
+/// that it is acyclic.
+#[derive(Clone)]
+pub struct RangeTrie {
+    /// The states in this trie. The first is always the shared final state.
+    /// The second is always the root state. Otherwise, there is no
+    /// particular order.
+    states: Vec<State>,
+    /// A free-list of states. When a range trie is cleared, all of its states
+    /// are added to this list. Creating a new state reuses states from this
+    /// list before allocating a new one.
+    free: Vec<State>,
+    /// A stack for traversing this trie to yield sequences of byte ranges in
+    /// lexicographic order.
+    iter_stack: RefCell<Vec<NextIter>>,
+    /// A buffer that stores the current sequence during iteration.
+    iter_ranges: RefCell<Vec<Utf8Range>>,
+    /// A stack used for traversing the trie in order to (deeply) duplicate
+    /// a state. States are recursively duplicated when ranges are split.
+    dupe_stack: Vec<NextDupe>,
+    /// A stack used for traversing the trie during insertion of a new
+    /// sequence of byte ranges.
+    insert_stack: Vec<NextInsert>,
+}
+
+/// A single state in this trie.
+#[derive(Clone)]
+struct State {
+    /// A sorted sequence of non-overlapping transitions to other states. Each
+    /// transition corresponds to a single range of bytes.
+    transitions: Vec<Transition>,
+}
+
+/// A transition is a single range of bytes. If a particular byte is in this
+/// range, then the corresponding machine may transition to the state pointed
+/// to by `next_id`.
+#[derive(Clone)]
+struct Transition {
+    /// The byte range.
+    range: Utf8Range,
+    /// The next state to transition to.
+    next_id: StateID,
+}
+
+impl RangeTrie {
+    /// Create a new empty range trie.
+    pub fn new() -> RangeTrie {
+        let mut trie = RangeTrie {
+            states: vec![],
+            free: vec![],
+            iter_stack: RefCell::new(vec![]),
+            iter_ranges: RefCell::new(vec![]),
+            dupe_stack: vec![],
+            insert_stack: vec![],
+        };
+        trie.clear();
+        trie
+    }
+
+    /// Clear this range trie such that it is empty. Clearing a range trie
+    /// and reusing it can be beneficial because this may reuse allocations.
+    pub fn clear(&mut self) {
+        self.free.extend(self.states.drain(..));
+        self.add_empty(); // final
+        self.add_empty(); // root
+    }
+
+    /// Iterate over all of the sequences of byte ranges in this trie, and
+    /// call the provided function for each sequence. Iteration occurs in
+    /// lexicographic order.
+    pub fn iter<E, F: FnMut(&[Utf8Range]) -> Result<(), E>>(
+        &self,
+        mut f: F,
+    ) -> Result<(), E> {
+        let mut stack = self.iter_stack.borrow_mut();
+        stack.clear();
+        let mut ranges = self.iter_ranges.borrow_mut();
+        ranges.clear();
+
+        // We do iteration in a way that permits us to use a single buffer
+        // for our keys. We iterate in a depth first fashion, while being
+        // careful to expand our frontier as we move deeper in the trie.
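+        //
+        // (Illustrative example: for a trie holding just the two sequences
+        // [00-7F] and [C2-DF][80-BF], `f` is called first with &[[00-7F]]
+        // and then with &[[C2-DF], [80-BF]], since each state keeps its
+        // transitions sorted.)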
+        stack.push(NextIter { state_id: ROOT, tidx: 0 });
+        while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() {
+            // This could be implemented more simply without an inner loop
+            // here, but at the cost of more stack pushes.
+            loop {
+                let state = self.state(state_id);
+                // If we've visited all transitions in this state, then pop
+                // back to the parent state.
+                if tidx >= state.transitions.len() {
+                    ranges.pop();
+                    break;
+                }
+
+                let t = &state.transitions[tidx];
+                ranges.push(t.range);
+                if t.next_id == FINAL {
+                    f(&ranges)?;
+                    ranges.pop();
+                    tidx += 1;
+                } else {
+                    // Expand our frontier. Once we come back to this state
+                    // via the stack, start in on the next transition.
+                    stack.push(NextIter { state_id, tidx: tidx + 1 });
+                    // Otherwise, move to the first transition of the next
+                    // state.
+                    state_id = t.next_id;
+                    tidx = 0;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Inserts a new sequence of ranges into this trie.
+    ///
+    /// The sequence given must be non-empty and must not have a length
+    /// exceeding 4.
+    pub fn insert(&mut self, ranges: &[Utf8Range]) {
+        assert!(!ranges.is_empty());
+        assert!(ranges.len() <= 4);
+
+        let mut stack = mem::replace(&mut self.insert_stack, vec![]);
+        stack.clear();
+
+        stack.push(NextInsert::new(ROOT, ranges));
+        while let Some(next) = stack.pop() {
+            let (state_id, ranges) = (next.state_id(), next.ranges());
+            assert!(!ranges.is_empty());
+
+            let (mut new, rest) = (ranges[0], &ranges[1..]);
+
+            // i corresponds to the position of the existing transition on
+            // which we are operating. Typically, the result is to remove the
+            // transition and replace it with two or more new transitions
+            // corresponding to the partitions generated by splitting the
+            // 'new' with the ith transition's range.
+            let mut i = self.state(state_id).find(new);
+
+            // In this case, there is no overlap *and* the new range is greater
+            // than all existing ranges. So we can just add it to the end.
+            if i == self.state(state_id).transitions.len() {
+                let next_id = NextInsert::push(self, &mut stack, rest);
+                self.add_transition(state_id, new, next_id);
+                continue;
+            }
+
+            // The need for this loop is a bit subtle, but basically, after
+            // we've handled the partitions from our initial split, it's
+            // possible that there will be a partition leftover that overlaps
+            // with a subsequent transition. If so, then we have to repeat
+            // the split process again with the leftovers and that subsequent
+            // transition.
+            'OUTER: loop {
+                let old = self.state(state_id).transitions[i].clone();
+                let split = match Split::new(old.range, new) {
+                    Some(split) => split,
+                    None => {
+                        let next_id = NextInsert::push(self, &mut stack, rest);
+                        self.add_transition_at(i, state_id, new, next_id);
+                        continue;
+                    }
+                };
+                let splits = split.as_slice();
+                // If we only have one partition, then the ranges must be
+                // equivalent. There's nothing to do here for this state, so
+                // just move on to the next one.
+                if splits.len() == 1 {
+                    // ... but only if we have anything left to do.
+                    if !rest.is_empty() {
+                        stack.push(NextInsert::new(old.next_id, rest));
+                    }
+                    break;
+                }
+                // At this point, we know that 'split' is non-empty and there
+                // must be some overlap AND that the two ranges are not
+                // equivalent. Therefore, the existing range MUST be removed
+                // and split up somehow.
Instead of actually doing the removal + // and then a subsequent insertion---with all the memory + // shuffling that entails---we simply overwrite the transition + // at position `i` for the first new transition we want to + // insert. After that, we're forced to do expensive inserts. + let mut first = true; + let mut add_trans = + |trie: &mut RangeTrie, pos, from, range, to| { + if first { + trie.set_transition_at(pos, from, range, to); + first = false; + } else { + trie.add_transition_at(pos, from, range, to); + } + }; + for (j, &srange) in splits.iter().enumerate() { + match srange { + SplitRange::Old(r) => { + // Deep clone the state pointed to by the ith + // transition. This is always necessary since 'old' + // is always coupled with at least a 'both' + // partition. We don't want any new changes made + // via the 'both' partition to impact the part of + // the transition that doesn't overlap with the + // new range. + let dup_id = self.duplicate(old.next_id); + add_trans(self, i, state_id, r, dup_id); + } + SplitRange::New(r) => { + // This is a bit subtle, but if this happens to be + // the last partition in our split, it is possible + // that this overlaps with a subsequent transition. + // If it does, then we must repeat the whole + // splitting process over again with `r` and the + // subsequent transition. + { + let trans = &self.state(state_id).transitions; + if j + 1 == splits.len() + && i < trans.len() + && intersects(r, trans[i].range) + { + new = r; + continue 'OUTER; + } + } + + // ... otherwise, setup exploration for a new + // empty state and add a brand new transition for + // this new range. + let next_id = + NextInsert::push(self, &mut stack, rest); + add_trans(self, i, state_id, r, next_id); + } + SplitRange::Both(r) => { + // Continue adding the remaining ranges on this + // path and update the transition with the new + // range. + if !rest.is_empty() { + stack.push(NextInsert::new(old.next_id, rest)); + } + add_trans(self, i, state_id, r, old.next_id); + } + } + i += 1; + } + // If we've reached this point, then we know that there are + // no subsequent transitions with any overlap. Therefore, we + // can stop processing this range and move on to the next one. + break; + } + } + self.insert_stack = stack; + } + + pub fn add_empty(&mut self) -> StateID { + if self.states.len() as u64 > u32::MAX as u64 { + // This generally should not happen since a range trie is only + // ever used to compile a single sequence of Unicode scalar values. + // If we ever got to this point, we would, at *minimum*, be using + // 96GB in just the range trie alone. + panic!("too many sequences added to range trie"); + } + let id = self.states.len() as StateID; + // If we have some free states available, then use them to avoid + // more allocations. + if let Some(mut state) = self.free.pop() { + state.clear(); + self.states.push(state); + } else { + self.states.push(State { transitions: vec![] }); + } + id + } + + /// Performs a deep clone of the given state and returns the duplicate's + /// state ID. + /// + /// A "deep clone" in this context means that the state given along with + /// recursively all states that it points to are copied. Once complete, + /// the given state ID and the returned state ID share nothing. + /// + /// This is useful during range trie insertion when a new range overlaps + /// with an existing range that is bigger than the new one. 
The part + /// of the existing range that does *not* overlap with the new one is + /// duplicated so that adding the new range to the overlap doesn't disturb + /// the non-overlapping portion. + /// + /// There's one exception: if old_id is the final state, then it is not + /// duplicated and the same final state is returned. This is because all + /// final states in this trie are equivalent. + fn duplicate(&mut self, old_id: StateID) -> StateID { + if old_id == FINAL { + return FINAL; + } + + let mut stack = mem::replace(&mut self.dupe_stack, vec![]); + stack.clear(); + + let new_id = self.add_empty(); + // old_id is the state we're cloning and new_id is the ID of the + // duplicated state for old_id. + stack.push(NextDupe { old_id, new_id }); + while let Some(NextDupe { old_id, new_id }) = stack.pop() { + for i in 0..self.state(old_id).transitions.len() { + let t = self.state(old_id).transitions[i].clone(); + if t.next_id == FINAL { + // All final states are the same, so there's no need to + // duplicate it. + self.add_transition(new_id, t.range, FINAL); + continue; + } + + let new_child_id = self.add_empty(); + self.add_transition(new_id, t.range, new_child_id); + stack.push(NextDupe { + old_id: t.next_id, + new_id: new_child_id, + }); + } + } + self.dupe_stack = stack; + new_id + } + + /// Adds the given transition to the given state. + /// + /// Callers must ensure that all previous transitions in this state + /// are lexicographically smaller than the given range. + fn add_transition( + &mut self, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id) + .transitions + .push(Transition { range, next_id }); + } + + /// Like `add_transition`, except this inserts the transition just before + /// the ith transition. + fn add_transition_at( + &mut self, + i: usize, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id) + .transitions + .insert(i, Transition { range, next_id }); + } + + /// Overwrites the transition at position i with the given transition. + fn set_transition_at( + &mut self, + i: usize, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id).transitions[i] = Transition { range, next_id }; + } + + /// Return an immutable borrow for the state with the given ID. + fn state(&self, id: StateID) -> &State { + &self.states[id as usize] + } + + /// Return a mutable borrow for the state with the given ID. + fn state_mut(&mut self, id: StateID) -> &mut State { + &mut self.states[id as usize] + } +} + +impl State { + /// Find the position at which the given range should be inserted in this + /// state. + /// + /// The position returned is always in the inclusive range + /// [0, transitions.len()]. If 'transitions.len()' is returned, then the + /// given range overlaps with no other range in this state *and* is greater + /// than all of them. + /// + /// For all other possible positions, the given range either overlaps + /// with the transition at that position or is otherwise less than it + /// with no overlap (and is greater than the previous transition). In the + /// former case, careful attention must be paid to inserting this range + /// as a new transition. In the latter case, the range can be inserted as + /// a new transition at the given position without disrupting any other + /// transitions. + fn find(&self, range: Utf8Range) -> usize { + /// Returns the position `i` at which `pred(xs[i])` first returns true + /// such that for all `j >= i`, `pred(xs[j]) == true`. 
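+        /// For example, with `xs = [1, 3, 5, 7]` and `pred = |x| *x >= 4`,
+        /// the position returned is `2`.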
+        /// If `pred` never returns true, then `xs.len()` is returned.
+        ///
+        /// We roll our own binary search because it doesn't seem like the
+        /// standard library's binary search can be used here. Namely, if
+        /// there is an overlapping range, then we want to find the first such
+        /// occurrence, but there may be many. Or at least, it's not quite
+        /// clear to me how to do it.
+        fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
+        where
+            F: FnMut(&T) -> bool,
+        {
+            let (mut left, mut right) = (0, xs.len());
+            while left < right {
+                // Overflow is impossible because xs.len() <= 256.
+                let mid = (left + right) / 2;
+                if pred(&xs[mid]) {
+                    right = mid;
+                } else {
+                    left = mid + 1;
+                }
+            }
+            left
+        }
+
+        // Benchmarks suggest that binary search is just a bit faster than
+        // straight linear search. Specifically when using the debug tool:
+        //
+        //   hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+        binary_search(&self.transitions, |t| range.start <= t.range.end)
+    }
+
+    /// Clear this state such that it has zero transitions.
+    fn clear(&mut self) {
+        self.transitions.clear();
+    }
+}
+
+/// The next state to process during duplication.
+#[derive(Clone, Debug)]
+struct NextDupe {
+    /// The state we want to duplicate.
+    old_id: StateID,
+    /// The ID of the new state that is a duplicate of old_id.
+    new_id: StateID,
+}
+
+/// The next state (and its corresponding transition) that we want to visit
+/// during iteration in lexicographic order.
+#[derive(Clone, Debug)]
+struct NextIter {
+    state_id: StateID,
+    tidx: usize,
+}
+
+/// The next state to process during insertion and any remaining ranges that
+/// we want to add for a particular sequence of ranges. The first such
+/// instance is always the root state along with all ranges given.
+#[derive(Clone, Debug)]
+struct NextInsert {
+    /// The next state to begin inserting ranges. This state should be the
+    /// state at which `ranges[0]` should be inserted.
+    state_id: StateID,
+    /// The ranges to insert. We use a fixed-size array here to avoid an
+    /// allocation.
+    ranges: [Utf8Range; 4],
+    /// The number of valid ranges in the above array.
+    len: u8,
+}
+
+impl NextInsert {
+    /// Create the next item to visit. The given state ID should correspond
+    /// to the state at which the first range in the given slice should be
+    /// inserted. The slice given must not be empty and it must be no longer
+    /// than 4.
+    fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert {
+        let len = ranges.len();
+        assert!(len > 0);
+        assert!(len <= 4);
+
+        let mut tmp = [Utf8Range { start: 0, end: 0 }; 4];
+        tmp[..len].copy_from_slice(ranges);
+        NextInsert { state_id, ranges: tmp, len: len as u8 }
+    }
+
+    /// Push a new empty state to visit along with any remaining ranges that
+    /// still need to be inserted. The ID of the new empty state is returned.
+    ///
+    /// If ranges is empty, then no new state is created and FINAL is
+    /// returned.
+    fn push(
+        trie: &mut RangeTrie,
+        stack: &mut Vec<NextInsert>,
+        ranges: &[Utf8Range],
+    ) -> StateID {
+        if ranges.is_empty() {
+            FINAL
+        } else {
+            let next_id = trie.add_empty();
+            stack.push(NextInsert::new(next_id, ranges));
+            next_id
+        }
+    }
+
+    /// Return the ID of the state to visit.
+    fn state_id(&self) -> StateID {
+        self.state_id
+    }
+
+    /// Return the remaining ranges to insert.
+    fn ranges(&self) -> &[Utf8Range] {
+        &self.ranges[..self.len as usize]
+    }
+}
+
+/// Split represents a partitioning of two ranges into one or more ranges.
+/// This is the secret sauce that makes a range trie work, as it's what tells
+/// us how to deal with two overlapping but unequal ranges during insertion.
+///
+/// Essentially, either two ranges overlap or they don't. If they don't, then
+/// handling insertion is easy: just insert the new range into its
+/// lexicographically correct position. Since it does not overlap with
+/// anything else, no other transitions are impacted by the new range.
+///
+/// If they do overlap though, there are generally three possible cases to
+/// handle:
+///
+/// 1. The part where the two ranges actually overlap, i.e., the intersection.
+/// 2. The part of the existing range that is not in the new range.
+/// 3. The part of the new range that is not in the old range.
+///
+/// (1) is guaranteed to always occur since all overlapping ranges have a
+/// non-empty intersection. If the two ranges are not equivalent, then at
+/// least one of (2) or (3) is guaranteed to occur as well. In some cases,
+/// e.g., `[0-4]` and `[4-9]`, all three cases will occur.
+///
+/// This `Split` type is responsible for providing (1), (2) and (3) for any
+/// possible pair of byte ranges.
+///
+/// As for insertion, for the overlap in (1), the remaining ranges to insert
+/// should be added by following the corresponding transition. However, this
+/// should only be done for the overlapping parts of the range. If there was
+/// a part of the existing range that was not in the new range, then that
+/// existing part must be split off from the transition and duplicated. The
+/// remaining parts of the overlap can then be added to using the new ranges
+/// without disturbing the existing range.
+///
+/// Handling the case for the part of a new range that is not in an existing
+/// range is seemingly easy. Just treat it as if it were a non-overlapping
+/// range. The problem here is that if this new non-overlapping range occurs
+/// after both (1) and (2), then it's possible that it can overlap with the
+/// next transition in the current state. If it does, then the whole process
+/// must be repeated!
+///
+/// # Details of the 3 cases
+///
+/// The following details the various cases that are implemented in code
+/// below. It's plausible that the number of cases is not actually minimal,
+/// but it's important for this code to remain at least somewhat readable.
+///
+/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we
+/// define the following distinct relationships, of which at least one must
+/// apply. The order of these matters, since multiple can match. The first
+/// to match applies.
+///
+/// 1. b < x <=> [a,b] < [x,y]
+/// 2. y < a <=> [x,y] < [a,b]
+///
+/// In the case of (1) and (2), these are the only cases where there is no
+/// overlap. Or otherwise, the intersection of [a,b] and [x,y] is empty. In
+/// order to compute the intersection, one can do [max(a,x), min(b,y)]. The
+/// intersection in all of the following cases is non-empty.
+///
+/// 3. a = x && b = y <=> [a,b] == [x,y]
+/// 4. a = x && b < y <=> [x,y] right-extends [a,b]
+/// 5. b = y && a > x <=> [x,y] left-extends [a,b]
+/// 6. x = a && y < b <=> [a,b] right-extends [x,y]
+/// 7. y = b && x > a <=> [a,b] left-extends [x,y]
+/// 8. a > x && b < y <=> [x,y] covers [a,b]
+/// 9. x > a && y < b <=> [a,b] covers [x,y]
+/// 10. b = x && a < y <=> [a,b] is left-adjacent to [x,y]
+/// 11. y = a && x < b <=> [x,y] is left-adjacent to [a,b]
+/// 12. b > x && b < y <=> [a,b] left-overlaps [x,y]
+/// 13.
y > a && y < b <=> [x,y] left-overlaps [a,b] +/// +/// In cases 3-13, we can form rules that partition the ranges into a +/// non-overlapping ordered sequence of ranges: +/// +/// 3. [a,b] +/// 4. [a,b], [b+1,y] +/// 5. [x,a-1], [a,b] +/// 6. [x,y], [y+1,b] +/// 7. [a,x-1], [x,y] +/// 8. [x,a-1], [a,b], [b+1,y] +/// 9. [a,x-1], [x,y], [y+1,b] +/// 10. [a,b-1], [b,b], [b+1,y] +/// 11. [x,y-1], [y,y], [y+1,b] +/// 12. [a,x-1], [x,b], [b+1,y] +/// 13. [x,a-1], [a,y], [y+1,b] +/// +/// In the code below, we go a step further and identify each of the above +/// outputs as belonging either to the overlap of the two ranges or to one +/// of [a,b] or [x,y] exclusively. +#[derive(Clone, Debug, Eq, PartialEq)] +struct Split { + partitions: [SplitRange; 3], + len: usize, +} + +/// A tagged range indicating how it was derived from a pair of ranges. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum SplitRange { + Old(Utf8Range), + New(Utf8Range), + Both(Utf8Range), +} + +impl Split { + /// Create a partitioning of the given ranges. + /// + /// If the given ranges have an empty intersection, then None is returned. + fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> { + let range = |r: RangeInclusive<u8>| Utf8Range { + start: *r.start(), + end: *r.end(), + }; + let old = |r| SplitRange::Old(range(r)); + let new = |r| SplitRange::New(range(r)); + let both = |r| SplitRange::Both(range(r)); + + // Use same names as the comment above to make it easier to compare. + let (a, b, x, y) = (o.start, o.end, n.start, n.end); + + if b < x || y < a { + // case 1, case 2 + None + } else if a == x && b == y { + // case 3 + Some(Split::parts1(both(a..=b))) + } else if a == x && b < y { + // case 4 + Some(Split::parts2(both(a..=b), new(b + 1..=y))) + } else if b == y && a > x { + // case 5 + Some(Split::parts2(new(x..=a - 1), both(a..=b))) + } else if x == a && y < b { + // case 6 + Some(Split::parts2(both(x..=y), old(y + 1..=b))) + } else if y == b && x > a { + // case 7 + Some(Split::parts2(old(a..=x - 1), both(x..=y))) + } else if a > x && b < y { + // case 8 + Some(Split::parts3(new(x..=a - 1), both(a..=b), new(b + 1..=y))) + } else if x > a && y < b { + // case 9 + Some(Split::parts3(old(a..=x - 1), both(x..=y), old(y + 1..=b))) + } else if b == x && a < y { + // case 10 + Some(Split::parts3(old(a..=b - 1), both(b..=b), new(b + 1..=y))) + } else if y == a && x < b { + // case 11 + Some(Split::parts3(new(x..=y - 1), both(y..=y), old(y + 1..=b))) + } else if b > x && b < y { + // case 12 + Some(Split::parts3(old(a..=x - 1), both(x..=b), new(b + 1..=y))) + } else if y > a && y < b { + // case 13 + Some(Split::parts3(new(x..=a - 1), both(a..=y), old(y + 1..=b))) + } else { + unreachable!() + } + } + + /// Create a new split with a single partition. This only occurs when two + /// ranges are equivalent. + fn parts1(r1: SplitRange) -> Split { + // This value doesn't matter since it is never accessed. + let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 }); + Split { partitions: [r1, nada, nada], len: 1 } + } + + /// Create a new split with two partitions. + fn parts2(r1: SplitRange, r2: SplitRange) -> Split { + // This value doesn't matter since it is never accessed. + let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 }); + Split { partitions: [r1, r2, nada], len: 2 } + } + + /// Create a new split with three partitions. + fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split { + Split { partitions: [r1, r2, r3], len: 3 } + } + + /// Return the partitions in this split as a slice. 
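+    ///
+    /// For example, splitting an old range `[3-6]` with a new range `[5-9]`
+    /// is case 12 above, so the returned slice is
+    /// `[Old(3-4), Both(5-6), New(7-9)]`.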
+ fn as_slice(&self) -> &[SplitRange] { + &self.partitions[..self.len] + } +} + +impl fmt::Debug for RangeTrie { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "")?; + for (i, state) in self.states.iter().enumerate() { + let status = if i == FINAL as usize { '*' } else { ' ' }; + writeln!(f, "{}{:06}: {:?}", status, i, state)?; + } + Ok(()) + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let rs = self + .transitions + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "{}", rs) + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.range.start == self.range.end { + write!(f, "{:02X} => {:02X}", self.range.start, self.next_id) + } else { + write!( + f, + "{:02X}-{:02X} => {:02X}", + self.range.start, self.range.end, self.next_id + ) + } + } +} + +/// Returns true if and only if the given ranges intersect. +fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool { + !(r1.end < r2.start || r2.end < r1.start) +} + +#[cfg(test)] +mod tests { + use core::ops::RangeInclusive; + + use regex_syntax::utf8::Utf8Range; + + use super::*; + + fn r(range: RangeInclusive<u8>) -> Utf8Range { + Utf8Range { start: *range.start(), end: *range.end() } + } + + fn split_maybe( + old: RangeInclusive<u8>, + new: RangeInclusive<u8>, + ) -> Option<Split> { + Split::new(r(old), r(new)) + } + + fn split( + old: RangeInclusive<u8>, + new: RangeInclusive<u8>, + ) -> Vec<SplitRange> { + split_maybe(old, new).unwrap().as_slice().to_vec() + } + + #[test] + fn no_splits() { + // case 1 + assert_eq!(None, split_maybe(0..=1, 2..=3)); + // case 2 + assert_eq!(None, split_maybe(2..=3, 0..=1)); + } + + #[test] + fn splits() { + let range = |r: RangeInclusive<u8>| Utf8Range { + start: *r.start(), + end: *r.end(), + }; + let old = |r| SplitRange::Old(range(r)); + let new = |r| SplitRange::New(range(r)); + let both = |r| SplitRange::Both(range(r)); + + // case 3 + assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]); + assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]); + + // case 4 + assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]); + assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]); + assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]); + + // case 5 + assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]); + assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]); + assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]); + + // case 6 + assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]); + assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]); + assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]); + + // case 7 + assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]); + assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]); + assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]); + + // case 8 + assert_eq!( + split(3..=6, 2..=7), + vec![new(2..=2), both(3..=6), new(7..=7)], + ); + assert_eq!( + split(3..=6, 1..=8), + vec![new(1..=2), both(3..=6), new(7..=8)], + ); + + // case 9 + assert_eq!( + split(2..=7, 3..=6), + vec![old(2..=2), both(3..=6), old(7..=7)], + ); + assert_eq!( + split(1..=8, 3..=6), + vec![old(1..=2), both(3..=6), old(7..=8)], + ); + + // case 10 + assert_eq!( + split(3..=6, 6..=7), + vec![old(3..=5), both(6..=6), new(7..=7)], + ); + assert_eq!( + split(3..=6, 6..=8), + vec![old(3..=5), both(6..=6), new(7..=8)], + ); + 
assert_eq!(
+            split(5..=6, 6..=7),
+            vec![old(5..=5), both(6..=6), new(7..=7)],
+        );
+
+        // case 11
+        assert_eq!(
+            split(6..=7, 3..=6),
+            vec![new(3..=5), both(6..=6), old(7..=7)],
+        );
+        assert_eq!(
+            split(6..=8, 3..=6),
+            vec![new(3..=5), both(6..=6), old(7..=8)],
+        );
+        assert_eq!(
+            split(6..=7, 5..=6),
+            vec![new(5..=5), both(6..=6), old(7..=7)],
+        );
+
+        // case 12
+        assert_eq!(
+            split(3..=7, 5..=9),
+            vec![old(3..=4), both(5..=7), new(8..=9)],
+        );
+        assert_eq!(
+            split(3..=5, 4..=6),
+            vec![old(3..=3), both(4..=5), new(6..=6)],
+        );
+
+        // case 13
+        assert_eq!(
+            split(5..=9, 3..=7),
+            vec![new(3..=4), both(5..=7), old(8..=9)],
+        );
+        assert_eq!(
+            split(4..=6, 3..=5),
+            vec![new(3..=3), both(4..=5), old(6..=6)],
+        );
+    }
+
+    // Arguably there should be more tests here, but in practice, this data
+    // structure is well covered by the huge number of regex tests.
+}
diff --git a/vendor/regex-automata-0.2.0/src/util/alphabet.rs b/vendor/regex-automata-0.2.0/src/util/alphabet.rs
new file mode 100644
index 000000000..0bc1ece58
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/util/alphabet.rs
@@ -0,0 +1,790 @@
+use core::convert::TryFrom;
+
+use crate::util::{
+    bytes::{DeserializeError, SerializeError},
+    DebugByte,
+};
+
+/// Unit represents a single unit of input for DFA based regex engines.
+///
+/// **NOTE:** It is not expected for consumers of this crate to need to use
+/// this type unless they are implementing their own DFA. And even then, it's
+/// not required: implementors may use other techniques to handle input.
+///
+/// Typically, a single unit of input for a DFA would be a single byte.
+/// However, for the DFAs in this crate, matches are delayed by a single byte
+/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once
+/// we have consumed the haystack, we must run the DFA through one additional
+/// transition using an input that indicates the haystack has ended.
+///
+/// Since all possible values of a `u8` *may* be valid inputs to a DFA, there
+/// is no way to represent a sentinel with a `u8` alone. So this type
+/// explicitly adds room for a sentinel value.
+///
+/// The sentinel EOI value is always its own equivalence class and is
+/// ultimately represented by adding 1 to the maximum equivalence class value.
+/// So for example, the regex `^[a-z]+$` might be split into the following
+/// equivalence classes:
+///
+/// ```text
+/// 0 => [\x00-`]
+/// 1 => [a-z]
+/// 2 => [{-\xFF]
+/// 3 => [EOI]
+/// ```
+///
+/// Where EOI is the special sentinel value that is always in its own
+/// singleton equivalence class.
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+pub enum Unit {
+    U8(u8),
+    EOI(u16),
+}
+
+impl Unit {
+    /// Create a new input unit from a byte value.
+    ///
+    /// All possible byte values are legal. However, when creating an input
+    /// unit for a specific DFA, one should be careful to only construct input
+    /// units that are in that DFA's alphabet. Namely, one way to compact a
+    /// DFA's in-memory representation is to collapse transitions over the set
+    /// of all possible byte values into a smaller set of equivalence classes.
+    /// If a DFA uses equivalence classes instead of byte values, then the
+    /// byte given here should be the equivalence class.
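+    ///
+    /// For example, with the equivalence classes shown in the `Unit`
+    /// documentation above for `^[a-z]+$`, the haystack byte `m` should be
+    /// fed to such a DFA as `Unit::u8(1)` (its equivalence class), not as
+    /// `Unit::u8(b'm')`.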
+    pub fn u8(byte: u8) -> Unit {
+        Unit::U8(byte)
+    }
+
+    pub fn eoi(num_byte_equiv_classes: usize) -> Unit {
+        assert!(
+            num_byte_equiv_classes <= 256,
+            "max number of byte-based equivalence classes is 256, but got {}",
+            num_byte_equiv_classes,
+        );
+        Unit::EOI(u16::try_from(num_byte_equiv_classes).unwrap())
+    }
+
+    pub fn as_u8(self) -> Option<u8> {
+        match self {
+            Unit::U8(b) => Some(b),
+            Unit::EOI(_) => None,
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    pub fn as_eoi(self) -> Option<usize> {
+        match self {
+            Unit::U8(_) => None,
+            Unit::EOI(eoi) => Some(eoi as usize),
+        }
+    }
+
+    pub fn as_usize(self) -> usize {
+        match self {
+            Unit::U8(b) => b as usize,
+            Unit::EOI(eoi) => eoi as usize,
+        }
+    }
+
+    pub fn is_eoi(&self) -> bool {
+        match *self {
+            Unit::EOI(_) => true,
+            _ => false,
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    pub fn is_word_byte(&self) -> bool {
+        self.as_u8().map_or(false, crate::util::is_word_byte)
+    }
+}
+
+impl core::fmt::Debug for Unit {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        match *self {
+            Unit::U8(b) => write!(f, "{:?}", DebugByte(b)),
+            Unit::EOI(_) => write!(f, "EOI"),
+        }
+    }
+}
+
+/// A representation of byte oriented equivalence classes.
+///
+/// This is used in a DFA to reduce the size of the transition table. This can
+/// have a particularly large impact not only on the total size of a dense
+/// DFA, but also on compile times.
+#[derive(Clone, Copy)]
+pub struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+    /// Creates a new set of equivalence classes where all bytes are mapped to
+    /// the same class.
+    pub fn empty() -> ByteClasses {
+        ByteClasses([0; 256])
+    }
+
+    /// Creates a new set of equivalence classes where each byte belongs to
+    /// its own equivalence class.
+    #[cfg(feature = "alloc")]
+    pub fn singletons() -> ByteClasses {
+        let mut classes = ByteClasses::empty();
+        for i in 0..256 {
+            classes.set(i as u8, i as u8);
+        }
+        classes
+    }
+
+    /// Deserializes a byte class map from the given slice. If the slice is of
+    /// insufficient length or otherwise contains an impossible mapping, then
+    /// an error is returned. Upon success, the number of bytes read along
+    /// with the map is returned. The number of bytes read is always a
+    /// multiple of 8.
+    pub fn from_bytes(
+        slice: &[u8],
+    ) -> Result<(ByteClasses, usize), DeserializeError> {
+        if slice.len() < 256 {
+            return Err(DeserializeError::buffer_too_small("byte class map"));
+        }
+        let mut classes = ByteClasses::empty();
+        for (b, &class) in slice[..256].iter().enumerate() {
+            classes.set(b as u8, class);
+        }
+        for b in classes.iter() {
+            if b.as_usize() >= classes.alphabet_len() {
+                return Err(DeserializeError::generic(
+                    "found equivalence class greater than alphabet len",
+                ));
+            }
+        }
+        Ok((classes, 256))
+    }
+
+    /// Writes this byte class map to the given byte buffer. If the given
+    /// buffer is too small, then an error is returned. Upon success, the
+    /// total number of bytes written is returned. The number of bytes written
+    /// is guaranteed to be a multiple of 8.
+    pub fn write_to(
+        &self,
+        mut dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small("byte class map"));
+        }
+        for b in 0..=255 {
+            dst[0] = self.get(b);
+            dst = &mut dst[1..];
+        }
+        Ok(nwrite)
+    }
+
+    /// Returns the total number of bytes written by `write_to`.
+    pub fn write_to_len(&self) -> usize {
+        256
+    }
+
+    /// Set the equivalence class for the given byte.
+    #[inline]
+    pub fn set(&mut self, byte: u8, class: u8) {
+        self.0[byte as usize] = class;
+    }
+
+    /// Get the equivalence class for the given byte.
+    #[inline]
+    pub fn get(&self, byte: u8) -> u8 {
+        self.0[byte as usize]
+    }
+
+    /// Get the equivalence class for the given byte while forcefully
+    /// eliding bounds checks.
+    #[inline]
+    pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
+        *self.0.get_unchecked(byte as usize)
+    }
+
+    /// Get the equivalence class for the given input unit and return the
+    /// class as a `usize`.
+    #[inline]
+    pub fn get_by_unit(&self, unit: Unit) -> usize {
+        match unit {
+            Unit::U8(b) => usize::try_from(self.get(b)).unwrap(),
+            Unit::EOI(b) => usize::try_from(b).unwrap(),
+        }
+    }
+
+    #[inline]
+    pub fn eoi(&self) -> Unit {
+        Unit::eoi(self.alphabet_len().checked_sub(1).unwrap())
+    }
+
+    /// Return the total number of elements in the alphabet represented by
+    /// these equivalence classes. Equivalently, this returns the total number
+    /// of equivalence classes.
+    #[inline]
+    pub fn alphabet_len(&self) -> usize {
+        // Add one since the number of equivalence classes is one bigger than
+        // the last one. But add another to account for the final EOI class
+        // that isn't explicitly represented.
+        self.0[255] as usize + 1 + 1
+    }
+
+    /// Returns the stride, as a base-2 exponent, required for these
+    /// equivalence classes.
+    ///
+    /// The stride is always the smallest power of 2 that is greater than or
+    /// equal to the alphabet length. This is done so that converting between
+    /// state IDs and indices can be done with shifts alone, which is much
+    /// faster than integer division.
+    #[cfg(feature = "alloc")]
+    pub fn stride2(&self) -> usize {
+        self.alphabet_len().next_power_of_two().trailing_zeros() as usize
+    }
+
+    /// Returns true if and only if every byte maps to its own equivalence
+    /// class. Equivalently, there are 257 equivalence classes: one singleton
+    /// class for each possible byte, plus the special EOI class.
+    #[inline]
+    pub fn is_singleton(&self) -> bool {
+        self.alphabet_len() == 257
+    }
+
+    /// Returns an iterator over all equivalence classes in this set.
+    pub fn iter(&self) -> ByteClassIter<'_> {
+        ByteClassIter { classes: self, i: 0 }
+    }
+
+    /// Returns an iterator over a sequence of representative bytes from each
+    /// equivalence class. Namely, this yields exactly N items, where N is
+    /// equivalent to the number of equivalence classes. Each item is an
+    /// arbitrary byte drawn from each equivalence class.
+    ///
+    /// This is useful when one is determinizing an NFA and the NFA's alphabet
+    /// hasn't been converted to equivalence classes yet. Picking an arbitrary
+    /// byte from each equivalence class then permits a full exploration of
+    /// the NFA instead of using every possible byte value.
+    #[cfg(feature = "alloc")]
+    pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
+        ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
+    }
+
+    /// Returns an iterator of the bytes in the given equivalence class.
+    pub fn elements(&self, class: Unit) -> ByteClassElements {
+        ByteClassElements { classes: self, class, byte: 0 }
+    }
+
+    /// Returns an iterator of byte ranges in the given equivalence class.
+    ///
+    /// That is, a sequence of contiguous ranges is returned. Typically, every
+    /// class maps to a single contiguous range.
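+    ///
+    /// For example, using the classes for `^[a-z]+$` shown in the `Unit`
+    /// documentation, the elements of class `1` are `a` through `z`, so this
+    /// iterator would yield the single range `(a, z)`.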
+ fn element_ranges(&self, class: Unit) -> ByteClassElementRanges { + ByteClassElementRanges { elements: self.elements(class), range: None } + } +} + +impl core::fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses({{singletons}})") + } else { + write!(f, "ByteClasses(")?; + for (i, class) in self.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{:?} => [", class.as_usize())?; + for (start, end) in self.element_ranges(class) { + if start == end { + write!(f, "{:?}", start)?; + } else { + write!(f, "{:?}-{:?}", start, end)?; + } + } + write!(f, "]")?; + } + write!(f, ")") + } + } +} + +/// An iterator over each equivalence class. +#[derive(Debug)] +pub struct ByteClassIter<'a> { + classes: &'a ByteClasses, + i: usize, +} + +impl<'a> Iterator for ByteClassIter<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + if self.i + 1 == self.classes.alphabet_len() { + self.i += 1; + Some(self.classes.eoi()) + } else if self.i < self.classes.alphabet_len() { + let class = self.i as u8; + self.i += 1; + Some(Unit::u8(class)) + } else { + None + } + } +} + +/// An iterator over representative bytes from each equivalence class. +#[cfg(feature = "alloc")] +#[derive(Debug)] +pub struct ByteClassRepresentatives<'a> { + classes: &'a ByteClasses, + byte: usize, + last_class: Option<u8>, +} + +#[cfg(feature = "alloc")] +impl<'a> Iterator for ByteClassRepresentatives<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + while self.byte < 256 { + let byte = self.byte as u8; + let class = self.classes.get(byte); + self.byte += 1; + + if self.last_class != Some(class) { + self.last_class = Some(class); + return Some(Unit::u8(byte)); + } + } + if self.byte == 256 { + self.byte += 1; + return Some(self.classes.eoi()); + } + None + } +} + +/// An iterator over all elements in an equivalence class. +#[derive(Debug)] +pub struct ByteClassElements<'a> { + classes: &'a ByteClasses, + class: Unit, + byte: usize, +} + +impl<'a> Iterator for ByteClassElements<'a> { + type Item = Unit; + + fn next(&mut self) -> Option<Unit> { + while self.byte < 256 { + let byte = self.byte as u8; + self.byte += 1; + if self.class.as_u8() == Some(self.classes.get(byte)) { + return Some(Unit::u8(byte)); + } + } + if self.byte < 257 { + self.byte += 1; + if self.class.is_eoi() { + return Some(Unit::eoi(256)); + } + } + None + } +} + +/// An iterator over all elements in an equivalence class expressed as a +/// sequence of contiguous ranges. +#[derive(Debug)] +pub struct ByteClassElementRanges<'a> { + elements: ByteClassElements<'a>, + range: Option<(Unit, Unit)>, +} + +impl<'a> Iterator for ByteClassElementRanges<'a> { + type Item = (Unit, Unit); + + fn next(&mut self) -> Option<(Unit, Unit)> { + loop { + let element = match self.elements.next() { + None => return self.range.take(), + Some(element) => element, + }; + match self.range.take() { + None => { + self.range = Some((element, element)); + } + Some((start, end)) => { + if end.as_usize() + 1 != element.as_usize() + || element.is_eoi() + { + self.range = Some((element, element)); + return Some((start, end)); + } + self.range = Some((start, element)); + } + } + } + } +} + +/// A byte class set keeps track of an *approximation* of equivalence classes +/// of bytes during NFA construction. That is, every byte in an equivalence +/// class cannot discriminate between a match and a non-match. 
+///
+/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
+/// same equivalence class because it never matters whether an `a` or a `b` is
+/// seen, and no combination of `a`s and `b`s in the text can discriminate a
+/// match.
+///
+/// Note though that this does not compute the minimal set of equivalence
+/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
+/// same equivalence class for the same reason that `a` and `b` are in the
+/// same equivalence class in the aforementioned regex. However, in this
+/// implementation, `a` and `c` are put into distinct equivalence classes. The
+/// reason for this is implementation complexity. In the future, we should
+/// endeavor to compute the minimal equivalence classes since they can have a
+/// rather large impact on the size of the DFA. (Doing this will likely
+/// require rethinking how equivalence classes are computed, including
+/// changing the representation here, which is only able to group contiguous
+/// bytes into the same equivalence class.)
+#[derive(Clone, Debug)]
+pub struct ByteClassSet(ByteSet);
+
+impl ByteClassSet {
+    /// Create a new set of byte classes where all bytes are part of the same
+    /// equivalence class.
+    #[cfg(feature = "alloc")]
+    pub fn empty() -> Self {
+        ByteClassSet(ByteSet::empty())
+    }
+
+    /// Indicate that the given inclusive range of bytes can discriminate a
+    /// match between it and all other bytes outside of the range.
+    #[cfg(feature = "alloc")]
+    pub fn set_range(&mut self, start: u8, end: u8) {
+        debug_assert!(start <= end);
+        if start > 0 {
+            self.0.add(start - 1);
+        }
+        self.0.add(end);
+    }
+
+    /// Add the contiguous ranges in the set given to this byte class set.
+    #[cfg(feature = "alloc")]
+    pub fn add_set(&mut self, set: &ByteSet) {
+        for (start, end) in set.iter_ranges() {
+            self.set_range(start, end);
+        }
+    }
+
+    /// Convert this boolean set to a map that maps all byte values to their
+    /// corresponding equivalence class. The last mapping indicates the
+    /// largest equivalence class identifier (which is never bigger than 255).
+    #[cfg(feature = "alloc")]
+    pub fn byte_classes(&self) -> ByteClasses {
+        let mut classes = ByteClasses::empty();
+        let mut class = 0u8;
+        let mut b = 0u8;
+        loop {
+            classes.set(b, class);
+            if b == 255 {
+                break;
+            }
+            if self.0.contains(b) {
+                class = class.checked_add(1).unwrap();
+            }
+            b = b.checked_add(1).unwrap();
+        }
+        classes
+    }
+}
+
+/// A simple set of bytes that is reasonably cheap to copy and allocation
+/// free.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub struct ByteSet {
+    bits: BitSet,
+}
+
+/// The representation of a byte set. Split out so that we can define a
+/// convenient Debug impl for it while keeping "ByteSet" in the output.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+struct BitSet([u128; 2]);
+
+impl ByteSet {
+    /// Create an empty set of bytes.
+    #[cfg(feature = "alloc")]
+    pub fn empty() -> ByteSet {
+        ByteSet { bits: BitSet([0; 2]) }
+    }
+
+    /// Add a byte to this set.
+    ///
+    /// If the given byte already belongs to this set, then this is a no-op.
+    #[cfg(feature = "alloc")]
+    pub fn add(&mut self, byte: u8) {
+        let bucket = byte / 128;
+        let bit = byte % 128;
+        self.bits.0[bucket as usize] |= 1 << bit;
+    }
+
+    /// Add an inclusive range of bytes.
+    #[cfg(feature = "alloc")]
+    pub fn add_all(&mut self, start: u8, end: u8) {
+        for b in start..=end {
+            self.add(b);
+        }
+    }
+
+    /// Remove a byte from this set.
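+    /// (For example, byte `200` lives in bucket `200 / 128 = 1` at bit
+    /// `200 % 128 = 72`, so removing it clears bit `72` of the second `u128`
+    /// word.)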
+ /// + /// If the given byte is not in this set, then this is a no-op. + #[cfg(feature = "alloc")] + pub fn remove(&mut self, byte: u8) { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[bucket as usize] &= !(1 << bit); + } + + /// Remove an inclusive range of bytes. + #[cfg(feature = "alloc")] + pub fn remove_all(&mut self, start: u8, end: u8) { + for b in start..=end { + self.remove(b); + } + } + + /// Return true if and only if the given byte is in this set. + pub fn contains(&self, byte: u8) -> bool { + let bucket = byte / 128; + let bit = byte % 128; + self.bits.0[bucket as usize] & (1 << bit) > 0 + } + + /// Return true if and only if the given inclusive range of bytes is in + /// this set. + #[cfg(feature = "alloc")] + pub fn contains_range(&self, start: u8, end: u8) -> bool { + (start..=end).all(|b| self.contains(b)) + } + + /// Returns an iterator over all bytes in this set. + #[cfg(feature = "alloc")] + pub fn iter(&self) -> ByteSetIter { + ByteSetIter { set: self, b: 0 } + } + + /// Returns an iterator over all contiguous ranges of bytes in this set. + #[cfg(feature = "alloc")] + pub fn iter_ranges(&self) -> ByteSetRangeIter { + ByteSetRangeIter { set: self, b: 0 } + } + + /// Return the number of bytes in this set. + #[cfg(feature = "alloc")] + pub fn len(&self) -> usize { + (self.bits.0[0].count_ones() + self.bits.0[1].count_ones()) as usize + } + + /// Return true if and only if this set is empty. + #[cfg(feature = "alloc")] + pub fn is_empty(&self) -> bool { + self.bits.0 == [0, 0] + } +} + +impl core::fmt::Debug for BitSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmtd = f.debug_set(); + for b in (0..256).map(|b| b as u8) { + if (ByteSet { bits: *self }).contains(b) { + fmtd.entry(&b); + } + } + fmtd.finish() + } +} + +#[derive(Debug)] +pub struct ByteSetIter<'a> { + set: &'a ByteSet, + b: usize, +} + +impl<'a> Iterator for ByteSetIter<'a> { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + while self.b <= 255 { + let b = self.b as u8; + self.b += 1; + if self.set.contains(b) { + return Some(b); + } + } + None + } +} + +#[derive(Debug)] +pub struct ByteSetRangeIter<'a> { + set: &'a ByteSet, + b: usize, +} + +impl<'a> Iterator for ByteSetRangeIter<'a> { + type Item = (u8, u8); + + fn next(&mut self) -> Option<(u8, u8)> { + while self.b <= 255 { + let start = self.b as u8; + self.b += 1; + if !self.set.contains(start) { + continue; + } + + let mut end = start; + while self.b <= 255 && self.set.contains(self.b as u8) { + end = self.b as u8; + self.b += 1; + } + return Some((start, end)); + } + None + } +} + +#[cfg(test)] +#[cfg(feature = "alloc")] +mod tests { + use alloc::{vec, vec::Vec}; + + use super::*; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::empty(); + set.set_range(b'a', b'z'); + + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassSet::empty(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + 
assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::empty(); + for i in 0..256u16 { + set.set_range(i as u8, i as u8); + } + assert_eq!(set.byte_classes().alphabet_len(), 257); + } + + #[test] + fn elements_typical() { + let mut set = ByteClassSet::empty(); + set.set_range(b'b', b'd'); + set.set_range(b'g', b'm'); + set.set_range(b'z', b'z'); + let classes = set.byte_classes(); + // class 0: \x00-a + // class 1: b-d + // class 2: e-f + // class 3: g-m + // class 4: n-y + // class 5: z-z + // class 6: \x7B-\xFF + // class 7: EOI + assert_eq!(classes.alphabet_len(), 8); + + let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 98); + assert_eq!(elements[0], Unit::u8(b'\x00')); + assert_eq!(elements[97], Unit::u8(b'a')); + + let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>(); + assert_eq!( + elements, + vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')], + ); + + let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],); + + let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>(); + assert_eq!( + elements, + vec![ + Unit::u8(b'g'), + Unit::u8(b'h'), + Unit::u8(b'i'), + Unit::u8(b'j'), + Unit::u8(b'k'), + Unit::u8(b'l'), + Unit::u8(b'm'), + ], + ); + + let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 12); + assert_eq!(elements[0], Unit::u8(b'n')); + assert_eq!(elements[11], Unit::u8(b'y')); + + let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'z')]); + + let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 133); + assert_eq!(elements[0], Unit::u8(b'\x7B')); + assert_eq!(elements[132], Unit::u8(b'\xFF')); + + let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn elements_singletons() { + let classes = ByteClasses::singletons(); + assert_eq!(classes.alphabet_len(), 257); + + let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::u8(b'a')]); + + let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } + + #[test] + fn elements_empty() { + let classes = ByteClasses::empty(); + assert_eq!(classes.alphabet_len(), 2); + + let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>(); + assert_eq!(elements.len(), 256); + assert_eq!(elements[0], Unit::u8(b'\x00')); + assert_eq!(elements[255], Unit::u8(b'\xFF')); + + let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>(); + assert_eq!(elements, vec![Unit::eoi(256)]); + } +} diff --git a/vendor/regex-automata-0.2.0/src/util/bytes.rs b/vendor/regex-automata-0.2.0/src/util/bytes.rs new file mode 100644 index 000000000..5877bb149 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/util/bytes.rs @@ -0,0 +1,950 @@ +/* +A collection of helper functions, types and traits for serializing automata. + +This crate defines its own bespoke serialization mechanism for some structures +provided in the public API, namely, DFAs. A bespoke mechanism was developed +primarily because structures like automata demand a specific binary format. +Attempting to encode their rich structure in an existing serialization +format is just not feasible. 
Moreover, the format for each structure is
+generally designed such that deserialization is cheap. More specifically,
+that deserialization can be done in constant time. (The idea being that you
+can embed it into your binary or mmap it, and then use it immediately.)
+
+In order to achieve this, most of the structures in this crate use an
+in-memory representation that very closely corresponds to its binary
+serialized form. This pervades and complicates everything, and in some cases,
+requires dealing with alignment and reasoning about safety.
+
+This technique does have major advantages. In particular, it permits doing
+the potentially costly work of compiling a finite state machine in an offline
+manner, and then loading it at runtime not only without having to re-compile
+the regex, but even without the code required to do the compilation. This,
+for example, permits one to use a pre-compiled DFA not only in environments
+without Rust's standard library, but also in environments without a heap.
+
+In the code below, whenever we insert some kind of padding, it's to enforce a
+4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID
+type supported. (In a previous version of this library, DFAs were generic
+over the state ID representation.)
+
+Also, serialization generally requires the caller to specify endianness,
+whereas deserialization always assumes native endianness (otherwise cheap
+deserialization would be impossible). This implies that serializing a
+structure generally requires serializing both its big-endian and
+little-endian variants, and then loading the correct one based on the
+target's endianness.
+*/
+
+use core::{
+    cmp,
+    convert::{TryFrom, TryInto},
+    mem::size_of,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::{vec, vec::Vec};
+
+use crate::util::id::{PatternID, PatternIDError, StateID, StateIDError};
+
+/// An error that occurs when serializing an object from this crate.
+///
+/// Serialization, as used in this crate, universally refers to the process
+/// of transforming a structure (like a DFA) into a custom binary format
+/// represented by `&[u8]`. To this end, serialization is generally
+/// infallible. However, it can fail when caller provided buffer sizes are
+/// too small. When that occurs, a serialization error is reported.
+///
+/// A `SerializeError` provides no introspection capabilities. Its only
+/// supported operation is conversion to a human readable error message.
+///
+/// This error type implements the `std::error::Error` trait only when the
+/// `std` feature is enabled. Otherwise, this type is defined in all
+/// configurations.
+#[derive(Debug)]
+pub struct SerializeError {
+    /// The name of the thing that a buffer is too small for.
+    ///
+    /// Currently, the only kind of serialization error is one that is
+    /// committed by a caller: providing a destination buffer that is too
+    /// small to fit the serialized object. This makes sense conceptually,
+    /// since every valid inhabitant of a type should be serializable.
+    ///
+    /// This is somewhat exposed in the public API of this crate. For example,
+    /// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are
+    /// guaranteed to never panic or error. This is only possible because the
+    /// implementation guarantees that it will allocate a `Vec<u8>` that is
+    /// big enough.
+    ///
+    /// In summary, if a new serialization error kind needs to be added, then
+    /// it will need careful consideration.
+ what: &'static str, +} + +impl SerializeError { + pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError { + SerializeError { what } + } +} + +impl core::fmt::Display for SerializeError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "destination buffer is too small to write {}", self.what) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SerializeError {} + +/// An error that occurs when deserializing an object defined in this crate. +/// +/// Serialization, as used in this crate, universally refers to the process +/// of transforming a structure (like a DFA) into a custom binary format +/// represented by `&[u8]`. Deserialization, then, refers to the process of +/// cheaply converting this binary format back to the object's in-memory +/// representation as defined in this crate. To the extent possible, +/// deserialization will report this error whenever this process fails. +/// +/// A `DeserializeError` provides no introspection capabilities. Its only +/// supported operation is conversion to a human readable error message. +/// +/// This error type implements the `std::error::Error` trait only when the +/// `std` feature is enabled. Otherwise, this type is defined in all +/// configurations. +#[derive(Debug)] +pub struct DeserializeError(DeserializeErrorKind); + +#[derive(Debug)] +enum DeserializeErrorKind { + Generic { msg: &'static str }, + BufferTooSmall { what: &'static str }, + InvalidUsize { what: &'static str }, + InvalidVarint { what: &'static str }, + VersionMismatch { expected: u32, found: u32 }, + EndianMismatch { expected: u32, found: u32 }, + AlignmentMismatch { alignment: usize, address: usize }, + LabelMismatch { expected: &'static str }, + ArithmeticOverflow { what: &'static str }, + PatternID { err: PatternIDError, what: &'static str }, + StateID { err: StateIDError, what: &'static str }, +} + +impl DeserializeError { + pub(crate) fn generic(msg: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::Generic { msg }) + } + + pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::BufferTooSmall { what }) + } + + pub(crate) fn invalid_usize(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::InvalidUsize { what }) + } + + fn invalid_varint(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::InvalidVarint { what }) + } + + fn version_mismatch(expected: u32, found: u32) -> DeserializeError { + DeserializeError(DeserializeErrorKind::VersionMismatch { + expected, + found, + }) + } + + fn endian_mismatch(expected: u32, found: u32) -> DeserializeError { + DeserializeError(DeserializeErrorKind::EndianMismatch { + expected, + found, + }) + } + + fn alignment_mismatch( + alignment: usize, + address: usize, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::AlignmentMismatch { + alignment, + address, + }) + } + + fn label_mismatch(expected: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::LabelMismatch { expected }) + } + + fn arithmetic_overflow(what: &'static str) -> DeserializeError { + DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what }) + } + + pub(crate) fn pattern_id_error( + err: PatternIDError, + what: &'static str, + ) -> DeserializeError { + DeserializeError(DeserializeErrorKind::PatternID { err, what }) + } + + pub(crate) fn state_id_error( + err: StateIDError, + what: &'static str, + ) -> DeserializeError { 
+ DeserializeError(DeserializeErrorKind::StateID { err, what }) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for DeserializeError {} + +impl core::fmt::Display for DeserializeError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use self::DeserializeErrorKind::*; + + match self.0 { + Generic { msg } => write!(f, "{}", msg), + BufferTooSmall { what } => { + write!(f, "buffer is too small to read {}", what) + } + InvalidUsize { what } => { + write!(f, "{} is too big to fit in a usize", what) + } + InvalidVarint { what } => { + write!(f, "could not decode valid varint for {}", what) + } + VersionMismatch { expected, found } => write!( + f, + "unsupported version: \ + expected version {} but found version {}", + expected, found, + ), + EndianMismatch { expected, found } => write!( + f, + "endianness mismatch: expected 0x{:X} but got 0x{:X}. \ + (Are you trying to load an object serialized with a \ + different endianness?)", + expected, found, + ), + AlignmentMismatch { alignment, address } => write!( + f, + "alignment mismatch: slice starts at address \ + 0x{:X}, which is not aligned to a {} byte boundary", + address, alignment, + ), + LabelMismatch { expected } => write!( + f, + "label mismatch: start of serialized object should \ + contain a NUL terminated {:?} label, but a different \ + label was found", + expected, + ), + ArithmeticOverflow { what } => { + write!(f, "arithmetic overflow for {}", what) + } + PatternID { ref err, what } => { + write!(f, "failed to read pattern ID for {}: {}", what, err) + } + StateID { ref err, what } => { + write!(f, "failed to read state ID for {}: {}", what, err) + } + } + } +} + +/// Checks that the given slice has an alignment that matches `T`. +/// +/// This is useful for checking that a slice has an appropriate alignment +/// before casting it to a &[T]. Note though that alignment is not itself +/// sufficient to perform the cast for any `T`. +pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> { + let alignment = core::mem::align_of::<T>(); + let address = slice.as_ptr() as usize; + if address % alignment == 0 { + return Ok(()); + } + Err(DeserializeError::alignment_mismatch(alignment, address)) +} + +/// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning +/// of the given slice. All padding bytes must be NUL bytes. +/// +/// This is useful because it can be theoretically necessary to pad the +/// beginning of a serialized object with NUL bytes to ensure that it starts +/// at a correctly aligned address. These padding bytes should come immediately +/// before the label. +/// +/// This returns the number of bytes read from the given slice. +pub fn skip_initial_padding(slice: &[u8]) -> usize { + let mut nread = 0; + while nread < 7 && nread < slice.len() && slice[nread] == 0 { + nread += 1; + } + nread +} + +/// Allocate a byte buffer of the given size, along with some initial padding +/// such that `buf[padding..]` has the same alignment as `T`, where the +/// alignment of `T` must be at most `8`. In particular, callers should treat +/// the first N bytes (second return value) as padding bytes that must not be +/// overwritten. In all cases, the following identity holds: +/// +/// ```ignore +/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE); +/// assert_eq!(SIZE, buf[padding..].len()); +/// ``` +/// +/// In practice, padding is often zero. +/// +/// The requirement for `8` as a maximum here is somewhat arbitrary. 
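+/// (As a concrete illustration: if the allocation happens to start at
+/// address `0x1003`, the returned padding is `5`, so that `buf[5..]` begins
+/// at the 8-byte aligned address `0x1008`.)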
+/// In practice, we never need anything bigger in this crate, and so this
+/// function does some sanity asserts under the assumption of a max alignment
+/// of `8`.
+#[cfg(feature = "alloc")]
+pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
+    // FIXME: This is a kludge because there's no easy way to allocate a
+    // Vec<u8> with an alignment guaranteed to be greater than 1. We could
+    // create a Vec<u32>, but this cannot be safely transmuted to a Vec<u8>
+    // without concern, since reallocing or dropping the Vec<u8> is UB
+    // (different alignment than the initial allocation). We could define a
+    // wrapper type to manage this for us, but it seems like more machinery
+    // than it's worth.
+    let mut buf = vec![0; size];
+    let align = core::mem::align_of::<T>();
+    let address = buf.as_ptr() as usize;
+    if address % align == 0 {
+        return (buf, 0);
+    }
+    // It's not quite clear how to robustly test this code, since the
+    // allocator in my environment appears to always return addresses aligned
+    // to at least 8 bytes, even when the alignment requirement is smaller. A
+    // feeble attempt at ensuring correctness is provided with asserts.
+    let padding = ((address & !0b111).checked_add(8).unwrap())
+        .checked_sub(address)
+        .unwrap();
+    assert!(padding <= 7, "padding of {} is bigger than 7", padding);
+    buf.extend(core::iter::repeat(0).take(padding));
+    assert_eq!(size + padding, buf.len());
+    assert_eq!(
+        0,
+        buf[padding..].as_ptr() as usize % align,
+        "expected end of initial padding to be aligned to {}",
+        align,
+    );
+    (buf, padding)
+}
+
+/// Reads a NUL terminated label starting at the beginning of the given slice.
+///
+/// If a NUL terminated label could not be found, then an error is returned.
+/// Similarly, if a label is found but doesn't match the expected label, then
+/// an error is returned.
+///
+/// Upon success, the total number of bytes read (including padding bytes) is
+/// returned.
+pub fn read_label(
+    slice: &[u8],
+    expected_label: &'static str,
+) -> Result<usize, DeserializeError> {
+    // Set an upper bound on how many bytes we scan for a NUL. Since no label
+    // in this crate is longer than 256 bytes, if we can't find one within
+    // that range, then we have corrupted data.
+    let first_nul =
+        slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0);
+    let first_nul = match first_nul {
+        Some(first_nul) => first_nul,
+        None => {
+            return Err(DeserializeError::generic(
+                "could not find NUL terminated label \
+                 at start of serialized object",
+            ));
+        }
+    };
+    let len = first_nul + padding_len(first_nul);
+    if slice.len() < len {
+        return Err(DeserializeError::generic(
+            "could not find properly sized label at start of serialized object"
+        ));
+    }
+    if expected_label.as_bytes() != &slice[..first_nul] {
+        return Err(DeserializeError::label_mismatch(expected_label));
+    }
+    Ok(len)
+}
+
+/// Writes the given label to the buffer as a NUL terminated string. The
+/// label given must not contain NUL, otherwise this will panic. Similarly,
+/// the label must not be longer than 255 bytes, otherwise this will panic.
+///
+/// Additional NUL bytes are written as necessary to ensure that the number of
+/// bytes written is always a multiple of 4.
+///
+/// Upon success, the total number of bytes written (including padding) is
+/// returned.
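+///
+/// For example (illustrative): writing the label `"regex"` emits the six
+/// bytes `regex\0` plus two more NUL bytes of padding, so the total of `8`
+/// bytes written is a multiple of 4.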
+pub fn write_label(
+    label: &str,
+    dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+    let nwrite = write_label_len(label);
+    if dst.len() < nwrite {
+        return Err(SerializeError::buffer_too_small("label"));
+    }
+    dst[..label.len()].copy_from_slice(label.as_bytes());
+    for i in 0..(nwrite - label.len()) {
+        dst[label.len() + i] = 0;
+    }
+    assert_eq!(nwrite % 4, 0);
+    Ok(nwrite)
+}
+
+/// Returns the total number of bytes (including padding) that would be
+/// written for the given label. This panics if the given label contains a
+/// NUL byte or is longer than 255 bytes. (The size restriction exists so
+/// that searching for a label during deserialization can be done in small
+/// bounded space.)
+pub fn write_label_len(label: &str) -> usize {
+    if label.len() > 255 {
+        panic!("label must not be longer than 255 bytes");
+    }
+    if label.as_bytes().iter().position(|&b| b == 0).is_some() {
+        panic!("label must not contain NUL bytes");
+    }
+    let label_len = label.len() + 1; // +1 for the NUL terminator
+    label_len + padding_len(label_len)
+}
+
+/// Reads the endianness check from the beginning of the given slice and
+/// confirms that the endianness of the serialized object matches the expected
+/// endianness. If the slice is too small or if the endianness check fails,
+/// this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> {
+    let (n, nr) = try_read_u32(slice, "endianness check")?;
+    assert_eq!(nr, write_endianness_check_len());
+    if n != 0xFEFF {
+        return Err(DeserializeError::endian_mismatch(0xFEFF, n));
+    }
+    Ok(nr)
+}
+
+/// Writes 0xFEFF as an integer using the given endianness.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure the proper
+/// endianness is used.
+///
+/// Upon success, the total number of bytes written is returned.
+pub fn write_endianness_check<E: Endian>(
+    dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+    let nwrite = write_endianness_check_len();
+    if dst.len() < nwrite {
+        return Err(SerializeError::buffer_too_small("endianness check"));
+    }
+    E::write_u32(0xFEFF, dst);
+    Ok(nwrite)
+}
+
+/// Returns the number of bytes written by the endianness check.
+pub fn write_endianness_check_len() -> usize {
+    size_of::<u32>()
+}
+
+/// Reads a version number from the beginning of the given slice and confirms
+/// that it matches the expected version number given. If the slice is too
+/// small or if the version numbers aren't equivalent, this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+///
+/// N.B. Currently, we require that the version number is exactly equivalent.
+/// In the future, if we bump the version number without a semver bump, then
+/// we'll need to relax this a bit and support older versions.
+pub fn read_version(
+    slice: &[u8],
+    expected_version: u32,
+) -> Result<usize, DeserializeError> {
+    let (n, nr) = try_read_u32(slice, "version")?;
+    assert_eq!(nr, write_version_len());
+    if n != expected_version {
+        return Err(DeserializeError::version_mismatch(expected_version, n));
+    }
+    Ok(nr)
+}
+
+/// Writes the given version number to the beginning of the given slice.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure that the
+/// library code supports the format of the serialized object.
+/// +/// Upon success, the total number of bytes written is returned. +pub fn write_version<E: Endian>( + version: u32, + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let nwrite = write_version_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("version number")); + } + E::write_u32(version, dst); + Ok(nwrite) +} + +/// Returns the number of bytes written by writing the version number. +pub fn write_version_len() -> usize { + size_of::<u32>() +} + +/// Reads a pattern ID from the given slice. If the slice has insufficient +/// length, then this panics. If the deserialized integer exceeds the pattern +/// ID limit for the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub fn read_pattern_id( + slice: &[u8], + what: &'static str, +) -> Result<(PatternID, usize), DeserializeError> { + let bytes: [u8; PatternID::SIZE] = + slice[..PatternID::SIZE].try_into().unwrap(); + let pid = PatternID::from_ne_bytes(bytes) + .map_err(|err| DeserializeError::pattern_id_error(err, what))?; + Ok((pid, PatternID::SIZE)) +} + +/// Reads a pattern ID from the given slice. If the slice has insufficient +/// length, then this panics. Otherwise, the deserialized integer is assumed +/// to be a valid pattern ID. +/// +/// This also returns the number of bytes read. +pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) { + let pid = PatternID::from_ne_bytes_unchecked( + slice[..PatternID::SIZE].try_into().unwrap(), + ); + (pid, PatternID::SIZE) +} + +/// Write the given pattern ID to the beginning of the given slice of bytes +/// using the specified endianness. The given slice must have length at least +/// `PatternID::SIZE`, or else this panics. Upon success, the total number of +/// bytes written is returned. +pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize { + E::write_u32(pid.as_u32(), dst); + PatternID::SIZE +} + +/// Attempts to read a state ID from the given slice. If the slice has an +/// insufficient number of bytes or if the state ID exceeds the limit for +/// the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_state_id( + slice: &[u8], + what: &'static str, +) -> Result<(StateID, usize), DeserializeError> { + if slice.len() < StateID::SIZE { + return Err(DeserializeError::buffer_too_small(what)); + } + read_state_id(slice, what) +} + +/// Reads a state ID from the given slice. If the slice has insufficient +/// length, then this panics. If the deserialized integer exceeds the state ID +/// limit for the current target, then this returns an error. +/// +/// Upon success, this also returns the number of bytes read. +pub fn read_state_id( + slice: &[u8], + what: &'static str, +) -> Result<(StateID, usize), DeserializeError> { + let bytes: [u8; StateID::SIZE] = + slice[..StateID::SIZE].try_into().unwrap(); + let sid = StateID::from_ne_bytes(bytes) + .map_err(|err| DeserializeError::state_id_error(err, what))?; + Ok((sid, StateID::SIZE)) +} + +/// Reads a state ID from the given slice. If the slice has insufficient +/// length, then this panics. Otherwise, the deserialized integer is assumed +/// to be a valid state ID. +/// +/// This also returns the number of bytes read. 
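+/// +/// # Example
+/// +/// A minimal sketch (added for illustration; not part of the original
+/// crate docs), pairing this with `write_state_id` below:
+/// +/// ```ignore
+/// let mut buf = [0u8; StateID::SIZE];
+/// write_state_id::<NE>(StateID::ZERO, &mut buf);
+/// let (sid, nread) = read_state_id_unchecked(&buf);
+/// assert_eq!(sid, StateID::ZERO);
+/// assert_eq!(nread, StateID::SIZE);
+/// ```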
+pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) { + let sid = StateID::from_ne_bytes_unchecked( + slice[..StateID::SIZE].try_into().unwrap(), + ); + (sid, StateID::SIZE) +} + +/// Write the given state ID to the beginning of the given slice of bytes +/// using the specified endianness. The given slice must have length at least +/// `StateID::SIZE`, or else this panics. Upon success, the total number of +/// bytes written is returned. +pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize { + E::write_u32(sid.as_u32(), dst); + StateID::SIZE +} + +/// Try to read a u16 as a usize from the beginning of the given slice in +/// native endian format. If the slice has fewer than 2 bytes or if the +/// deserialized number cannot be represented by usize, then this returns an +/// error. The error message will include the `what` description of what is +/// being deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_u16_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + try_read_u16(slice, what).and_then(|(n, nr)| { + usize::try_from(n) + .map(|n| (n, nr)) + .map_err(|_| DeserializeError::invalid_usize(what)) + }) +} + +/// Try to read a u32 as a usize from the beginning of the given slice in +/// native endian format. If the slice has fewer than 4 bytes or if the +/// deserialized number cannot be represented by usize, then this returns an +/// error. The error message will include the `what` description of what is +/// being deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_u32_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + try_read_u32(slice, what).and_then(|(n, nr)| { + usize::try_from(n) + .map(|n| (n, nr)) + .map_err(|_| DeserializeError::invalid_usize(what)) + }) +} + +/// Try to read a u16 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 2 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_u16( + slice: &[u8], + what: &'static str, +) -> Result<(u16, usize), DeserializeError> { + if slice.len() < size_of::<u16>() { + return Err(DeserializeError::buffer_too_small(what)); + } + Ok((read_u16(slice), size_of::<u16>())) +} + +/// Try to read a u32 from the beginning of the given slice in native endian +/// format. If the slice has fewer than 4 bytes, then this returns an error. +/// The error message will include the `what` description of what is being +/// deserialized, for better error messages. `what` should be a noun in +/// singular form. +/// +/// Upon success, this also returns the number of bytes read. +pub fn try_read_u32( + slice: &[u8], + what: &'static str, +) -> Result<(u32, usize), DeserializeError> { + if slice.len() < size_of::<u32>() { + return Err(DeserializeError::buffer_too_small(what)); + } + Ok((read_u32(slice), size_of::<u32>())) +} + +/// Read a u16 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 2 bytes, then this panics. 
+/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[inline(always)] +pub fn read_u16(slice: &[u8]) -> u16 { + let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap(); + u16::from_ne_bytes(bytes) +} + +/// Read a u32 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 4 bytes, then this panics. +/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[inline(always)] +pub fn read_u32(slice: &[u8]) -> u32 { + let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap(); + u32::from_ne_bytes(bytes) +} + +/// Read a u64 from the beginning of the given slice in native endian format. +/// If the slice has fewer than 8 bytes, then this panics. +/// +/// Marked as inline to speed up sparse searching which decodes integers from +/// its automaton at search time. +#[inline(always)] +pub fn read_u64(slice: &[u8]) -> u64 { + let bytes: [u8; 8] = slice[..size_of::<u64>()].try_into().unwrap(); + u64::from_ne_bytes(bytes) +} + +/// Write a variable sized integer and return the total number of bytes +/// written. If the slice was not big enough to contain the bytes, then this +/// returns an error including the "what" description in it. This does no +/// padding. +/// +/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints +#[allow(dead_code)] +pub fn write_varu64( + mut n: u64, + what: &'static str, + dst: &mut [u8], +) -> Result<usize, SerializeError> { + let mut i = 0; + while n >= 0b1000_0000 { + if i >= dst.len() { + return Err(SerializeError::buffer_too_small(what)); + } + dst[i] = (n as u8) | 0b1000_0000; + n >>= 7; + i += 1; + } + if i >= dst.len() { + return Err(SerializeError::buffer_too_small(what)); + } + dst[i] = n as u8; + Ok(i + 1) +} + +/// Returns the total number of bytes that would be written to encode n as a +/// variable sized integer. +/// +/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints +#[allow(dead_code)] +pub fn write_varu64_len(mut n: u64) -> usize { + let mut i = 0; + while n >= 0b1000_0000 { + n >>= 7; + i += 1; + } + i + 1 +} + +/// Like read_varu64, but attempts to cast the result to usize. If the integer +/// cannot fit into a usize, then an error is returned. +#[allow(dead_code)] +pub fn read_varu64_as_usize( + slice: &[u8], + what: &'static str, +) -> Result<(usize, usize), DeserializeError> { + let (n, nread) = read_varu64(slice, what)?; + let n = usize::try_from(n) + .map_err(|_| DeserializeError::invalid_usize(what))?; + Ok((n, nread)) +} + +/// Reads a variable sized integer from the beginning of slice, and returns the +/// integer along with the total number of bytes read. If a valid variable +/// sized integer could not be found, then an error is returned that includes +/// the "what" description in it. +/// +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +#[allow(dead_code)] +pub fn read_varu64( + slice: &[u8], + what: &'static str, +) -> Result<(u64, usize), DeserializeError> { + let mut n: u64 = 0; + let mut shift: u32 = 0; + // The biggest possible value is u64::MAX, which needs all 64 bits which + // requires 10 bytes (because 7 * 9 < 64). We use a limit to avoid reading + // an unnecessary number of bytes.
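+ // As a worked example (added for illustration): the two varint bytes + // [0b1000_0001, 0b0000_0001] decode to (1 << 0) | (1 << 7) = 129, with + // 2 bytes read.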
+ let limit = cmp::min(slice.len(), 10); + for (i, &b) in slice[..limit].iter().enumerate() { + if b < 0b1000_0000 { + return match (b as u64).checked_shl(shift) { + None => Err(DeserializeError::invalid_varint(what)), + Some(b) => Ok((n | b, i + 1)), + }; + } + match ((b as u64) & 0b0111_1111).checked_shl(shift) { + None => return Err(DeserializeError::invalid_varint(what)), + Some(b) => n |= b, + } + shift += 7; + } + Err(DeserializeError::invalid_varint(what)) +} + +/// Checks that the given slice has some minimal length. If it's smaller than +/// the bound given, then a "buffer too small" error is returned with `what` +/// describing what the buffer represents. +pub fn check_slice_len<T>( + slice: &[T], + at_least_len: usize, + what: &'static str, +) -> Result<(), DeserializeError> { + if slice.len() < at_least_len { + return Err(DeserializeError::buffer_too_small(what)); + } + Ok(()) +} + +/// Multiply the given numbers, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub fn mul( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + match a.checked_mul(b) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Add the given numbers, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub fn add( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + match a.checked_add(b) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// Shift `a` left by `b`, and on overflow, return an error that includes +/// 'what' in the error message. +/// +/// This is useful when doing arithmetic with untrusted data. +pub fn shl( + a: usize, + b: usize, + what: &'static str, +) -> Result<usize, DeserializeError> { + let amount = u32::try_from(b) + .map_err(|_| DeserializeError::arithmetic_overflow(what))?; + match a.checked_shl(amount) { + Some(c) => Ok(c), + None => Err(DeserializeError::arithmetic_overflow(what)), + } +} + +/// A simple trait for writing code generic over endianness. +/// +/// This is similar to what byteorder provides, but we only need a very small +/// subset. +pub trait Endian { + /// Writes a u16 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 2, then + /// this panics. + fn write_u16(n: u16, dst: &mut [u8]); + + /// Writes a u32 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 4, then + /// this panics. + fn write_u32(n: u32, dst: &mut [u8]); + + /// Writes a u64 to the given destination buffer in a particular + /// endianness. If the destination buffer has a length smaller than 8, then + /// this panics. + fn write_u64(n: u64, dst: &mut [u8]); +} + +/// Little endian writing. +pub enum LE {} +/// Big endian writing. 
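+/// +/// For example (illustrative), `BE::write_u32(0xFEFF, &mut buf)` fills the +/// first four bytes of `buf` with `[0x00, 0x00, 0xFE, 0xFF]`, while +/// `LE::write_u32` would write `[0xFF, 0xFE, 0x00, 0x00]`.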
+pub enum BE {} + +#[cfg(target_endian = "little")] +pub type NE = LE; +#[cfg(target_endian = "big")] +pub type NE = BE; + +impl Endian for LE { + fn write_u16(n: u16, dst: &mut [u8]) { + dst[..2].copy_from_slice(&n.to_le_bytes()); + } + + fn write_u32(n: u32, dst: &mut [u8]) { + dst[..4].copy_from_slice(&n.to_le_bytes()); + } + + fn write_u64(n: u64, dst: &mut [u8]) { + dst[..8].copy_from_slice(&n.to_le_bytes()); + } +} + +impl Endian for BE { + fn write_u16(n: u16, dst: &mut [u8]) { + dst[..2].copy_from_slice(&n.to_be_bytes()); + } + + fn write_u32(n: u32, dst: &mut [u8]) { + dst[..4].copy_from_slice(&n.to_be_bytes()); + } + + fn write_u64(n: u64, dst: &mut [u8]) { + dst[..8].copy_from_slice(&n.to_be_bytes()); + } +} + +/// Returns the number of additional bytes required to add to the given length +/// in order to make the total length a multiple of 4. The return value is +/// always less than 4. +pub fn padding_len(non_padding_len: usize) -> usize { + (4 - (non_padding_len & 0b11)) & 0b11 +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::*; + + #[test] + fn labels() { + let mut buf = [0; 1024]; + + let nwrite = write_label("fooba", &mut buf).unwrap(); + assert_eq!(nwrite, 8); + assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00"); + + let nread = read_label(&buf, "fooba").unwrap(); + assert_eq!(nread, 8); + } + + #[test] + #[should_panic] + fn bad_label_interior_nul() { + // interior NULs are not allowed + write_label("foo\x00bar", &mut [0; 1024]).unwrap(); + } + + #[test] + fn bad_label_almost_too_long() { + // ok + write_label(&"z".repeat(255), &mut [0; 1024]).unwrap(); + } + + #[test] + #[should_panic] + fn bad_label_too_long() { + // labels longer than 255 bytes are banned + write_label(&"z".repeat(256), &mut [0; 1024]).unwrap(); + } + + #[test] + fn padding() { + assert_eq!(0, padding_len(8)); + assert_eq!(3, padding_len(9)); + assert_eq!(2, padding_len(10)); + assert_eq!(1, padding_len(11)); + assert_eq!(0, padding_len(12)); + assert_eq!(3, padding_len(13)); + assert_eq!(2, padding_len(14)); + assert_eq!(1, padding_len(15)); + assert_eq!(0, padding_len(16)); + } +} diff --git a/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs b/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs new file mode 100644 index 000000000..b384de8e1 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs @@ -0,0 +1,493 @@ +/*! +This module contains types and routines for implementing determinization. + +In this crate, there are at least two places where we implement +determinization: fully ahead-of-time compiled DFAs in the `dfa` module and +lazily compiled DFAs in the `hybrid` module. The stuff in this module +corresponds to the things that are in common between these implementations. + +There are three broad things that our implementations of determinization have +in common, as defined by this module: + +* The classification of start states. That is, whether we're dealing with +word boundaries, line boundaries, etc., is all the same. This also includes +the look-behind assertions that are satisfied by each starting state +classification. + +* The representation of DFA states as sets of NFA states, including +convenience types for building these DFA states that are amenable to reusing +allocations. 
+ +* Routines for the "classical" parts of determinization: computing the +epsilon closure, tracking match states (with corresponding pattern IDs, since +we support multi-pattern finite automata) and, of course, computing the +transition function between states for units of input. + +I did consider a couple of alternatives to this particular form of code reuse: + +1. Don't do any code reuse. The problem here is that we *really* want both +forms of determinization to do exactly identical things when it comes to +their handling of NFA states. While our tests generally ensure this, the code +is tricky and large enough where not reusing code is a pretty big bummer. + +2. Implement all of determinization once and make it generic over fully +compiled DFAs and lazily compiled DFAs. While I didn't actually try this +approach, my instinct is that it would be more complex than is needed here. +And the interface required would be pretty hairy. Instead, I think splitting +it into logical sub-components works better. +*/ + +use alloc::vec::Vec; + +pub(crate) use self::state::{ + State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA, +}; + +use crate::{ + nfa::thompson::{self, Look, LookSet}, + util::{ + alphabet, + id::StateID, + matchtypes::MatchKind, + sparse_set::{SparseSet, SparseSets}, + start::Start, + }, +}; + +mod state; + +/// Compute the set of all reachable NFA states, including the full epsilon +/// closure, from a DFA state for a single unit of input. The set of reachable +/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned +/// also includes any look-behind assertions satisfied by `unit`, in addition +/// to whether it is a match state. For multi-pattern DFAs, the builder will +/// also include the pattern IDs that match (in the order seen). +/// +/// `nfa` must be able to resolve any NFA state in `state` and any NFA state +/// reachable via the epsilon closure of any NFA state in `state`. `sparses` +/// must have capacity equivalent to `nfa.len()`. +/// +/// `match_kind` should correspond to the match semantics implemented by the +/// DFA being built. Generally speaking, for leftmost-first match semantics, +/// states that appear after the first NFA match state will not be included in +/// the `StateBuilderNFA` returned since they are impossible to visit. +/// +/// `sparses` is used as scratch space for NFA traversal. Other than their +/// capacity requirements (detailed above), there are no requirements on what's +/// contained within them (if anything). Similarly, what's inside of them once +/// this routine returns is unspecified. +/// +/// `stack` must have length 0. It is used as scratch space for depth first +/// traversal. After returning, it is guaranteed that `stack` will have length +/// 0. +/// +/// `state` corresponds to the current DFA state on which one wants to compute +/// the transition for the input `unit`. +/// +/// `empty_builder` corresponds to the builder allocation to use to produce a +/// complete `StateBuilderNFA` state. If the state is not needed (or is already +/// cached), then it can be cleared and reused without needing to create a new +/// `State`. The `StateBuilderNFA` state returned is final and ready to be +/// turned into a `State` if necessary.
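+/// +/// # Sketch +/// +/// An illustrative outline (not from the original docs) of how a +/// determinizer might drive this routine; `nfa`, `sparses`, `stack`, +/// `dfa_state` and `unit` are assumed to come from the surrounding +/// determinization loop: +/// +/// ```ignore +/// let builder = next( +/// &nfa, +/// MatchKind::LeftmostFirst, +/// &mut sparses, +/// &mut stack, +/// &dfa_state, +/// unit, +/// StateBuilderEmpty::new(), +/// ); +/// // If an equivalent state is already cached, reuse the cached `State` +/// // and recycle the builder with `builder.clear()`; otherwise freeze it +/// // with `builder.to_state()`. +/// ```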
+pub(crate) fn next( + nfa: &thompson::NFA, + match_kind: MatchKind, + sparses: &mut SparseSets, + stack: &mut Vec<StateID>, + state: &State, + unit: alphabet::Unit, + empty_builder: StateBuilderEmpty, +) -> StateBuilderNFA { + sparses.clear(); + + // Put the NFA state IDs into a sparse set in case we need to + // re-compute their epsilon closure. + // + // Doing this state shuffling is technically not necessary unless some + // kind of look-around is used in the DFA. Some ad hoc experiments + // suggested that avoiding this didn't lead to much of an improvement, + // but perhaps more rigorous experimentation should be done. And in + // particular, avoiding this check requires some light refactoring of + // the code below. + state.iter_nfa_state_ids(|nfa_id| { + sparses.set1.insert(nfa_id); + }); + + // Compute look-ahead assertions originating from the current state. + // Based on the input unit we're transitioning over, some additional + // set of assertions may be true. Thus, we re-compute this state's + // epsilon closure (but only if necessary). + if !state.look_need().is_empty() { + // Add look-ahead assertions that are now true based on the current + // input unit. + let mut look_have = state.look_have().clone(); + match unit.as_u8() { + Some(b'\n') => { + look_have.insert(Look::EndLine); + } + Some(_) => {} + None => { + look_have.insert(Look::EndText); + look_have.insert(Look::EndLine); + } + } + if state.is_from_word() == unit.is_word_byte() { + look_have.insert(Look::WordBoundaryUnicodeNegate); + look_have.insert(Look::WordBoundaryAsciiNegate); + } else { + look_have.insert(Look::WordBoundaryUnicode); + look_have.insert(Look::WordBoundaryAscii); + } + // If we have new assertions satisfied that are among the set of + // assertions that exist in this state (that is, just because + // we added an EndLine assertion above doesn't mean there is an + // EndLine conditional epsilon transition in this state), then we + // re-compute this state's epsilon closure using the updated set of + // assertions. + if !look_have + .subtract(state.look_have()) + .intersect(state.look_need()) + .is_empty() + { + for nfa_id in &sparses.set1 { + epsilon_closure( + nfa, + nfa_id, + look_have, + stack, + &mut sparses.set2, + ); + } + sparses.swap(); + sparses.set2.clear(); + } + } + + // Convert our empty builder into one that can record assertions and match + // pattern IDs. + let mut builder = empty_builder.into_matches(); + // Set whether the StartLine look-behind assertion is true for this + // transition or not. The look-behind assertion for ASCII word boundaries + // is handled below. + if nfa.has_any_anchor() { + if unit.as_u8().map_or(false, |b| b == b'\n') { + // Why only handle StartLine here and not StartText? That's + // because StartText can only impact the starting state, which + // is special cased in start state handling. + builder.look_have().insert(Look::StartLine); + } + } + for nfa_id in &sparses.set1 { + match *nfa.state(nfa_id) { + thompson::State::Union { .. } + | thompson::State::Fail + | thompson::State::Look { .. } + | thompson::State::Capture { .. } => {} + thompson::State::Match { id } => { + // Notice here that we are calling the NEW state a match + // state if the OLD state we are transitioning from + // contains an NFA match state. This is precisely how we + // delay all matches by one byte and also what therefore + // guarantees that starting states cannot be match states.
+ // + // If we didn't delay matches by one byte, then whether + // a DFA is a matching state or not would be determined + // by whether one of its own constituent NFA states + // was a match state. (And that would be done in + // 'add_nfa_states'.) + // + // Also, 'add_match_pattern_id' requires that callers never + // pass duplicative pattern IDs. We do in fact uphold that + // guarantee here, but it's subtle. In particular, a Thompson + // NFA guarantees that each pattern has exactly one match + // state. Moreover, since we're iterating over the NFA state + // IDs in a set, we are guaranteed not to have any duplicative + // match states. Thus, it is impossible to add the same pattern + // ID more than once. + builder.add_match_pattern_id(id); + if !match_kind.continue_past_first_match() { + break; + } + } + thompson::State::Range { range: ref r } => { + if r.matches_unit(unit) { + epsilon_closure( + nfa, + r.next, + *builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + thompson::State::Sparse(ref sparse) => { + if let Some(next) = sparse.matches_unit(unit) { + epsilon_closure( + nfa, + next, + *builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + } + } + // We only set the word byte if there's a word boundary look-around + // anywhere in this regex. Otherwise, there's no point in bloating the + // number of states if we don't have one. + // + // We also only set it when the state has a non-zero number of NFA states. + // Otherwise, we could wind up with states that *should* be DEAD states + // but are otherwise distinct from DEAD states because of this look-behind + // assertion being set. While this can't technically impact correctness *in + // theory*, it can create pathological DFAs that consume input until EOI or + // a quit byte is seen. Consuming until EOI isn't a correctness problem, + // but a (serious) perf problem. Hitting a quit byte, however, could be a + // correctness problem since it could cause search routines to report an + // error instead of a detected match once the quit state is entered. (The + // search routine could be made to be a bit smarter by reporting a match + // if one was detected once it enters a quit state (and indeed, the search + // routines in this crate do just that), but it seems better to prevent + // these things by construction if possible.) + if nfa.has_word_boundary() + && unit.is_word_byte() + && !sparses.set2.is_empty() + { + builder.set_is_from_word(); + } + let mut builder_nfa = builder.into_nfa(); + add_nfa_states(nfa, &sparses.set2, &mut builder_nfa); + builder_nfa +} + +/// Compute the epsilon closure for the given NFA state. The epsilon closure +/// consists of all NFA state IDs, including `start_nfa_id`, that can be +/// reached from `start_nfa_id` without consuming any input. These state IDs +/// are written to `set` in the order they are visited, but only if they are +/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA +/// given. +/// +/// `look_have` consists of the satisfied assertions at the current +/// position. For conditional look-around epsilon transitions, these are +/// only followed if they are satisfied by `look_have`. +/// +/// `stack` must have length 0. It is used as scratch space for depth first +/// traversal. After returning, it is guaranteed that `stack` will have length +/// 0.
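+/// +/// # Conceptual sketch +/// +/// The loop below (added for exposition; it uses no types from this crate) +/// shows the same depth first traversal over a toy graph of unconditional +/// epsilon transitions: +/// +/// ``` +/// // eps[i] lists the states reachable from state i without input. +/// let eps: Vec<Vec<usize>> = vec![vec![1, 2], vec![3], vec![], vec![]]; +/// let mut seen = vec![false; eps.len()]; +/// let mut stack = vec![0]; // start state +/// let mut closure = vec![]; +/// while let Some(id) = stack.pop() { +/// if seen[id] { continue; } +/// seen[id] = true; +/// closure.push(id); +/// // Push in reverse so earlier alternates are visited first, which +/// // preserves match preference order. +/// stack.extend(eps[id].iter().rev()); +/// } +/// assert_eq!(closure, vec![0, 1, 3, 2]); +/// ```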
+pub(crate) fn epsilon_closure( + nfa: &thompson::NFA, + start_nfa_id: StateID, + look_have: LookSet, + stack: &mut Vec<StateID>, + set: &mut SparseSet, +) { + assert!(stack.is_empty()); + // If this isn't an epsilon state, then the epsilon closure is always just + // itself, so there's no need to spin up the machinery below to handle it. + if !nfa.state(start_nfa_id).is_epsilon() { + set.insert(start_nfa_id); + return; + } + + stack.push(start_nfa_id); + while let Some(mut id) = stack.pop() { + // In many cases, we can avoid stack operations when an NFA state only + // adds one new state to visit. In that case, we just set our ID to + // that state and mush on. We only use the stack when an NFA state + // introduces multiple new states to visit. + loop { + // Insert this NFA state, and if it's already in the set and thus + // already visited, then we can move on to the next one. + if !set.insert(id) { + break; + } + match *nfa.state(id) { + thompson::State::Range { .. } + | thompson::State::Sparse { .. } + | thompson::State::Fail + | thompson::State::Match { .. } => break, + thompson::State::Look { look, next } => { + if !look_have.contains(look) { + break; + } + id = next; + } + thompson::State::Union { ref alternates } => { + id = match alternates.get(0) { + None => break, + Some(&id) => id, + }; + // We need to process our alternates in order to preserve + // match preferences, so put the earliest alternates closer + // to the top of the stack. + stack.extend(alternates[1..].iter().rev()); + } + thompson::State::Capture { next, .. } => { + id = next; + } + } + } + } +} + +/// Add the NFA state IDs in the given `set` to the given DFA builder state. +/// The order in which states are added corresponds to the order in which they +/// were added to `set`. +/// +/// The DFA builder state given should already have its complete set of match +/// pattern IDs added (if any) and any look-behind assertions (StartLine, +/// StartText and whether this state is being generated for a transition over a +/// word byte when applicable) that are true immediately prior to transitioning +/// into this state (via `builder.look_have()`). The match pattern IDs should +/// correspond to matches that occurred on the previous transition, since all +/// matches are delayed by one byte. The things that should _not_ be set are +/// look-ahead assertions (EndLine, EndText and whether the next byte is a +/// word byte or not). The builder state should also not have anything in +/// `look_need` set, as this routine will compute that for you. +/// +/// The given NFA should be able to resolve all identifiers in `set` to a +/// particular NFA state. Additionally, `set` must have capacity equivalent +/// to `nfa.len()`. +pub(crate) fn add_nfa_states( + nfa: &thompson::NFA, + set: &SparseSet, + builder: &mut StateBuilderNFA, +) { + for nfa_id in set { + match *nfa.state(nfa_id) { + thompson::State::Range { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Sparse { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Look { look, .. } => { + builder.add_nfa_state_id(nfa_id); + builder.look_need().insert(look); + } + thompson::State::Union { .. } + | thompson::State::Capture { .. } => { + // Pure epsilon transitions don't need to be tracked + // as part of the DFA state. Tracking them is actually + // superfluous; they won't cause any harm other than making + // determinization slower. + // + // Why aren't these needed?
Well, in an NFA, epsilon + // transitions are really just jumping points to other + // states. So once you hit an epsilon transition, the same + // set of resulting states always appears. Therefore, + // putting them in a DFA's set of ordered NFA states is + // strictly redundant. + // + // Look-around states are also epsilon transitions, but + // they are *conditional*. So their presence could be + // discriminatory, and thus, they are tracked above. + // + // But wait... why are epsilon states in our `set` in the + // first place? Why not just leave them out? They're in + // our `set` because it was generated by computing an + // epsilon closure, and we want to keep track of all states + // we visited to avoid re-visiting them. In exchange, we + // have to do this second iteration over our collected + // states to finalize our DFA state. + // + // Note that this optimization requires that we re-compute + // the epsilon closure to account for look-ahead in 'next' + // *only when necessary*. Namely, only when the set of + // look-around assertions changes and only when those + // changes are within the set of assertions that are + // needed in order to step through the closure correctly. + // Otherwise, if we re-do the epsilon closure needlessly, + // it could change based on the fact that we are omitting + // epsilon states here. + } + thompson::State::Fail => { + break; + } + thompson::State::Match { .. } => { + // Normally, the NFA match state doesn't actually need to + // be inside the DFA state. But since we delay matches by + // one byte, the matching DFA state corresponds to states + // that transition from the one we're building here. And + // the way we detect those cases is by looking for an NFA + // match state. See 'next' for how this is handled. + builder.add_nfa_state_id(nfa_id); + } + } + } + // If we know this state contains no look-around assertions, then + // there's no reason to track which look-around assertions were + // satisfied when this state was created. + if builder.look_need().is_empty() { + builder.look_have().clear(); + } +} + +/// Sets the appropriate look-behind assertions on the given state based on +/// this starting configuration. 
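+/// +/// For example (an illustrative sketch, not from the original docs): +/// +/// ```ignore +/// let mut builder = StateBuilderEmpty::new().into_matches(); +/// set_lookbehind_from_start(&Start::Line, &mut builder); +/// // A search that begins just after a \n satisfies StartLine but not +/// // StartText. +/// assert!(builder.look_have().contains(Look::StartLine)); +/// assert!(!builder.look_have().contains(Look::StartText)); +/// ```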
+pub(crate) fn set_lookbehind_from_start( + start: &Start, + builder: &mut StateBuilderMatches, +) { + match *start { + Start::NonWordByte => {} + Start::WordByte => { + builder.set_is_from_word(); + } + Start::Text => { + builder.look_have().insert(Look::StartText); + builder.look_have().insert(Look::StartLine); + } + Start::Line => { + builder.look_have().insert(Look::StartLine); + } + } +} + +#[cfg(test)] +mod tests { + use super::Start; + + #[test] + #[should_panic] + fn start_fwd_bad_range() { + Start::from_position_fwd(&[], 0, 1); + } + + #[test] + #[should_panic] + fn start_rev_bad_range() { + Start::from_position_rev(&[], 0, 1); + } + + #[test] + fn start_fwd() { + let f = Start::from_position_fwd; + + assert_eq!(Start::Text, f(&[], 0, 0)); + assert_eq!(Start::Text, f(b"abc", 0, 3)); + assert_eq!(Start::Text, f(b"\nabc", 0, 3)); + + assert_eq!(Start::Line, f(b"\nabc", 1, 3)); + + assert_eq!(Start::WordByte, f(b"abc", 1, 3)); + + assert_eq!(Start::NonWordByte, f(b" abc", 1, 3)); + } + + #[test] + fn start_rev() { + let f = Start::from_position_rev; + + assert_eq!(Start::Text, f(&[], 0, 0)); + assert_eq!(Start::Text, f(b"abc", 0, 3)); + assert_eq!(Start::Text, f(b"abc\n", 0, 4)); + + assert_eq!(Start::Line, f(b"abc\nz", 0, 3)); + + assert_eq!(Start::WordByte, f(b"abc", 0, 2)); + + assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3)); + } +} diff --git a/vendor/regex-automata-0.2.0/src/util/determinize/state.rs b/vendor/regex-automata-0.2.0/src/util/determinize/state.rs new file mode 100644 index 000000000..567e600d6 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/util/determinize/state.rs @@ -0,0 +1,873 @@ +/*! +This module defines a DFA state representation and builders for constructing +DFA states. + +This representation is specifically for use in implementations of NFA-to-DFA +conversion via powerset construction. (Also called "determinization" in this +crate.) + +The term "DFA state" is somewhat overloaded in this crate. In some cases, it +refers to the set of transitions over an alphabet for a particular state. In +other cases, it refers to a set of NFA states. The former is really about the +final representation of a state in a DFA's transition table, whereas the +latter---what this module is focused on---is closer to an intermediate form that +is used to help eventually build the transition table. + +This module exports four types. All four types represent the same idea: an +ordered set of NFA states. This ordered set represents the epsilon closure of a +particular NFA state, where the "epsilon closure" is the set of NFA states that +can be transitioned to without consuming any input. i.e., Follow all of the NFA +state's epsilon transitions. In addition, this implementation of DFA states +cares about two other things: the ordered set of pattern IDs corresponding +to the patterns that match if the state is a match state, and the set of +look-behind assertions that were true when the state was created. + +The first, `State`, is a frozen representation of a state that cannot be +modified. It may be cheaply cloned without copying the state itself and can be +accessed safely from multiple threads simultaneously. This type is useful for +when one knows that the DFA state being constructed is distinct from any other +previously constructed states. Namely, powerset construction, in practice, +requires one to keep a cache of previously created DFA states. Otherwise, +the number of DFA states created in memory balloons to an impractically +large number.
For this reason, equivalent states should endeavor to have an +equivalent byte-level representation. (In general, "equivalency" here means, +"equivalent assertions, pattern IDs and NFA state IDs." We do not require that +full DFA minimization be implemented here. This form of equivalency is only +surface deep and is more-or-less a practical necessity.) + +The other three types represent different phases in the construction of a +DFA state. Internally, these three types (and `State`) all use the same +byte-oriented representation. That means one can use any of the builder types +to check whether the state it represents already exists or not. If it does, +then there is no need to freeze it into a `State` (which requires an alloc and +a copy). Here are the three types described succinctly: + +* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions +and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A +`StateBuilderEmpty` can only be used to query its underlying memory capacity, +or to convert into a builder for recording pattern IDs and/or assertions. +* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero +or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches` +can only be used for adding pattern IDs and recording assertions. +* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or +more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA` +can only be used for adding NFA state IDs and recording some assertions. + +The expected flow here is to use the above builders to construct a candidate +DFA state to check if it already exists. If it does, then there's no need to +freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state` +can be called to freeze the builder into an immutable `State`. In either +case, `clear` should be called on the builder to turn it back into a +`StateBuilderEmpty` that reuses the underlying memory. (A sketch of this flow +appears at the end of this comment.) + +The main purpose for splitting the builder into these distinct types is to +make it impossible to do things like adding a pattern ID after adding an NFA +state ID. Namely, this makes it simpler to use a space-and-time efficient +binary representation for the state. (The format is documented on the `Repr` +type below.) If we just used one type for everything, it would be possible for +callers to use an incorrect interleaving of calls and thus result in a corrupt +representation. I chose to use more type machinery to make this impossible to +do because 1) determinization is itself pretty complex and it wouldn't be too +hard to foul this up and 2) there isn't too much machinery involved and it's +well contained. + +As an optimization, sometimes states won't have certain things set. For +example, if the underlying NFA has no word boundary assertions, then there is +no reason to set a state's look-behind assertion as to whether it was generated +from a word byte or not. Similarly, if a state has no NFA states corresponding +to look-around assertions, then there is no reason to set `look_have` to a +non-empty set. Finally, callers usually omit unconditional epsilon transitions +when adding NFA state IDs since they aren't discriminatory. + +Finally, the binary representation used by these states is, thankfully, not +serialized anywhere. So any kind of change can be made with reckless abandon, +as long as everything in this module agrees.
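+ +As a quick illustration of that flow (a sketch added for exposition; +`some_nfa_state_id` stands in for a real NFA state ID): + +```ignore +let mut matches = StateBuilderEmpty::new().into_matches(); +matches.add_match_pattern_id(PatternID::ZERO); +let mut builder = matches.into_nfa(); +builder.add_nfa_state_id(some_nfa_state_id); +// Freeze only if no equivalent state is already cached; then recycle the +// allocation for the next state. +let state: State = builder.to_state(); +let reuse: StateBuilderEmpty = builder.clear(); +```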
+*/ + +use core::{convert::TryFrom, mem}; + +use alloc::{sync::Arc, vec::Vec}; + +use crate::{ + nfa::thompson::LookSet, + util::{ + bytes::{self, Endian}, + id::{PatternID, StateID}, + }, +}; + +/// A DFA state that, at its core, is represented by an ordered set of NFA +/// states. +/// +/// This type is intended to be used only in NFA-to-DFA conversion via powerset +/// construction. +/// +/// It may be cheaply cloned and accessed safely from multiple threads +/// simultaneously. +#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub(crate) struct State(Arc<[u8]>); + +/// This Borrow impl permits us to look up any state in a map by its byte +/// representation. This is particularly convenient when one has a StateBuilder +/// and we want to see if a correspondingly equivalent state already exists. If +/// one does exist, then we can reuse the allocation required by StateBuilder +/// without having to convert it into a State first. +impl core::borrow::Borrow<[u8]> for State { + fn borrow(&self) -> &[u8] { + &*self.0 + } +} + +impl core::fmt::Debug for State { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("State").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl State { + pub(crate) fn dead() -> State { + StateBuilderEmpty::new().into_matches().into_nfa().to_state() + } + + pub(crate) fn is_match(&self) -> bool { + self.repr().is_match() + } + + pub(crate) fn is_from_word(&self) -> bool { + self.repr().is_from_word() + } + + pub(crate) fn look_have(&self) -> LookSet { + self.repr().look_have() + } + + pub(crate) fn look_need(&self) -> LookSet { + self.repr().look_need() + } + + pub(crate) fn match_count(&self) -> usize { + self.repr().match_count() + } + + pub(crate) fn match_pattern(&self, index: usize) -> PatternID { + self.repr().match_pattern(index) + } + + pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> { + self.repr().match_pattern_ids() + } + + pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) { + self.repr().iter_match_pattern_ids(f) + } + + pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) { + self.repr().iter_nfa_state_ids(f) + } + + pub(crate) fn memory_usage(&self) -> usize { + self.0.len() + } + + fn repr(&self) -> Repr<'_> { + Repr(&*self.0) + } +} + +/// A state builder that represents an empty state. +/// +/// This is a useful "initial condition" for state construction. It has no +/// NFA state IDs, no assertions set and no pattern IDs. No allocations are +/// made when new() is called. Its main use is for being converted into a +/// builder that can capture assertions and pattern IDs. +#[derive(Clone, Debug)] +pub(crate) struct StateBuilderEmpty(Vec<u8>); + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl StateBuilderEmpty { + pub(crate) fn new() -> StateBuilderEmpty { + StateBuilderEmpty(alloc::vec![]) + } + + pub(crate) fn into_matches(mut self) -> StateBuilderMatches { + self.0.extend_from_slice(&[0, 0, 0]); + StateBuilderMatches(self.0) + } + + fn clear(&mut self) { + self.0.clear(); + } + + pub(crate) fn capacity(&self) -> usize { + self.0.capacity() + } +} + +/// A state builder that collects assertions and pattern IDs. +/// +/// When collecting pattern IDs is finished, this can be converted into a +/// builder that collects NFA state IDs.
+#[derive(Clone)] +pub(crate) struct StateBuilderMatches(Vec<u8>); + +impl core::fmt::Debug for StateBuilderMatches { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl StateBuilderMatches { + pub(crate) fn into_nfa(mut self) -> StateBuilderNFA { + self.repr_vec().close_match_pattern_ids(); + StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO } + } + + pub(crate) fn clear(self) -> StateBuilderEmpty { + let mut builder = StateBuilderEmpty(self.0); + builder.clear(); + builder + } + + pub(crate) fn is_match(&self) -> bool { + self.repr().is_match() + } + + pub(crate) fn is_from_word(&self) -> bool { + self.repr().is_from_word() + } + + pub(crate) fn set_is_from_word(&mut self) { + self.repr_vec().set_is_from_word() + } + + pub(crate) fn look_have(&mut self) -> &mut LookSet { + LookSet::from_repr_mut(&mut self.0[1]) + } + + pub(crate) fn look_need(&mut self) -> &mut LookSet { + LookSet::from_repr_mut(&mut self.0[2]) + } + + pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) { + self.repr_vec().add_match_pattern_id(pid) + } + + fn repr(&self) -> Repr<'_> { + Repr(&self.0) + } + + fn repr_vec(&mut self) -> ReprVec<'_> { + ReprVec(&mut self.0) + } +} + +/// A state builder that collects some assertions and NFA state IDs. +/// +/// When collecting NFA state IDs is finished, this can be used to build a +/// `State` if necessary. +/// +/// When done with building a state (regardless of whether it got kept or not), +/// it's usually a good idea to call `clear` to get an empty builder back so +/// that it can be reused to build the next state. +#[derive(Clone)] +pub(crate) struct StateBuilderNFA { + repr: Vec<u8>, + prev_nfa_state_id: StateID, +} + +impl core::fmt::Debug for StateBuilderNFA { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish() + } +} + +/// For docs on these routines, see the internal Repr and ReprVec types below. +impl StateBuilderNFA { + pub(crate) fn to_state(&self) -> State { + State(Arc::from(&*self.repr)) + } + + pub(crate) fn clear(self) -> StateBuilderEmpty { + let mut builder = StateBuilderEmpty(self.repr); + builder.clear(); + builder + } + + pub(crate) fn is_match(&self) -> bool { + self.repr().is_match() + } + + pub(crate) fn is_from_word(&self) -> bool { + self.repr().is_from_word() + } + + pub(crate) fn look_have(&mut self) -> &mut LookSet { + LookSet::from_repr_mut(&mut self.repr[1]) + } + + pub(crate) fn look_need(&mut self) -> &mut LookSet { + LookSet::from_repr_mut(&mut self.repr[2]) + } + + pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) { + ReprVec(&mut self.repr) + .add_nfa_state_id(&mut self.prev_nfa_state_id, sid) + } + + pub(crate) fn memory_usage(&self) -> usize { + self.repr.len() + } + + pub(crate) fn as_bytes(&self) -> &[u8] { + &self.repr + } + + fn repr(&self) -> Repr<'_> { + Repr(&self.repr) + } + + fn repr_vec(&mut self) -> ReprVec<'_> { + ReprVec(&mut self.repr) + } +} + +/// Repr is a read-only view into the representation of a DFA state. +/// +/// Primarily, a Repr is how we achieve DRY: we implement decoding the format +/// in one place, and then use a Repr to implement the various methods on the +/// public state types. +/// +/// The format is as follows: +/// +/// The first three bytes correspond to bitsets.
+/// +/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the +/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1 +/// if the state has pattern IDs explicitly written to it. (This is a flag that +/// is not meant to be set by determinization, but rather, is used as part of +/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was +/// generated by a transition over a "word" byte. (Callers may not always set +/// this. For example, if the NFA has no word boundary assertion, then needing +/// to track whether a state came from a word byte or not is superfluous and +/// wasteful.) +/// +/// Byte 1 corresponds to the look-behind assertions that were satisfied by +/// the transition that created this state. This generally only includes the +/// StartLine and StartText assertions. (Look-ahead assertions are not tracked +/// as part of states. Instead, these are applied by re-computing the epsilon +/// closure of a state when computing the transition function. See `next` in +/// the parent module.) +/// +/// Byte 2 corresponds to the set of look-around assertions (including both +/// look-behind and look-ahead) that appear somewhere in this state's set of +/// NFA state IDs. This is used to determine whether this state's epsilon +/// closure should be re-computed when computing the transition function. +/// Namely, look-around assertions are "just" conditional epsilon transitions, +/// so if there are new assertions available when computing the transition +/// function, we should only re-compute the epsilon closure if those new +/// assertions are relevant to this particular state. +/// +/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer +/// corresponding to the number of patterns encoded in this state. If the state +/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is +/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte +/// offset 3 is the position at which the first NFA state ID is encoded. +/// +/// For a match state with at least one non-ZERO pattern ID, the next bytes +/// correspond to a sequence of 32-bit native endian encoded integers that +/// represent each pattern ID, in order, that this match state represents. +/// +/// After the pattern IDs (if any), NFA state IDs are delta encoded as +/// varints.[1] The first NFA state ID is encoded as itself, and each +/// subsequent NFA state ID is encoded as the difference between itself and the +/// previous NFA state ID. +/// +/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints +struct Repr<'a>(&'a [u8]); + +impl<'a> Repr<'a> { + /// Returns true if and only if this is a match state. + /// + /// If callers have added pattern IDs to this state, then callers MUST set + /// this state as a match state explicitly. However, as a special case, + /// a state that is marked as a match state but has no pattern IDs is + /// treated as if it had a single pattern ID equivalent to + /// PatternID::ZERO. + fn is_match(&self) -> bool { + self.0[0] & (1 << 0) > 0 + } + + /// Returns true if and only if this state has had at least one pattern + /// ID added to it. + /// + /// This is an internal-only flag that permits the representation to save + /// space in the common case of an NFA with one pattern in it. In that + /// case, a match state can only ever have exactly one pattern ID: + /// PatternID::ZERO. So there's no need to represent it.
+ fn has_pattern_ids(&self) -> bool { + self.0[0] & (1 << 1) > 0 + } + + /// Returns true if and only if this state is marked as having been created + /// from a transition over a word byte. This is useful for checking whether + /// a word boundary assertion is true or not, which requires look-behind + /// (whether the current state came from a word byte or not) and look-ahead + /// (whether the transition byte is a word byte or not). + /// + /// Since states with this set are distinct from states that don't have + /// this set (even if they are otherwise equivalent), callers should not + /// set this assertion unless the underlying NFA has at least one word + /// boundary assertion somewhere. Otherwise, a superfluous number of states + /// may be created. + fn is_from_word(&self) -> bool { + self.0[0] & (1 << 2) > 0 + } + + /// The set of look-behind assertions that were true in the transition that + /// created this state. + /// + /// Generally, this should be empty if 'look_need' is empty, since there is + /// no reason to track which look-behind assertions are true if the state + /// has no conditional epsilon transitions. + /// + /// Satisfied look-ahead assertions are not tracked in states. Instead, + /// these are re-computed on demand via epsilon closure when computing the + /// transition function. + fn look_have(&self) -> LookSet { + LookSet::from_repr(self.0[1]) + } + + /// The set of look-around (both behind and ahead) assertions that appear + /// at least once in this state's set of NFA states. + /// + /// This is used to determine whether the epsilon closure needs to be + /// re-computed when computing the transition function. Namely, if the + /// state has no conditional epsilon transitions, then there is no need + /// to re-compute the epsilon closure. + fn look_need(&self) -> LookSet { + LookSet::from_repr(self.0[2]) + } + + /// Returns the total number of match pattern IDs in this state. + /// + /// If this state is not a match state, then this always returns 0. + fn match_count(&self) -> usize { + if !self.is_match() { + return 0; + } else if !self.has_pattern_ids() { + 1 + } else { + self.encoded_pattern_count() + } + } + + /// Returns the pattern ID for this match state at the given index. + /// + /// If the given index is greater than or equal to `match_count()` for this + /// state, then this could panic or return incorrect results. + fn match_pattern(&self, index: usize) -> PatternID { + if !self.has_pattern_ids() { + PatternID::ZERO + } else { + let offset = 7 + index * PatternID::SIZE; + // This is OK since we only ever serialize valid PatternIDs to + // states. + bytes::read_pattern_id_unchecked(&self.0[offset..]).0 + } + } + + /// Returns a copy of all match pattern IDs in this state. If this state + /// is not a match state, then this returns None. + fn match_pattern_ids(&self) -> Option<Vec<PatternID>> { + if !self.is_match() { + return None; + } + let mut pids = alloc::vec![]; + self.iter_match_pattern_ids(|pid| pids.push(pid)); + Some(pids) + } + + /// Calls the given function on every pattern ID in this state. + fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) { + if !self.is_match() { + return; + } + // As an optimization for a very common case, when this is a match + // state for an NFA with only one pattern, we don't actually write the + // pattern ID to the state representation. Instead, we know it must + // be there since it is the only possible choice. 
+ if !self.has_pattern_ids() { + f(PatternID::ZERO); + return; + } + let mut pids = &self.0[7..self.pattern_offset_end()]; + while !pids.is_empty() { + let pid = bytes::read_u32(pids); + pids = &pids[PatternID::SIZE..]; + // This is OK since we only ever serialize valid PatternIDs to + // states. And since pattern IDs can never exceed a usize, the + // unwrap is OK. + f(PatternID::new_unchecked(usize::try_from(pid).unwrap())); + } + } + + /// Calls the given function on every NFA state ID in this state. + fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) { + let mut sids = &self.0[self.pattern_offset_end()..]; + let mut prev = 0i32; + while !sids.is_empty() { + let (delta, nr) = read_vari32(sids); + sids = &sids[nr..]; + let sid = prev + delta; + prev = sid; + // This is OK since we only ever serialize valid StateIDs to + // states. And since state IDs can never exceed an isize, they must + // always be able to fit into a usize, and thus the cast is OK. + f(StateID::new_unchecked(sid as usize)) + } + } + + /// Returns the offset into this state's representation where the pattern + /// IDs end and the NFA state IDs begin. + fn pattern_offset_end(&self) -> usize { + let encoded = self.encoded_pattern_count(); + if encoded == 0 { + return 3; + } + // This arithmetic is OK since we were able to address this many bytes + // when writing to the state, thus, it must fit into a usize. + encoded.checked_mul(4).unwrap().checked_add(7).unwrap() + } + + /// Returns the total number of *encoded* pattern IDs in this state. + /// + /// This may return 0 even when this is a match state, since the pattern + /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in + /// the match state (the overwhelming common case). + fn encoded_pattern_count(&self) -> usize { + if !self.has_pattern_ids() { + return 0; + } + // This unwrap is OK since the total number of patterns is always + // guaranteed to fit into a usize. + usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap() + } +} + +impl<'a> core::fmt::Debug for Repr<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut nfa_ids = alloc::vec![]; + self.iter_nfa_state_ids(|sid| nfa_ids.push(sid)); + f.debug_struct("Repr") + .field("is_match", &self.is_match()) + .field("is_from_word", &self.is_from_word()) + .field("look_have", &self.look_have()) + .field("look_need", &self.look_need()) + .field("match_pattern_ids", &self.match_pattern_ids()) + .field("nfa_state_ids", &nfa_ids) + .finish() + } +} + +/// ReprVec is a write-only view into the representation of a DFA state. +/// +/// See Repr for more details on the purpose of this type and also the format. +/// +/// Note that not all possible combinations of methods may be called. This is +/// precisely what the various StateBuilder types encapsulate: they only +/// permit valid combinations via Rust's linear typing. +struct ReprVec<'a>(&'a mut Vec<u8>); + +impl<'a> ReprVec<'a> { + /// Set this state as a match state. + /// + /// This should not be exposed explicitly outside of this module. It is + /// set automatically when a pattern ID is added. + fn set_is_match(&mut self) { + self.0[0] |= 1 << 0; + } + + /// Set that this state has pattern IDs explicitly written to it. + /// + /// This should not be exposed explicitly outside of this module. This is + /// used internally as a space saving optimization. Namely, if the state + /// is a match state but does not have any pattern IDs written to it, + /// then it is automatically inferred to have a pattern ID of ZERO.
+ fn set_has_pattern_ids(&mut self) { + self.0[0] |= 1 << 1; + } + + /// Set this state as being built from a transition over a word byte. + /// + /// Setting this is only necessary when one needs to deal with word + /// boundary assertions. Therefore, if the underlying NFA has no word + /// boundary assertions, callers should not set this. + fn set_is_from_word(&mut self) { + self.0[0] |= 1 << 2; + } + + /// Return a mutable reference to the 'look_have' assertion set. + fn look_have_mut(&mut self) -> &mut LookSet { + LookSet::from_repr_mut(&mut self.0[1]) + } + + /// Return a mutable reference to the 'look_need' assertion set. + fn look_need_mut(&mut self) -> &mut LookSet { + LookSet::from_repr_mut(&mut self.0[2]) + } + + /// Add a pattern ID to this state. All match states must have at least + /// one pattern ID associated with them. + /// + /// Callers must never add duplicative pattern IDs. + /// + /// The order in which patterns are added must correspond to the order + /// in which patterns are reported as matches. + fn add_match_pattern_id(&mut self, pid: PatternID) { + // As a (somewhat small) space saving optimization, in the case where + // a matching state has exactly one pattern ID, PatternID::ZERO, we do + // not write either the pattern ID or the number of patterns encoded. + // Instead, all we do is set the 'is_match' bit on this state. Overall, + // this saves 8 bytes per match state for the overwhelming majority of + // match states. + // + // In order to know whether pattern IDs need to be explicitly read or + // not, we use another internal-only bit, 'has_pattern_ids', to + // indicate whether they have been explicitly written or not. + if !self.repr().has_pattern_ids() { + if pid == PatternID::ZERO { + self.set_is_match(); + return; + } + // Make room for 'close_match_pattern_ids' to write the total + // number of pattern IDs written. + self.0.extend(core::iter::repeat(0).take(PatternID::SIZE)); + self.set_has_pattern_ids(); + // If this was already a match state, then the only way that's + // possible when the state doesn't have pattern IDs is if + // PatternID::ZERO was added by the caller previously. In this + // case, we are now adding a non-ZERO pattern ID after it, in + // which case, we want to make sure to represent ZERO explicitly + // now. + if self.repr().is_match() { + write_u32(self.0, 0) + } else { + // Otherwise, just make sure the 'is_match' bit is set. + self.set_is_match(); + } + } + write_u32(self.0, pid.as_u32()); + } + + /// Indicate that no more pattern IDs will be added to this state. + /// + /// Once this is called, callers must not call it or 'add_match_pattern_id' + /// again. + /// + /// This should not be exposed explicitly outside of this module. It + /// should be called only when converting a StateBuilderMatches into a + /// StateBuilderNFA. + fn close_match_pattern_ids(&mut self) { + // If we never wrote any pattern IDs, then there's nothing to do here. + if !self.repr().has_pattern_ids() { + return; + } + let patsize = PatternID::SIZE; + let pattern_bytes = self.0.len() - 7; + // Every pattern ID uses 4 bytes, so the number of bytes should be + // divisible by 4. + assert_eq!(pattern_bytes % patsize, 0); + // This unwrap is OK since we are guaranteed that the maximum number + // of possible patterns fits into a u32. + let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); + bytes::NE::write_u32(count32, &mut self.0[3..7]); + } + + /// Add an NFA state ID to this state. The order in which NFA states are + /// added matters.
+    /// duplicate NFA state IDs are not added.
+    fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
+        let delta = sid.as_i32() - prev.as_i32();
+        write_vari32(self.0, delta);
+        *prev = sid;
+    }
+
+    /// Return a read-only view of this state's representation.
+    fn repr(&self) -> Repr<'_> {
+        Repr(self.0.as_slice())
+    }
+}
+
+/// Write a signed 32-bit integer using zig-zag encoding.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+    let mut un = (n as u32) << 1;
+    if n < 0 {
+        un = !un;
+    }
+    write_varu32(data, un)
+}
+
+/// Read a signed 32-bit integer using zig-zag encoding. Also, return the
+/// number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+    let (un, i) = read_varu32(data);
+    let mut n = (un >> 1) as i32;
+    if un & 1 != 0 {
+        n = !n;
+    }
+    (n, i)
+}
+
+/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written
+/// as a sequence of bytes where all bytes except for the last one have the
+/// most significant bit set. The least significant 7 bits correspond to the
+/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in
+/// very common cases, it uses fewer than 4.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+    while n >= 0b1000_0000 {
+        data.push((n as u8) | 0b1000_0000);
+        n >>= 7;
+    }
+    data.push(n as u8);
+}
+
+/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+    // N.B. We can assume correctness here since we know that all varuints are
+    // written with write_varu32. Hence, the 'as' uses and unchecked arithmetic
+    // are all okay.
+    let mut n: u32 = 0;
+    let mut shift: u32 = 0;
+    for (i, &b) in data.iter().enumerate() {
+        if b < 0b1000_0000 {
+            return (n | ((b as u32) << shift), i + 1);
+        }
+        n |= ((b as u32) & 0b0111_1111) << shift;
+        shift += 7;
+    }
+    (0, 0)
+}
+
+/// Push a native-endian encoded `n` onto `dst`.
+fn write_u32(dst: &mut Vec<u8>, n: u32) {
+    use crate::util::bytes::{Endian, NE};
+
+    let start = dst.len();
+    dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>()));
+    NE::write_u32(n, &mut dst[start..]);
+}
+
+#[cfg(test)]
+mod tests {
+    use alloc::vec;
+
+    use quickcheck::quickcheck;
+
+    use super::*;
+
+    quickcheck! {
+        fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
+            // Builder states do not permit duplicate IDs.
+            let sids = dedup_state_ids(sids);
+
+            let mut b = StateBuilderEmpty::new().into_matches().into_nfa();
+            for &sid in &sids {
+                b.add_nfa_state_id(sid);
+            }
+            let s = b.to_state();
+            let mut got = vec![];
+            s.iter_nfa_state_ids(|sid| got.push(sid));
+            got == sids
+        }
+
+        fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
+            // Builder states do not permit duplicate IDs.
+            let pids = dedup_pattern_ids(pids);
+
+            let mut b = StateBuilderEmpty::new().into_matches();
+            for &pid in &pids {
+                b.add_match_pattern_id(pid);
+            }
+            let s = b.into_nfa().to_state();
+            let mut got = vec![];
+            s.iter_match_pattern_ids(|pid| got.push(pid));
+            got == pids
+        }
+
+        fn prop_state_read_write_nfa_state_and_pattern_ids(
+            sids: Vec<StateID>,
+            pids: Vec<PatternID>
+        ) -> bool {
+            // Builder states do not permit duplicate IDs.
+            let sids = dedup_state_ids(sids);
+            let pids = dedup_pattern_ids(pids);
+
+            let mut b = StateBuilderEmpty::new().into_matches();
+            for &pid in &pids {
+                b.add_match_pattern_id(pid);
+            }
+
+            let mut b = b.into_nfa();
+            for &sid in &sids {
+                b.add_nfa_state_id(sid);
+            }
+
+            let s = b.to_state();
+            let mut got_pids = vec![];
+            s.iter_match_pattern_ids(|pid| got_pids.push(pid));
+            let mut got_sids = vec![];
+            s.iter_nfa_state_ids(|sid| got_sids.push(sid));
+            got_pids == pids && got_sids == sids
+        }
+
+        fn prop_read_write_varu32(n: u32) -> bool {
+            let mut buf = vec![];
+            write_varu32(&mut buf, n);
+            let (got, nread) = read_varu32(&buf);
+            nread == buf.len() && got == n
+        }
+
+        fn prop_read_write_vari32(n: i32) -> bool {
+            let mut buf = vec![];
+            write_vari32(&mut buf, n);
+            let (got, nread) = read_vari32(&buf);
+            nread == buf.len() && got == n
+        }
+    }
+
+    fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
+        let mut set = alloc::collections::BTreeSet::new();
+        let mut deduped = vec![];
+        for sid in sids {
+            if set.contains(&sid) {
+                continue;
+            }
+            set.insert(sid);
+            deduped.push(sid);
+        }
+        deduped
+    }
+
+    fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
+        let mut set = alloc::collections::BTreeSet::new();
+        let mut deduped = vec![];
+        for pid in pids {
+            if set.contains(&pid) {
+                continue;
+            }
+            set.insert(pid);
+            deduped.push(pid);
+        }
+        deduped
+    }
+}
diff --git a/vendor/regex-automata-0.2.0/src/util/id.rs b/vendor/regex-automata-0.2.0/src/util/id.rs
new file mode 100644
index 000000000..70bf0a93b
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/util/id.rs
@@ -0,0 +1,608 @@
+/*!
+Type definitions for identifier types.
+
+A [`StateID`] represents the possible set of identifiers used in regex engine
+implementations in this crate. For example, they are used to identify both NFA
+and DFA states.
+
+A [`PatternID`] represents the possible set of identifiers for patterns. All
+regex engine implementations in this crate support searching for multiple
+patterns simultaneously. A `PatternID` is how each pattern is uniquely
+identified for a particular instance of a regex engine. Namely, a pattern is
+assigned an auto-incrementing integer, starting at `0`, based on the order of
+patterns supplied during the construction of the regex engine.
+
+These identifier types represent a way for this crate to make correctness
+guarantees around the possible set of values that a `StateID` or a `PatternID`
+might represent. Similarly, they also provide a way of constraining the size of
+these identifiers to reduce space usage while still guaranteeing that all such
+identifiers are representable by a `usize` for the current target.
+
+Moreover, the identifier types clamp the range of permissible values to a range
+that is typically smaller than their internal representation. (With the maximum
+value being, e.g., `StateID::MAX`.) Users of these types must not rely on this
+clamping for the purpose of memory safety. Users may, however, rely on these
+invariants to avoid panics or other types of logic bugs.
+*/
+
+// Continuing from the above comment about correctness guarantees, an example
+// of a way in which we use the guarantees on these types is delta encoding.
+// Namely, we require that IDs can be at most 2^31 - 2, which means the
+// difference between any two IDs is always representable as an i32.
+
+use core::{
+    convert::{Infallible, TryFrom},
+    mem, ops,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+/// An identifier for a regex pattern.
+///
+/// The identifier for a pattern corresponds to its relative position among
+/// other patterns in a single finite state machine. Namely, when building
+/// a multi-pattern regex engine, one must supply a sequence of patterns to
+/// match. The position (starting at 0) of each pattern in that sequence
+/// represents its identifier. This identifier is in turn used to identify and
+/// report matches of that pattern in various APIs.
+///
+/// A pattern ID is guaranteed to be representable by a `usize`. Similarly,
+/// the number of patterns in any regex engine in this crate is guaranteed to
+/// be representable by a `usize`. This applies to regex engines that have
+/// been deserialized; a deserialization error will be returned if it contains
+/// pattern IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `PatternID` to index slices.
+///
+/// # Safety
+///
+/// While a `PatternID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
+#[repr(transparent)]
+#[derive(
+    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct PatternID(u32);
+
+impl PatternID {
+    /// The maximum pattern ID value.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    pub const MAX: PatternID =
+        PatternID::new_unchecked(core::i32::MAX as usize - 1);
+
+    /// The maximum pattern ID value.
+    #[cfg(target_pointer_width = "16")]
+    pub const MAX: PatternID =
+        PatternID::new_unchecked(core::isize::MAX as usize - 1);
+
+    /// The total number of patterns that are allowed in any single regex
+    /// engine, represented as a `usize`.
+    pub const LIMIT: usize = PatternID::MAX.as_usize() + 1;
+
+    /// The zero pattern ID value.
+    pub const ZERO: PatternID = PatternID::new_unchecked(0);
+
+    /// The number of bytes that a single `PatternID` uses in memory.
+    pub const SIZE: usize = core::mem::size_of::<PatternID>();
+
+    /// Create a new pattern ID.
+    ///
+    /// If the given identifier exceeds [`PatternID::MAX`], then this returns
+    /// an error.
+    #[inline]
+    pub fn new(id: usize) -> Result<PatternID, PatternIDError> {
+        PatternID::try_from(id)
+    }
+
+    /// Create a new pattern ID without checking whether the given value
+    /// exceeds [`PatternID::MAX`].
+    ///
+    /// While this is unchecked, providing an incorrect value must never
+    /// sacrifice memory safety, as documented above.
+    #[inline]
+    pub const fn new_unchecked(id: usize) -> PatternID {
+        PatternID(id as u32)
+    }
+
+    /// Like [`PatternID::new`], but panics if the given ID is not valid.
+    #[inline]
+    pub fn must(id: usize) -> PatternID {
+        PatternID::new(id).unwrap()
+    }
+
+    /// Return this pattern ID as a `usize`.
+    #[inline]
+    pub const fn as_usize(&self) -> usize {
+        self.0 as usize
+    }
+
+    /// Return the internal u32 of this pattern ID.
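+    ///
+    /// A minimal sketch (this assumes the crate-root re-export of
+    /// `PatternID`):
+    ///
+    /// ```
+    /// use regex_automata::PatternID;
+    ///
+    /// let pid = PatternID::must(5);
+    /// assert_eq!(5, pid.as_u32());
+    /// ```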
+    #[inline]
+    pub const fn as_u32(&self) -> u32 {
+        self.0
+    }
+
+    /// Return the internal u32 of this pattern ID represented as an i32.
+    ///
+    /// This is guaranteed to never overflow an `i32`.
+    #[inline]
+    pub const fn as_i32(&self) -> i32 {
+        self.0 as i32
+    }
+
+    /// Returns one more than this pattern ID as a usize.
+    ///
+    /// Since a pattern ID has constraints on its maximum value, adding `1` to
+    /// it will always fit in a `usize` (and a `u32`).
+    #[inline]
+    pub fn one_more(&self) -> usize {
+        self.as_usize().checked_add(1).unwrap()
+    }
+
+    /// Decode this pattern ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// If the decoded integer is not representable as a pattern ID for the
+    /// current target, then this returns an error.
+    #[inline]
+    pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> {
+        let id = u32::from_ne_bytes(bytes);
+        if id > PatternID::MAX.as_u32() {
+            return Err(PatternIDError { attempted: id as u64 });
+        }
+        Ok(PatternID::new_unchecked(id as usize))
+    }
+
+    /// Decode this pattern ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// This is analogous to [`PatternID::new_unchecked`] in that it does not
+    /// check whether the decoded integer is representable as a pattern ID.
+    #[inline]
+    pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID {
+        PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+    }
+
+    /// Return the underlying pattern ID integer as raw bytes in native endian
+    /// format.
+    #[inline]
+    pub fn to_ne_bytes(&self) -> [u8; 4] {
+        self.0.to_ne_bytes()
+    }
+
+    /// Returns an iterator over all pattern IDs from 0 up to and not including
+    /// the given length.
+    ///
+    /// If the given length exceeds [`PatternID::LIMIT`], then this panics.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn iter(len: usize) -> PatternIDIter {
+        PatternIDIter::new(len)
+    }
+}
+
+/// This error occurs when a pattern ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum pattern ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PatternIDError {
+    attempted: u64,
+}
+
+impl PatternIDError {
+    /// Returns the value that failed to construct a pattern ID.
+    pub fn attempted(&self) -> u64 {
+        self.attempted
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for PatternIDError {}
+
+impl core::fmt::Display for PatternIDError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "failed to create PatternID from {:?}, which exceeds {:?}",
+            self.attempted(),
+            PatternID::MAX,
+        )
+    }
+}
+
+/// An identifier for a state in a regex engine.
+///
+/// A state ID is guaranteed to be representable by a `usize`. Similarly, the
+/// number of states in any regex engine in this crate is guaranteed to be
+/// representable by a `usize`. This applies to regex engines that have been
+/// deserialized; a deserialization error will be returned if it contains state
+/// IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`.
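+///
+/// For example (a brief sketch; like `PatternID`, this assumes the
+/// crate-root re-export of `StateID`):
+///
+/// ```
+/// use regex_automata::StateID;
+///
+/// let sid = StateID::must(42);
+/// assert_eq!(42, sid.as_usize());
+/// ```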
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `StateID` to index slices.
+///
+/// # Safety
+///
+/// While a `StateID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
+#[repr(transparent)]
+#[derive(
+    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct StateID(u32);
+
+impl StateID {
+    /// The maximum state ID value.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    pub const MAX: StateID =
+        StateID::new_unchecked(core::i32::MAX as usize - 1);
+
+    /// The maximum state ID value.
+    #[cfg(target_pointer_width = "16")]
+    pub const MAX: StateID =
+        StateID::new_unchecked(core::isize::MAX as usize - 1);
+
+    /// The total number of states that are allowed in any single regex
+    /// engine, represented as a `usize`.
+    pub const LIMIT: usize = StateID::MAX.as_usize() + 1;
+
+    /// The zero state ID value.
+    pub const ZERO: StateID = StateID::new_unchecked(0);
+
+    /// The number of bytes that a single `StateID` uses in memory.
+    pub const SIZE: usize = core::mem::size_of::<StateID>();
+
+    /// Create a new state ID.
+    ///
+    /// If the given identifier exceeds [`StateID::MAX`], then this returns
+    /// an error.
+    #[inline]
+    pub fn new(id: usize) -> Result<StateID, StateIDError> {
+        StateID::try_from(id)
+    }
+
+    /// Create a new state ID without checking whether the given value
+    /// exceeds [`StateID::MAX`].
+    ///
+    /// While this is unchecked, providing an incorrect value must never
+    /// sacrifice memory safety, as documented above.
+    #[inline]
+    pub const fn new_unchecked(id: usize) -> StateID {
+        StateID(id as u32)
+    }
+
+    /// Like [`StateID::new`], but panics if the given ID is not valid.
+    #[inline]
+    pub fn must(id: usize) -> StateID {
+        StateID::new(id).unwrap()
+    }
+
+    /// Return this state ID as a `usize`.
+    #[inline]
+    pub const fn as_usize(&self) -> usize {
+        self.0 as usize
+    }
+
+    /// Return the internal u32 of this state ID.
+    #[inline]
+    pub const fn as_u32(&self) -> u32 {
+        self.0
+    }
+
+    /// Return the internal u32 of this state ID represented as an i32.
+    ///
+    /// This is guaranteed to never overflow an `i32`.
+    #[inline]
+    pub const fn as_i32(&self) -> i32 {
+        self.0 as i32
+    }
+
+    /// Returns one more than this state ID as a usize.
+    ///
+    /// Since a state ID has constraints on its maximum value, adding `1` to
+    /// it will always fit in a `usize` (and a `u32`).
+    #[inline]
+    pub fn one_more(&self) -> usize {
+        self.as_usize().checked_add(1).unwrap()
+    }
+
+    /// Decode this state ID from the bytes given using the native endian byte
+    /// order for the current target.
+    ///
+    /// If the decoded integer is not representable as a state ID for the
+    /// current target, then this returns an error.
+    #[inline]
+    pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> {
+        let id = u32::from_ne_bytes(bytes);
+        if id > StateID::MAX.as_u32() {
+            return Err(StateIDError { attempted: id as u64 });
+        }
+        Ok(StateID::new_unchecked(id as usize))
+    }
+
+    /// Decode this state ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// This is analogous to [`StateID::new_unchecked`] in that it does not
+    /// check whether the decoded integer is representable as a state ID.
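+    ///
+    /// As a sketch, this round-trips an ID through its native-endian bytes
+    /// (the value here is arbitrary, and is assumed to be in range since this
+    /// method performs no checks):
+    ///
+    /// ```
+    /// use regex_automata::StateID;
+    ///
+    /// let sid = StateID::must(17);
+    /// assert_eq!(sid, StateID::from_ne_bytes_unchecked(sid.to_ne_bytes()));
+    /// ```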
+    #[inline]
+    pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID {
+        StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+    }
+
+    /// Return the underlying state ID integer as raw bytes in native endian
+    /// format.
+    #[inline]
+    pub fn to_ne_bytes(&self) -> [u8; 4] {
+        self.0.to_ne_bytes()
+    }
+
+    /// Returns an iterator over all state IDs from 0 up to and not including
+    /// the given length.
+    ///
+    /// If the given length exceeds [`StateID::LIMIT`], then this panics.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn iter(len: usize) -> StateIDIter {
+        StateIDIter::new(len)
+    }
+}
+
+/// This error occurs when a state ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum state ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct StateIDError {
+    attempted: u64,
+}
+
+impl StateIDError {
+    /// Returns the value that failed to construct a state ID.
+    pub fn attempted(&self) -> u64 {
+        self.attempted
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StateIDError {}
+
+impl core::fmt::Display for StateIDError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "failed to create StateID from {:?}, which exceeds {:?}",
+            self.attempted(),
+            StateID::MAX,
+        )
+    }
+}
+
+/// A macro for defining exactly identical (modulo names) impls for ID types.
+macro_rules! impls {
+    ($ty:ident, $tyerr:ident, $tyiter:ident) => {
+        #[derive(Clone, Debug)]
+        pub(crate) struct $tyiter {
+            rng: ops::Range<usize>,
+        }
+
+        impl $tyiter {
+            #[cfg(feature = "alloc")]
+            fn new(len: usize) -> $tyiter {
+                assert!(
+                    len <= $ty::LIMIT,
+                    "cannot create iterator with IDs when number of \
+                     elements exceeds {:?}",
+                    $ty::LIMIT,
+                );
+                $tyiter { rng: 0..len }
+            }
+        }
+
+        impl Iterator for $tyiter {
+            type Item = $ty;
+
+            fn next(&mut self) -> Option<$ty> {
+                if self.rng.start >= self.rng.end {
+                    return None;
+                }
+                let next_id = self.rng.start + 1;
+                let id = mem::replace(&mut self.rng.start, next_id);
+                // new_unchecked is OK since we asserted that the number of
+                // elements in this iterator will fit in an ID at construction.
+ Some($ty::new_unchecked(id)) + } + } + + impl<T> core::ops::Index<$ty> for [T] { + type Output = T; + + #[inline] + fn index(&self, index: $ty) -> &T { + &self[index.as_usize()] + } + } + + impl<T> core::ops::IndexMut<$ty> for [T] { + #[inline] + fn index_mut(&mut self, index: $ty) -> &mut T { + &mut self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::Index<$ty> for Vec<T> { + type Output = T; + + #[inline] + fn index(&self, index: $ty) -> &T { + &self[index.as_usize()] + } + } + + #[cfg(feature = "alloc")] + impl<T> core::ops::IndexMut<$ty> for Vec<T> { + #[inline] + fn index_mut(&mut self, index: $ty) -> &mut T { + &mut self[index.as_usize()] + } + } + + impl TryFrom<usize> for $ty { + type Error = $tyerr; + + fn try_from(id: usize) -> Result<$ty, $tyerr> { + if id > $ty::MAX.as_usize() { + return Err($tyerr { attempted: id as u64 }); + } + Ok($ty::new_unchecked(id)) + } + } + + impl TryFrom<u8> for $ty { + type Error = Infallible; + + fn try_from(id: u8) -> Result<$ty, Infallible> { + Ok($ty::new_unchecked(id as usize)) + } + } + + impl TryFrom<u16> for $ty { + type Error = $tyerr; + + fn try_from(id: u16) -> Result<$ty, $tyerr> { + if id as u32 > $ty::MAX.as_u32() { + return Err($tyerr { attempted: id as u64 }); + } + Ok($ty::new_unchecked(id as usize)) + } + } + + impl TryFrom<u32> for $ty { + type Error = $tyerr; + + fn try_from(id: u32) -> Result<$ty, $tyerr> { + if id > $ty::MAX.as_u32() { + return Err($tyerr { attempted: id as u64 }); + } + Ok($ty::new_unchecked(id as usize)) + } + } + + impl TryFrom<u64> for $ty { + type Error = $tyerr; + + fn try_from(id: u64) -> Result<$ty, $tyerr> { + if id > $ty::MAX.as_u32() as u64 { + return Err($tyerr { attempted: id }); + } + Ok($ty::new_unchecked(id as usize)) + } + } + + #[cfg(test)] + impl quickcheck::Arbitrary for $ty { + fn arbitrary(gen: &mut quickcheck::Gen) -> $ty { + use core::cmp::max; + + let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs(); + if id > $ty::MAX.as_i32() { + $ty::MAX + } else { + $ty::new(usize::try_from(id).unwrap()).unwrap() + } + } + } + }; +} + +impls!(PatternID, PatternIDError, PatternIDIter); +impls!(StateID, StateIDError, StateIDIter); + +/// A utility trait that defines a couple of adapters for making it convenient +/// to access indices as ID types. We require ExactSizeIterator so that +/// iterator construction can do a single check to make sure the index of each +/// element is representable by its ID type. +#[cfg(feature = "alloc")] +pub(crate) trait IteratorIDExt: Iterator { + fn with_pattern_ids(self) -> WithPatternIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithPatternIDIter::new(self) + } + + fn with_state_ids(self) -> WithStateIDIter<Self> + where + Self: Sized + ExactSizeIterator, + { + WithStateIDIter::new(self) + } +} + +#[cfg(feature = "alloc")] +impl<I: Iterator> IteratorIDExt for I {} + +#[cfg(feature = "alloc")] +macro_rules! iditer { + ($ty:ident, $iterty:ident, $withiterty:ident) => { + /// An iterator adapter that is like std::iter::Enumerate, but attaches + /// IDs. It requires ExactSizeIterator. At construction, it ensures + /// that the index of each element in the iterator is representable in + /// the corresponding ID type. 
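+        ///
+        /// A crate-internal sketch of how the adapter is used (the
+        /// `patterns` iterator here is hypothetical):
+        ///
+        /// ```ignore
+        /// use crate::util::id::IteratorIDExt;
+        ///
+        /// for (pid, pattern) in patterns.iter().with_pattern_ids() {
+        ///     // 'pid' is the ID of the position 'pattern' occupies.
+        /// }
+        /// ```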
+        #[derive(Clone, Debug)]
+        pub(crate) struct $withiterty<I> {
+            it: I,
+            ids: $iterty,
+        }
+
+        impl<I: Iterator + ExactSizeIterator> $withiterty<I> {
+            fn new(it: I) -> $withiterty<I> {
+                let ids = $ty::iter(it.len());
+                $withiterty { it, ids }
+            }
+        }
+
+        impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> {
+            type Item = ($ty, I::Item);
+
+            fn next(&mut self) -> Option<($ty, I::Item)> {
+                let item = self.it.next()?;
+                // Number of elements in this iterator must match, according
+                // to contract of ExactSizeIterator.
+                let id = self.ids.next().unwrap();
+                Some((id, item))
+            }
+        }
+    };
+}
+
+#[cfg(feature = "alloc")]
+iditer!(PatternID, PatternIDIter, WithPatternIDIter);
+#[cfg(feature = "alloc")]
+iditer!(StateID, StateIDIter, WithStateIDIter);
diff --git a/vendor/regex-automata-0.2.0/src/util/lazy.rs b/vendor/regex-automata-0.2.0/src/util/lazy.rs
new file mode 100644
index 000000000..d8cac6ef4
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/util/lazy.rs
@@ -0,0 +1,31 @@
+use core::{
+    ptr,
+    sync::atomic::{AtomicPtr, Ordering},
+};
+
+use alloc::boxed::Box;
+
+/// Lazily initialize a static value behind an atomic pointer.
+///
+/// The first caller runs `init` and publishes the result. If multiple threads
+/// race, each may run `init`, but exactly one result is kept; the losers drop
+/// their redundant value and use the winner's.
+#[inline(always)]
+pub(crate) fn get_or_init<T: Send + Sync + 'static>(
+    location: &'static AtomicPtr<T>,
+    init: impl FnOnce() -> T,
+) -> &'static T {
+    let mut ptr = location.load(Ordering::Acquire);
+    if ptr.is_null() {
+        let new_value = Box::new(init());
+        ptr = Box::into_raw(new_value);
+        let result = location.compare_exchange(
+            ptr::null_mut(),
+            ptr,
+            Ordering::AcqRel,
+            Ordering::Acquire,
+        );
+        if let Err(old) = result {
+            // Another thread won the race; free our copy and use theirs.
+            let redundant = unsafe { Box::from_raw(ptr) };
+            drop(redundant);
+            ptr = old;
+        }
+    }
+    unsafe { &*ptr }
+}
diff --git a/vendor/regex-automata-0.2.0/src/util/matchtypes.rs b/vendor/regex-automata-0.2.0/src/util/matchtypes.rs
new file mode 100644
index 000000000..de0fa65bf
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/util/matchtypes.rs
@@ -0,0 +1,356 @@
+use crate::util::id::PatternID;
+
+/// The kind of match semantics to use for a DFA.
+///
+/// The default match kind is `LeftmostFirst`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+    /// Report all possible matches.
+    All,
+    /// Report only the leftmost matches. When multiple leftmost matches exist,
+    /// report the match corresponding to the part of the regex that appears
+    /// first in the syntax.
+    LeftmostFirst,
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+    // There is prior art in RE2 that shows that we should be able to add
+    // LeftmostLongest too. The tricky part of it is supporting ungreedy
+    // repetitions. Instead of treating all NFA states as having equivalent
+    // priority (as in 'All') or treating all NFA states as having distinct
+    // priority based on order (as in 'LeftmostFirst'), we instead group NFA
+    // states into sets, and treat members of each set as having equivalent
+    // priority, but having greater priority than all following members
+    // of different sets.
+    //
+    // However, it's not clear whether it's really worth adding this. After
+    // all, leftmost-longest can be emulated when using literals by using
+    // leftmost-first and sorting the literals by length in descending order.
+    // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
+    // always match `a` in `ab` when using leftmost-first, but leftmost-longest
+    // would match `ab`.
+}
+
+impl MatchKind {
+    #[cfg(feature = "alloc")]
+    pub(crate) fn continue_past_first_match(&self) -> bool {
+        *self == MatchKind::All
+    }
+}
+
+impl Default for MatchKind {
+    fn default() -> MatchKind {
+        MatchKind::LeftmostFirst
+    }
+}
+
+/// A representation of a match reported by a regex engine.
+///
+/// A match records the start and end offsets of the match in the haystack.
+///
+/// Every match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+    /// The start offset of the match, inclusive.
+    start: usize,
+    /// The end offset of the match, exclusive.
+    end: usize,
+}
+
+impl Match {
+    /// Create a new match from a byte offset span.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start`.
+    #[inline]
+    pub fn new(start: usize, end: usize) -> Match {
+        assert!(start <= end);
+        Match { start, end }
+    }
+
+    /// The starting position of the match.
+    #[inline]
+    pub fn start(&self) -> usize {
+        self.start
+    }
+
+    /// The ending position of the match.
+    #[inline]
+    pub fn end(&self) -> usize {
+        self.end
+    }
+
+    /// Returns the match location as a range.
+    #[inline]
+    pub fn range(&self) -> core::ops::Range<usize> {
+        self.start..self.end
+    }
+
+    /// Returns true if and only if this match is empty. That is, when
+    /// `start() == end()`.
+    ///
+    /// An empty match can only be returned when the regex can match the
+    /// empty string.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.start == self.end
+    }
+}
+
+/// A representation of a match reported by a DFA.
+///
+/// This is called a "half" match because it only includes the end location
+/// (or start location for a reverse match) of a match. This corresponds to the
+/// information that a single DFA scan can report. Getting the other half of
+/// the match requires a second scan with a reversed DFA.
+///
+/// A half match also includes the pattern that matched. The pattern is
+/// identified by an ID, which corresponds to its position (starting from `0`)
+/// relative to other patterns used to construct the corresponding DFA. If only
+/// a single pattern is provided to the DFA, then all matches are guaranteed to
+/// have a pattern ID of `0`.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct HalfMatch {
+    /// The pattern ID.
+    pub(crate) pattern: PatternID,
+    /// The offset of the match.
+    ///
+    /// For forward searches, the offset is exclusive. For reverse searches,
+    /// the offset is inclusive.
+    pub(crate) offset: usize,
+}
+
+impl HalfMatch {
+    /// Create a new half match from a pattern ID and a byte offset.
+    #[inline]
+    pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
+        HalfMatch { pattern, offset }
+    }
+
+    /// Create a new half match from a pattern ID and a byte offset.
+    ///
+    /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
+    /// [`PatternID`]. This panics if the given `usize` is not representable
+    /// as a `PatternID`.
+    #[inline]
+    pub fn must(pattern: usize, offset: usize) -> HalfMatch {
+        HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
+    }
+
+    /// Returns the ID of the pattern that matched.
+    ///
+    /// The ID of a pattern is derived from the position in which it was
+    /// originally inserted into the corresponding DFA. The first pattern has
+    /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
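+    ///
+    /// A short sketch (this assumes the crate-root re-exports of `HalfMatch`
+    /// and `PatternID`):
+    ///
+    /// ```
+    /// use regex_automata::{HalfMatch, PatternID};
+    ///
+    /// let m = HalfMatch::must(3, 10);
+    /// assert_eq!(PatternID::must(3), m.pattern());
+    /// ```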
+    #[inline]
+    pub fn pattern(&self) -> PatternID {
+        self.pattern
+    }
+
+    /// The position of the match.
+    ///
+    /// If this match was produced by a forward search, then the offset is
+    /// exclusive. If this match was produced by a reverse search, then the
+    /// offset is inclusive.
+    #[inline]
+    pub fn offset(&self) -> usize {
+        self.offset
+    }
+}
+
+/// A representation of a multi match reported by a regex engine.
+///
+/// A multi match has two essential pieces of information: the identifier of
+/// the pattern that matched, along with the start and end offsets of the match
+/// in the haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding regex engine. If only a single pattern is provided, then all
+/// multi matches are guaranteed to have a pattern ID of `0`.
+///
+/// Every multi match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct MultiMatch {
+    /// The pattern ID.
+    pattern: PatternID,
+    /// The start offset of the match, inclusive.
+    start: usize,
+    /// The end offset of the match, exclusive.
+    end: usize,
+}
+
+impl MultiMatch {
+    /// Create a new match from a pattern ID and a byte offset span.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start`.
+    #[inline]
+    pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
+        assert!(start <= end);
+        MultiMatch { pattern, start, end }
+    }
+
+    /// Create a new match from a pattern ID and a byte offset span.
+    ///
+    /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
+    /// [`PatternID`].
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+    #[inline]
+    pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
+        MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
+    }
+
+    /// Returns the ID of the pattern that matched.
+    ///
+    /// The ID of a pattern is derived from the position in which it was
+    /// originally inserted into the corresponding regex engine. The first
+    /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
+    /// so on.
+    #[inline]
+    pub fn pattern(&self) -> PatternID {
+        self.pattern
+    }
+
+    /// The starting position of the match.
+    #[inline]
+    pub fn start(&self) -> usize {
+        self.start
+    }
+
+    /// The ending position of the match.
+    #[inline]
+    pub fn end(&self) -> usize {
+        self.end
+    }
+
+    /// Returns the match location as a range.
+    #[inline]
+    pub fn range(&self) -> core::ops::Range<usize> {
+        self.start..self.end
+    }
+
+    /// Returns true if and only if this match is empty. That is, when
+    /// `start() == end()`.
+    ///
+    /// An empty match can only be returned when the regex can match the
+    /// empty string.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.start == self.end
+    }
+}
+
+/// An error type indicating that a search stopped prematurely without finding
+/// a match.
+///
+/// This error type implies that one cannot assume that no matches occur, since
+/// the search stopped before completing.
+///
+/// Normally, when one searches for something, the response is either an
+/// affirmative "it was found at this location" or a negative "not found at
+/// all." However, in some cases, a regex engine can be configured to stop its
+/// search before concluding whether a match exists or not. When this happens,
+/// it may be important for the caller to know why the regex engine gave up and
+/// where in the input it gave up. This error type exposes the 'why' and the
+/// 'where.'
+///
+/// For example, the DFAs provided by this library generally cannot correctly
+/// implement Unicode word boundaries. Instead, they provide an option to
+/// eagerly support them on ASCII text (since Unicode word boundaries are
+/// equivalent to ASCII word boundaries when searching ASCII text), but will
+/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
+/// required to either report the failure to the caller (unergonomic) or
+/// otherwise fall back to some other regex engine (ergonomic, but potentially
+/// costly).
+///
+/// More generally, some regex engines offer the ability for callers to specify
+/// certain bytes that will trigger the regex engine to automatically quit if
+/// they are seen.
+///
+/// Still, there may be other reasons for a failed match. For example,
+/// the hybrid DFA provided by this crate can be configured to give up if it
+/// believes that it is not efficient. This in turn permits callers to choose a
+/// different regex engine.
+///
+/// # Advice
+///
+/// While this form of error reporting adds complexity, it is generally
+/// possible for callers to configure regex engines to never give up a search,
+/// and thus never return an error. Indeed, the default configuration for every
+/// regex engine in this crate is such that they will never stop searching
+/// early. Therefore, the only way to get a match error is if the regex engine
+/// is explicitly configured to do so. Options that enable this behavior
+/// document the new error conditions they imply.
+///
+/// Regex engines for which no errors are possible for any configuration will
+/// return the normal `Option<Match>` and not use this error type at all.
+///
+/// For example, regex engines in the `dfa` sub-module will only report
+/// `MatchError::Quit` if instructed by either
+/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
+/// or by
+/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum MatchError {
+    // Note that the first version of this type was called `SearchError` and it
+    // included a third `None` variant to indicate that the search completed
+    // and no match was found. However, this was problematic for iterator
+    // APIs where the `None` sentinel for stopping iteration corresponds
+    // precisely to the "match not found" case. The fact that the `None`
+    // variant was buried inside this type was in turn quite awkward. So
+    // instead, I removed the `None` variant, renamed the type and used
+    // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
+    // conceptually simpler `Result<Match, MatchError>`. However, we "regain"
+    // ergonomics by only putting the more complex API in the `try_` variants
+    // ("fallible") of search methods. The infallible APIs will instead just
+    // return `Option<Match>` and panic on error.
+    /// The search saw a "quit" byte at which it was instructed to stop
+    /// searching.
+    Quit {
+        /// The "quit" byte that was observed and caused the search to stop.
+        byte: u8,
+        /// The offset at which the quit byte was observed.
+ offset: usize, + }, + /// The search, based on heuristics, determined that it would be better + /// to stop, typically to provide the caller an opportunity to use an + /// alternative regex engine. + /// + /// Currently, the only way for this to occur is via the lazy DFA and + /// only when it is configured to do so (it will not return this error by + /// default). + GaveUp { + /// The offset at which the search stopped. This corresponds to the + /// position immediately following the last byte scanned. + offset: usize, + }, +} + +#[cfg(feature = "std")] +impl std::error::Error for MatchError {} + +impl core::fmt::Display for MatchError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + match *self { + MatchError::Quit { byte, offset } => write!( + f, + "quit search after observing byte \\x{:02X} at offset {}", + byte, offset, + ), + MatchError::GaveUp { offset } => { + write!(f, "gave up searching at offset {}", offset) + } + } + } +} diff --git a/vendor/regex-automata-0.2.0/src/util/mod.rs b/vendor/regex-automata-0.2.0/src/util/mod.rs new file mode 100644 index 000000000..798507da2 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/util/mod.rs @@ -0,0 +1,275 @@ +/*! +TODO +*/ + +use core::{ascii, fmt, str}; + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +pub mod alphabet; +pub(crate) mod bytes; +#[cfg(feature = "alloc")] +pub(crate) mod determinize; +pub mod id; +#[cfg(feature = "alloc")] +pub(crate) mod lazy; +pub(crate) mod matchtypes; +pub mod prefilter; +#[cfg(feature = "alloc")] +pub(crate) mod sparse_set; +pub(crate) mod start; +#[cfg(feature = "alloc")] +pub(crate) mod syntax; + +/// The offset, in bytes, that a match is delayed by in the DFAs generated by +/// this crate. (This includes lazy DFAs.) +/// +/// The purpose of this delay is to support look-ahead such as \b (ASCII-only) +/// and $. In particular, both of these operators may require the +/// identification of the end of input in order to confirm a match. Not only +/// does this mean that all matches must therefore be delayed by a single byte, +/// but that a special EOI value is added to the alphabet of all DFAs. (Which +/// means that even though the alphabet of a DFA is typically all byte values, +/// the actual maximum alphabet size is 257 due to the extra EOI value.) +/// +/// Since we delay matches by only 1 byte, this can't fully support a +/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed, +/// DFAs in this crate do not support it. (It's not as simple as just +/// increasing the match offset to do it---otherwise we would---but building +/// the full Unicode-aware word boundary detection into an automaton is quite +/// tricky.) +pub(crate) const MATCH_OFFSET: usize = 1; + +/// A type that wraps a single byte with a convenient fmt::Debug impl that +/// escapes the byte. +pub(crate) struct DebugByte(pub u8); + +impl fmt::Debug for DebugByte { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // 10 bytes is enough to cover any output from ascii::escape_default. + let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap()) + } +} + +/// Returns the smallest possible index of the next valid UTF-8 sequence +/// starting after `i`. 
+/// +/// For all inputs, including invalid UTF-8 and any value of `i`, the return +/// value is guaranteed to be greater than `i`. +/// +/// Generally speaking, this should only be called on `text` when it is +/// permitted to assume that it is valid UTF-8 and where either `i >= +/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. +#[inline(always)] +pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize { + let b = match text.get(i) { + None => return i.checked_add(1).unwrap(), + Some(&b) => b, + }; + // For cases where we see an invalid UTF-8 byte, there isn't much we can do + // other than just start at the next byte. + let inc = utf8_len(b).unwrap_or(1); + i.checked_add(inc).unwrap() +} + +/// Returns true if and only if the given byte is considered a word character. +/// This only applies to ASCII. +/// +/// This was copied from regex-syntax so that we can use it to determine the +/// starting DFA state while searching without depending on regex-syntax. The +/// definition is never going to change, so there's no maintenance/bit-rot +/// hazard here. +#[inline(always)] +pub(crate) fn is_word_byte(b: u8) -> bool { + match b { + b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, + _ => false, + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +#[inline(always)] +pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let len = match utf8_len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(bytes[0] as char)), + Some(len) => len, + }; + match str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/// Decodes the last UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the end of the given byte +/// slice, then the last byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +#[inline(always)] +pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { + return None; + } + let mut start = bytes.len() - 1; + let limit = bytes.len().saturating_sub(4); + while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) { + start -= 1; + } + match decode_utf8(&bytes[start..]) { + None => None, + Some(Ok(ch)) => Some(Ok(ch)), + Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), + } +} + +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. +#[inline(always)] +fn utf8_len(byte: u8) -> Option<usize> { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } +} + +/// Returns true if and only if the given byte is either a valid leading UTF-8 +/// byte, or is otherwise an invalid byte that can never appear anywhere in a +/// valid UTF-8 sequence. 
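+///
+/// A few illustrative cases (derived from the bit test below):
+///
+/// ```text
+/// 0x61 ('a', ASCII)           -> true
+/// 0xE2 (three-byte lead)      -> true
+/// 0x80 (continuation byte)    -> false
+/// 0xFF (never valid in UTF-8) -> true
+/// ```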
+#[inline(always)]
+fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
+    // In the ASCII case, the most significant bit is never set. The leading
+    // byte of a 2/3/4-byte sequence always has the top two most significant
+    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
+    // also returns true, since every such byte has its two most significant
+    // bits set:
+    //
+    // \xC0 :: 11000000
+    // \xC1 :: 11000001
+    // \xF5 :: 11110101
+    // \xF6 :: 11110110
+    // \xF7 :: 11110111
+    // \xF8 :: 11111000
+    // \xF9 :: 11111001
+    // \xFA :: 11111010
+    // \xFB :: 11111011
+    // \xFC :: 11111100
+    // \xFD :: 11111101
+    // \xFE :: 11111110
+    // \xFF :: 11111111
+    (b & 0b1100_0000) != 0b1000_0000
+}
+
+#[cfg(feature = "alloc")]
+#[inline(always)]
+pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
+    use core::{ptr, sync::atomic::AtomicPtr};
+
+    use crate::{
+        dfa::{
+            dense::{self, DFA},
+            Automaton,
+        },
+        util::lazy,
+    };
+
+    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
+
+    let dfa = lazy::get_or_init(&WORD, || {
+        // TODO: Should we use a lazy DFA here instead? It does complicate
+        // things somewhat, since we then need a mutable cache, which probably
+        // means a thread local.
+        dense::Builder::new()
+            .configure(dense::Config::new().anchored(true))
+            .build(r"\w")
+            .unwrap()
+    });
+    // This is OK since '\w' contains no look-around.
+    let mut sid = dfa.universal_start_state();
+    while at < bytes.len() {
+        let byte = bytes[at];
+        sid = dfa.next_state(sid, byte);
+        at += 1;
+        if dfa.is_special_state(sid) {
+            if dfa.is_match_state(sid) {
+                return true;
+            } else if dfa.is_dead_state(sid) {
+                return false;
+            }
+        }
+    }
+    dfa.is_match_state(dfa.next_eoi_state(sid))
+}
+
+#[cfg(feature = "alloc")]
+#[inline(always)]
+pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
+    use core::{ptr, sync::atomic::AtomicPtr};
+
+    use crate::{
+        dfa::{
+            dense::{self, DFA},
+            Automaton,
+        },
+        nfa::thompson::NFA,
+        util::lazy,
+    };
+
+    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
+
+    let dfa = lazy::get_or_init(&WORD, || {
+        dense::Builder::new()
+            .configure(dense::Config::new().anchored(true))
+            .thompson(NFA::config().reverse(true).shrink(true))
+            .build(r"\w")
+            .unwrap()
+    });
+
+    // This is OK since '\w' contains no look-around.
+    let mut sid = dfa.universal_start_state();
+    while at > 0 {
+        at -= 1;
+        let byte = bytes[at];
+        sid = dfa.next_state(sid, byte);
+        if dfa.is_special_state(sid) {
+            if dfa.is_match_state(sid) {
+                return true;
+            } else if dfa.is_dead_state(sid) {
+                return false;
+            }
+        }
+    }
+    dfa.is_match_state(dfa.next_eoi_state(sid))
+}
diff --git a/vendor/regex-automata-0.2.0/src/util/prefilter.rs b/vendor/regex-automata-0.2.0/src/util/prefilter.rs
new file mode 100644
index 000000000..5fe151524
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/util/prefilter.rs
@@ -0,0 +1,281 @@
+use crate::Match;
+
+/// A candidate is the result of running a prefilter on a haystack at a
+/// particular position. The result is one of no match, a confirmed match or
+/// a possible match.
+///
+/// When no match is returned, the prefilter is guaranteeing that no possible
+/// match can be found in the haystack, and the caller may trust this. That is,
+/// all correct prefilters must never report false negatives.
+///
+/// In some cases, a prefilter can confirm a match very quickly, in which case,
+/// the caller may use this to stop what it's doing and report the match. In
+/// this case, prefilter implementations must never report a false positive.
+/// In other cases, the prefilter can only report a potential match, in which
+/// case the callers must attempt to confirm the match. In this case, prefilter
+/// implementations are permitted to return false positives.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+    /// The prefilter reports that no match is possible. Prefilter
+    /// implementations will never report false negatives.
+    None,
+    /// The prefilter reports that a match has been confirmed at the provided
+    /// byte offsets. When this variant is reported, the prefilter is
+    /// guaranteeing a match. No false positives are permitted.
+    Match(Match),
+    /// The prefilter reports that a match *may* start at the given position.
+    /// When this variant is reported, it may correspond to a false positive.
+    PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+    /// Convert this candidate into an option. This is useful when callers do
+    /// not distinguish between true positives and false positives (i.e., the
+    /// caller must always confirm the match in order to update some other
+    /// state).
+    ///
+    /// The byte offset in the option returned corresponds to the starting
+    /// position of the possible match.
+    pub fn into_option(self) -> Option<usize> {
+        match self {
+            Candidate::None => None,
+            Candidate::Match(ref m) => Some(m.start()),
+            Candidate::PossibleStartOfMatch(start) => Some(start),
+        }
+    }
+}
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+pub trait Prefilter: core::fmt::Debug {
+    /// Returns the next possible match candidate. This may yield false
+    /// positives, so callers must confirm a match starting at the position
+    /// returned. This, however, must never produce false negatives. That is,
+    /// this must, at minimum, return the starting position of the next match
+    /// in the given haystack after or at the given position.
+    fn next_candidate(
+        &self,
+        state: &mut State,
+        haystack: &[u8],
+        at: usize,
+    ) -> Candidate;
+
+    /// Returns the approximate total amount of heap used by this prefilter, in
+    /// units of bytes.
+    fn heap_bytes(&self) -> usize;
+
+    /// Returns true if and only if this prefilter may return false positives
+    /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
+    /// when false positives are not possible (in which case, implementations
+    /// should return false), which may allow completely avoiding heavier regex
+    /// machinery when the prefilter can quickly confirm its own matches.
+    ///
+    /// By default, this returns true, which is conservative; it is always
+    /// correct to return `true`. Returning `false` here and reporting a false
+    /// positive will result in incorrect searches.
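+    ///
+    /// As a sketch, a hypothetical prefilter that only ever reports
+    /// `Candidate::Match` (and never `PossibleStartOfMatch`) could override
+    /// this like so:
+    ///
+    /// ```ignore
+    /// fn reports_false_positives(&self) -> bool {
+    ///     // Every candidate reported is a confirmed match.
+    ///     false
+    /// }
+    /// ```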
+    fn reports_false_positives(&self) -> bool {
+        true
+    }
+}
+
+impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
+    #[inline]
+    fn next_candidate(
+        &self,
+        state: &mut State,
+        haystack: &[u8],
+        at: usize,
+    ) -> Candidate {
+        (**self).next_candidate(state, haystack, at)
+    }
+
+    fn heap_bytes(&self) -> usize {
+        (**self).heap_bytes()
+    }
+
+    fn reports_false_positives(&self) -> bool {
+        (**self).reports_false_positives()
+    }
+}
+
+#[derive(Clone)]
+pub struct Scanner<'p> {
+    prefilter: &'p dyn Prefilter,
+    state: State,
+}
+
+impl<'p> Scanner<'p> {
+    pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> {
+        Scanner { prefilter, state: State::new() }
+    }
+
+    pub(crate) fn is_effective(&mut self, at: usize) -> bool {
+        self.state.is_effective(at)
+    }
+
+    pub(crate) fn reports_false_positives(&self) -> bool {
+        self.prefilter.reports_false_positives()
+    }
+
+    pub(crate) fn next_candidate(
+        &mut self,
+        bytes: &[u8],
+        at: usize,
+    ) -> Candidate {
+        let cand = self.prefilter.next_candidate(&mut self.state, bytes, at);
+        match cand {
+            Candidate::None => {
+                self.state.update_skipped_bytes(bytes.len() - at);
+            }
+            Candidate::Match(ref m) => {
+                self.state.update_skipped_bytes(m.start() - at);
+            }
+            Candidate::PossibleStartOfMatch(i) => {
+                self.state.update_skipped_bytes(i - at);
+            }
+        }
+        cand
+    }
+}
+
+impl<'p> core::fmt::Debug for Scanner<'p> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_struct("Scanner").field("state", &self.state).finish()
+    }
+}
+
+/// State tracks the observed effectiveness of a prefilter. It is used to
+/// track how many bytes, on average, are skipped by the prefilter. If this
+/// average dips below a certain threshold over time, then the state renders
+/// the prefilter inert and stops using it.
+///
+/// A prefilter state should be created for each search. (Where creating an
+/// iterator via, e.g., `find_iter`, is treated as a single search.)
+#[derive(Clone, Debug)]
+pub struct State {
+    /// The number of skips that have been executed.
+    skips: usize,
+    /// The total number of bytes that have been skipped.
+    skipped: usize,
+    /// Once this heuristic has been deemed permanently ineffective, it will be
+    /// inert throughout the rest of its lifetime. This serves as a cheap way
+    /// to check inertness.
+    inert: bool,
+    /// The last (absolute) position at which a prefilter scanned to.
+    /// Prefilters can use this position to determine whether to re-scan or
+    /// not.
+    ///
+    /// Unlike other things that impact effectiveness, this is a fleeting
+    /// condition. That is, a prefilter can be considered ineffective if it is
+    /// at a position before `last_scan_at`, but can become effective again
+    /// once the search moves past `last_scan_at`.
+    ///
+    /// The utility of this is to both avoid additional overhead from calling
+    /// the prefilter and to avoid quadratic behavior. This ensures that a
+    /// prefilter will scan any particular byte at most once. (Note that some
+    /// prefilters, like the start-byte prefilter, do not need to use this
+    /// field at all, since it only looks for starting bytes.)
+    last_scan_at: usize,
+}
+
+impl State {
+    /// The minimum number of skip attempts to try before considering whether
+    /// a prefilter is effective or not.
+    const MIN_SKIPS: usize = 40;
+
+    /// The minimum number of bytes that skipping must average.
+    ///
+    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
+    /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
+    /// rendered inert.
+    const MIN_AVG_SKIP: usize = 16;
+
+    /// Create a fresh prefilter state.
+    pub fn new() -> State {
+        State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 }
+    }
+
+    /// Updates the position at which the last scan stopped. This may be
+    /// greater than the position of the last candidate reported. For example,
+    /// searching for the byte `z` in `abczdef` for the pattern `abcz` will
+    /// report a candidate at position `0`, but the end of its last scan will
+    /// be at position `3`.
+    ///
+    /// This position factors into the effectiveness of this prefilter. If the
+    /// current position is less than the last position at which a scan ended,
+    /// then the prefilter should not be re-run until the search moves past
+    /// that position.
+    ///
+    /// It is always correct to never update the last scan position. In fact,
+    /// it is also always correct to set the last scan position to an arbitrary
+    /// value. The key is setting it to a position in the future at which it
+    /// makes sense to restart the prefilter.
+    pub fn update_last_scan(&mut self, at: usize) {
+        if at > self.last_scan_at {
+            self.last_scan_at = at;
+        }
+    }
+
+    /// Return true if and only if this state indicates that a prefilter is
+    /// still effective. If the prefilter is not effective, then this state
+    /// is rendered "inert," at which point all subsequent calls to
+    /// `is_effective` on this state will return `false`.
+    ///
+    /// `at` should correspond to the current starting position of the search.
+    ///
+    /// Callers typically do not need to use this directly. Instead, it is
+    /// used via `Scanner::is_effective`.
+    fn is_effective(&mut self, at: usize) -> bool {
+        if self.inert {
+            return false;
+        }
+        if at < self.last_scan_at {
+            return false;
+        }
+        if self.skips < State::MIN_SKIPS {
+            return true;
+        }
+
+        if self.skipped >= State::MIN_AVG_SKIP * self.skips {
+            return true;
+        }
+
+        // We're inert.
+        self.inert = true;
+        false
+    }
+
+    /// Update this state with the number of bytes skipped on the last
+    /// invocation of the prefilter.
+    fn update_skipped_bytes(&mut self, skipped: usize) {
+        self.skips += 1;
+        self.skipped += skipped;
+    }
+}
+
+/// A `Prefilter` implementation that reports a possible match at every
+/// position.
+///
+/// This should generally not be used as an actual prefilter. It is only
+/// useful when one needs to represent the absence of a prefilter in a generic
+/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
+/// uses this prefilter by default to indicate that no prefilter should be
+/// used.
+///
+/// A `None` prefilter value cannot be constructed.
+#[derive(Clone, Debug)]
+pub struct None {
+    _priv: (),
+}
+
+impl Prefilter for None {
+    fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
+        Candidate::PossibleStartOfMatch(at)
+    }
+
+    fn heap_bytes(&self) -> usize {
+        0
+    }
+}
diff --git a/vendor/regex-automata-0.2.0/src/util/sparse_set.rs b/vendor/regex-automata-0.2.0/src/util/sparse_set.rs
new file mode 100644
index 000000000..bf59e4469
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/util/sparse_set.rs
@@ -0,0 +1,229 @@
+use alloc::{boxed::Box, vec, vec::Vec};
+
+use crate::util::id::StateID;
+
+/// A pair of sparse sets.
+/// +/// This is useful when one needs to compute NFA epsilon closures from a +/// previous set of states derived from an epsilon closure. One set can be the +/// starting states whereas the other set can be the destination states after +/// following the transitions for a particular byte of input. +/// +/// There is no significance to 'set1' or 'set2'. They are both sparse sets of +/// the same size. +/// +/// The members of this struct are exposed so that callers may borrow 'set1' +/// and 'set2' individually without being forced to borrow both at the same +/// time. +#[derive(Clone, Debug)] +pub(crate) struct SparseSets { + pub(crate) set1: SparseSet, + pub(crate) set2: SparseSet, +} + +impl SparseSets { + /// Create a new pair of sparse sets where each set has the given capacity. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + pub(crate) fn new(capacity: usize) -> SparseSets { + SparseSets { + set1: SparseSet::new(capacity), + set2: SparseSet::new(capacity), + } + } + + /// Resizes these sparse sets to have the new capacity given. + /// + /// The sets are automatically cleared. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + #[inline] + pub(crate) fn resize(&mut self, new_capacity: usize) { + self.set1.resize(new_capacity); + self.set2.resize(new_capacity); + } + + /// Clear both sparse sets. + pub(crate) fn clear(&mut self) { + self.set1.clear(); + self.set2.clear(); + } + + /// Swap set1 with set2. + pub(crate) fn swap(&mut self) { + core::mem::swap(&mut self.set1, &mut self.set2); + } + + /// Returns the memory usage, in bytes, used by this pair of sparse sets. + pub(crate) fn memory_usage(&self) -> usize { + self.set1.memory_usage() + self.set2.memory_usage() + } +} + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. +/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse sparse sets, so the initial allocation cost is bearable. However, its +/// other properties listed above are extremely useful. +#[derive(Clone)] +pub(crate) struct SparseSet { + /// The number of elements currently in this set. + len: usize, + /// Dense contains the ids in the order in which they were inserted. + dense: Vec<StateID>, + /// Sparse maps ids to their location in dense. + /// + /// A state ID is in the set if and only if + /// sparse[id] < len && id == dense[sparse[id]]. + sparse: Vec<StateID>, +} + +impl SparseSet { + /// Create a new sparse set with the given capacity. + /// + /// Sparse sets have a fixed size and they cannot grow. Attempting to + /// insert more distinct elements than the total capacity of the set will + /// result in a panic. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`. + #[inline] + pub(crate) fn new(capacity: usize) -> SparseSet { + let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; + set.resize(capacity); + set + } + + /// Resizes this sparse set to have the new capacity given. + /// + /// This set is automatically cleared. + /// + /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+ #[inline] + pub(crate) fn resize(&mut self, new_capacity: usize) { + assert!( + new_capacity <= StateID::LIMIT, + "sparse set capacity cannot exceed {:?}", + StateID::LIMIT + ); + self.clear(); + self.dense.resize(new_capacity, StateID::ZERO); + self.sparse.resize(new_capacity, StateID::ZERO); + } + + /// Returns the capacity of this set. + /// + /// The capacity represents a fixed limit on the number of distinct + /// elements that are allowed in this set. The capacity cannot be changed. + #[inline] + pub(crate) fn capacity(&self) -> usize { + self.dense.len() + } + + /// Returns the number of elements in this set. + #[inline] + pub(crate) fn len(&self) -> usize { + self.len + } + + /// Returns true if and only if this set is empty. + #[inline] + pub(crate) fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Insert the state ID value into this set and return true if the given + /// state ID was not previously in this set. + /// + /// This operation is idempotent. If the given value is already in this + /// set, then this is a no-op. + /// + /// If more than `capacity` ids are inserted, then this panics. + /// + /// This is marked as inline(always) since the compiler won't inline it + /// otherwise, and it's a fairly hot piece of code in DFA determinization. + #[inline(always)] + pub(crate) fn insert(&mut self, value: StateID) -> bool { + if self.contains(value) { + return false; + } + + let i = self.len(); + assert!( + i < self.capacity(), + "{:?} exceeds capacity of {:?} when inserting {:?}", + i, + self.capacity(), + value, + ); + // OK since i < self.capacity() and self.capacity() is guaranteed to + // be <= StateID::LIMIT. + let id = StateID::new_unchecked(i); + self.dense[id] = value; + self.sparse[value] = id; + self.len += 1; + true + } + + /// Returns true if and only if this set contains the given value. + #[inline] + pub(crate) fn contains(&self, value: StateID) -> bool { + let i = self.sparse[value]; + i.as_usize() < self.len() && self.dense[i] == value + } + + /// Returns the ith inserted element from this set. + /// + /// Panics when i >= self.len(). + #[inline] + pub(crate) fn get(&self, i: usize) -> StateID { + self.dense[i] + } + + /// Clear this set such that it has no members. + #[inline] + pub(crate) fn clear(&mut self) { + self.len = 0; + } + + /// Returns the heap memory usage, in bytes, used by this sparse set. + #[inline] + pub(crate) fn memory_usage(&self) -> usize { + 2 * self.dense.len() * StateID::SIZE + } +} + +impl core::fmt::Debug for SparseSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let elements: Vec<StateID> = self.into_iter().collect(); + f.debug_tuple("SparseSet").field(&elements).finish() + } +} + +/// An iterator over all elements in a sparse set. +/// +/// The lifetime `'a` refers to the lifetime of the set being iterated over.
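+/// +/// A small sketch of the insertion-order guarantee (hypothetical, since +/// `SparseSet` is crate-internal; the indices used here are known to be in +/// range, which is why `StateID::new_unchecked` is appropriate): +/// +/// ```ignore +/// let mut set = SparseSet::new(10); +/// set.insert(StateID::new_unchecked(7)); +/// set.insert(StateID::new_unchecked(3)); +/// set.insert(StateID::new_unchecked(7)); // no-op: already present +/// // Iteration yields 7 then 3: insertion order, not sorted order. +/// let ids: Vec<StateID> = (&set).into_iter().collect(); +/// ```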
+#[derive(Debug)] +pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); + +impl<'a> IntoIterator for &'a SparseSet { + type Item = StateID; + type IntoIter = SparseSetIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + SparseSetIter(self.dense[..self.len()].iter()) + } +} + +impl<'a> Iterator for SparseSetIter<'a> { + type Item = StateID; + + #[inline(always)] + fn next(&mut self) -> Option<StateID> { + self.0.next().map(|value| *value) + } +} diff --git a/vendor/regex-automata-0.2.0/src/util/start.rs b/vendor/regex-automata-0.2.0/src/util/start.rs new file mode 100644 index 000000000..3c756fc26 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/util/start.rs @@ -0,0 +1,109 @@ +/// Represents the four possible starting configurations of a DFA search. +/// +/// The starting configuration is determined by inspecting the beginning of +/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if +/// specified) is what selects the start state to use in a DFA. +/// +/// A DFA that doesn't have starting states for each pattern will have a +/// maximum of four DFA start states. If the DFA was compiled with start +/// states for each pattern, then it will have a maximum of four DFA start +/// states for searching for any pattern, and then another maximum of four DFA +/// start states for executing an anchored search for each pattern. +/// +/// This ends up being represented as a table in the DFA (whether lazy or fully +/// built) where the stride of that table is 4, and each entry is an index into +/// the state transition table. Note though that multiple entries in the table +/// might point to the same state if the states would otherwise be equivalent. +/// (This is guaranteed by DFA minimization and may even be accomplished by +/// normal determinization, since it attempts to reuse equivalent states too.) +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum Start { + /// This occurs when the starting position is not any of the ones below. + NonWordByte = 0, + /// This occurs when the byte immediately preceding the start of the search + /// is an ASCII word byte. + WordByte = 1, + /// This occurs when the starting position of the search corresponds to the + /// beginning of the haystack. + Text = 2, + /// This occurs when the byte immediately preceding the start of the search + /// is a line terminator. Specifically, `\n`. + Line = 3, +} + +impl Start { + /// Return the starting state corresponding to the given integer. If no + /// starting state exists for the given integer, then None is returned. + pub(crate) fn from_usize(n: usize) -> Option<Start> { + match n { + 0 => Some(Start::NonWordByte), + 1 => Some(Start::WordByte), + 2 => Some(Start::Text), + 3 => Some(Start::Line), + _ => None, + } + } + + /// Returns the total number of starting state configurations. + pub(crate) fn count() -> usize { + 4 + } + + /// Returns the starting state configuration for the given search + /// parameters. If the given offset range is not valid, then this panics.
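+ /// + /// For example (a sketch; this type is crate-internal, and the expected + /// values follow directly from the rules in the body below): + /// + /// ```ignore + /// let haystack = b"foo\nbar"; + /// // A search starting at the beginning of the haystack: + /// assert_eq!(Start::Text, Start::from_position_fwd(haystack, 0, 7)); + /// // A search starting immediately after the line terminator: + /// assert_eq!(Start::Line, Start::from_position_fwd(haystack, 4, 7)); + /// // A search starting immediately after the ASCII word byte b'b': + /// assert_eq!(Start::WordByte, Start::from_position_fwd(haystack, 5, 7)); + /// ```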
+ #[inline(always)] + pub(crate) fn from_position_fwd( + bytes: &[u8], + start: usize, + end: usize, + ) -> Start { + assert!( + bytes.get(start..end).is_some(), + "{}..{} is invalid", + start, + end + ); + if start == 0 { + Start::Text + } else if bytes[start - 1] == b'\n' { + Start::Line + } else if crate::util::is_word_byte(bytes[start - 1]) { + Start::WordByte + } else { + Start::NonWordByte + } + } + + /// Returns the starting state configuration for a reverse search with the + /// given search parameters. If the given offset range is not valid, then + /// this panics. + #[inline(always)] + pub(crate) fn from_position_rev( + bytes: &[u8], + start: usize, + end: usize, + ) -> Start { + assert!( + bytes.get(start..end).is_some(), + "{}..{} is invalid", + start, + end + ); + if end == bytes.len() { + Start::Text + } else if bytes[end] == b'\n' { + Start::Line + } else if crate::util::is_word_byte(bytes[end]) { + Start::WordByte + } else { + Start::NonWordByte + } + } + + /// Return this starting configuration as an integer. It is guaranteed to + /// be less than `Start::count()`. + #[inline(always)] + pub(crate) fn as_usize(&self) -> usize { + *self as usize + } +} diff --git a/vendor/regex-automata-0.2.0/src/util/syntax.rs b/vendor/regex-automata-0.2.0/src/util/syntax.rs new file mode 100644 index 000000000..88beeee75 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/util/syntax.rs @@ -0,0 +1,272 @@ +use regex_syntax::ParserBuilder; + +/// A common set of configuration options that apply to the syntax of a regex. +/// +/// This represents a group of configuration options that specifically apply +/// to how the concrete syntax of a regular expression is interpreted. In +/// particular, they are generally forwarded to the +/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html) +/// in the +/// [`regex-syntax`](https://docs.rs/regex-syntax) +/// crate when building a regex from its concrete syntax directly. +/// +/// These options are defined as a group since they apply to every regex engine +/// in this crate. Instead of re-defining them on every engine's builder, they +/// are instead provided here as one cohesive unit. +#[derive(Clone, Copy, Debug)] +pub struct SyntaxConfig { + case_insensitive: bool, + multi_line: bool, + dot_matches_new_line: bool, + swap_greed: bool, + ignore_whitespace: bool, + unicode: bool, + utf8: bool, + nest_limit: u32, + octal: bool, +} + +impl SyntaxConfig { + /// Return a new default syntax configuration. + pub fn new() -> SyntaxConfig { + // These defaults match the ones used in regex-syntax. + SyntaxConfig { + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + utf8: true, + nest_limit: 250, + octal: false, + } + } + + /// Enable or disable the case insensitive flag by default. + /// + /// When Unicode mode is enabled, case insensitivity is Unicode-aware. + /// Specifically, it will apply the "simple" case folding rules as + /// specified by Unicode. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig { + self.case_insensitive = yes; + self + } + + /// Enable or disable the multi-line matching flag by default. 
+ /// + /// When this is enabled, the `^` and `$` look-around assertions will + /// match immediately after and immediately before a new line character, + /// respectively. Note that the `\A` and `\z` look-around assertions are + /// unaffected by this setting and always correspond to matching at the + /// beginning and end of the input. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(mut self, yes: bool) -> SyntaxConfig { + self.multi_line = yes; + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// When this is enabled, `.` will match any character. When it's disabled, + /// then `.` will match any character except for a new line character. + /// + /// Note that `.` is impacted by whether the "unicode" setting is enabled + /// or not. When Unicode is enabled (the default), `.` will match any UTF-8 + /// encoding of any Unicode scalar value (sans a new line, depending on + /// whether this "dot matches new line" option is enabled). When Unicode + /// mode is disabled, `.` will match any byte instead. Because of this, + /// when Unicode mode is disabled, `.` can only be used when the "allow + /// invalid UTF-8" option is enabled, since `.` could otherwise match + /// invalid UTF-8. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig { + self.dot_matches_new_line = yes; + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` + /// will become greedy. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig { + self.swap_greed = yes; + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig { + self.ignore_whitespace = yes; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + /// + /// **WARNING**: Unicode mode can greatly increase the size of the compiled + /// DFA, which can noticeably impact both memory usage and compilation + /// time. This is especially noticeable if your regex contains character + /// classes like `\w` that are impacted by whether Unicode is enabled or + /// not. If Unicode is not necessary, you are encouraged to disable it.
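+ /// + /// A short builder sketch (hedged; it only uses the methods defined on + /// `SyntaxConfig` in this module): + /// + /// ```ignore + /// let syntax = SyntaxConfig::new() + ///     .unicode(false) + ///     .utf8(false); // permit matching arbitrary bytes + /// assert!(!syntax.get_unicode()); + /// assert!(!syntax.get_utf8()); + /// ```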
+ pub fn unicode(mut self, yes: bool) -> SyntaxConfig { + self.unicode = yes; + self + } + + /// When disabled, the builder will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// For example, when [`SyntaxConfig::unicode`] is disabled, then + /// expressions like `[^a]` may match invalid UTF-8 since they can match + /// any single byte that is not `a`. By default, these sub-expressions + /// are disallowed to avoid returning offsets that split a UTF-8 + /// encoded codepoint. However, in cases where matching at arbitrary + /// locations is desired, this option can be disabled to permit all such + /// sub-expressions. + /// + /// When enabled (the default), the builder is guaranteed to produce a + /// regex that will only ever match valid UTF-8 (otherwise, the builder + /// will return an error). + pub fn utf8(mut self, yes: bool) -> SyntaxConfig { + self.utf8 = yes; + self + } + + /// Set the nesting limit used for the regular expression parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow when building a finite automaton from a regular expression's + /// abstract syntax tree. In particular, construction currently uses + /// recursion. In the future, the implementation may stop using recursion + /// and this option will no longer be necessary. + /// + /// This limit is not checked until the entire AST is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since the parser will + /// limit itself to heap space proportional to the length of the pattern + /// string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation AST item, which results + /// in a nest depth of `1`. In general, a nest limit is not something that + /// manifests in an obvious way in the concrete syntax, therefore, it + /// should not be used in a granular way. + pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\1` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(mut self, yes: bool) -> SyntaxConfig { + self.octal = yes; + self + } + + /// Returns whether "unicode" mode is enabled. + pub fn get_unicode(&self) -> bool { + self.unicode + } + + /// Returns whether "case insensitive" mode is enabled.
+ pub fn get_case_insensitive(&self) -> bool { + self.case_insensitive + } + + /// Returns whether "multi line" mode is enabled. + pub fn get_multi_line(&self) -> bool { + self.multi_line + } + + /// Returns whether "dot matches new line" mode is enabled. + pub fn get_dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line + } + + /// Returns whether "swap greed" mode is enabled. + pub fn get_swap_greed(&self) -> bool { + self.swap_greed + } + + /// Returns whether "ignore whitespace" mode is enabled. + pub fn get_ignore_whitespace(&self) -> bool { + self.ignore_whitespace + } + + /// Returns whether UTF-8 mode is enabled. + pub fn get_utf8(&self) -> bool { + self.utf8 + } + + /// Returns the "nest limit" setting. + pub fn get_nest_limit(&self) -> u32 { + self.nest_limit + } + + /// Returns whether "octal" mode is enabled. + pub fn get_octal(&self) -> bool { + self.octal + } + + /// Applies this configuration to the given parser. + pub(crate) fn apply(&self, builder: &mut ParserBuilder) { + builder + .unicode(self.unicode) + .case_insensitive(self.case_insensitive) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .swap_greed(self.swap_greed) + .ignore_whitespace(self.ignore_whitespace) + .allow_invalid_utf8(!self.utf8) + .nest_limit(self.nest_limit) + .octal(self.octal); + } +} + +impl Default for SyntaxConfig { + fn default() -> SyntaxConfig { + SyntaxConfig::new() + } +} diff --git a/vendor/regex-automata-0.2.0/tests/data/bytes.toml b/vendor/regex-automata-0.2.0/tests/data/bytes.toml new file mode 100644 index 000000000..eb3a0942e --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/bytes.toml @@ -0,0 +1,235 @@ +# These are tests specifically crafted for regexes that can match arbitrary +# bytes. In some cases, we test the Unicode variant as well, just because +# it's good sense to do so. Ultimately, though, these tests aren't really +# about Unicode, but about whether matches are only reported at valid UTF-8 +# boundaries. For most tests in this entire collection, utf8 = true. But for +# these tests, we use utf8 = false. + +[[tests]] +name = "word-boundary-ascii" +regex = ' \b' +input = " δ" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "word-boundary-unicode" +regex = ' \b' +input = " δ" +matches = [[0, 1]] +unicode = true +utf8 = false + +[[tests]] +name = "word-boundary-ascii-not" +regex = ' \B' +input = " δ" +matches = [[0, 1]] +unicode = false +utf8 = false + +[[tests]] +name = "word-boundary-unicode-not" +regex = ' \B' +input = " δ" +matches = [] +unicode = true +utf8 = false + +[[tests]] +name = "perl-word-ascii" +regex = '\w+' +input = "aδ" +matches = [[0, 1]] +unicode = false +utf8 = false + +[[tests]] +name = "perl-word-unicode" +regex = '\w+' +input = "aδ" +matches = [[0, 3]] +unicode = true +utf8 = false + +[[tests]] +name = "perl-decimal-ascii" +regex = '\d+' +input = "1२३9" +matches = [[0, 1], [7, 8]] +unicode = false +utf8 = false + +[[tests]] +name = "perl-decimal-unicode" +regex = '\d+' +input = "1२३9" +matches = [[0, 8]] +unicode = true +utf8 = false + +[[tests]] +name = "perl-whitespace-ascii" +regex = '\s+' +input = " \u1680" +matches = [[0, 1]] +unicode = false +utf8 = false + +[[tests]] +name = "perl-whitespace-unicode" +regex = '\s+' +input = " \u1680" +matches = [[0, 4]] +unicode = true +utf8 = false + +# The first `(.+)` matches two Unicode codepoints, but can't match the 5th +# byte, which isn't valid UTF-8. The second (byte-based) `(.+)` takes over and +# matches.
+[[tests]] +name = "mixed-dot" +regex = '(.+)(?-u)(.+)' +input = '\xCE\x93\xCE\x94\xFF' +captures = [ + [[0, 5], [0, 4], [4, 5]], +] +unescape = true +unicode = true +utf8 = false + +[[tests]] +name = "case-one-ascii" +regex = 'a' +input = "A" +matches = [[0, 1]] +case_insensitive = true +unicode = false +utf8 = false + +[[tests]] +name = "case-one-unicode" +regex = 'a' +input = "A" +matches = [[0, 1]] +case_insensitive = true +unicode = true +utf8 = false + +[[tests]] +name = "case-class-simple-ascii" +regex = '[a-z]+' +input = "AaAaA" +matches = [[0, 5]] +case_insensitive = true +unicode = false +utf8 = false + +[[tests]] +name = "case-class-ascii" +regex = '[a-z]+' +input = "aA\u212AaA" +matches = [[0, 2], [5, 7]] +case_insensitive = true +unicode = false +utf8 = false + +[[tests]] +name = "case-class-unicode" +regex = '[a-z]+' +input = "aA\u212AaA" +matches = [[0, 7]] +case_insensitive = true +unicode = true +utf8 = false + +[[tests]] +name = "negate-ascii" +regex = '[^a]' +input = "δ" +matches = [[0, 1], [1, 2]] +unicode = false +utf8 = false + +[[tests]] +name = "negate-unicode" +regex = '[^a]' +input = "δ" +matches = [[0, 2]] +unicode = true +utf8 = false + +# When utf8=true, this won't match, because the implicit '.*?' prefix is +# Unicode aware and will refuse to match through invalid UTF-8 bytes. +[[tests]] +name = "dotstar-prefix-ascii" +regex = 'a' +input = '\xFFa' +matches = [[1, 2]] +unescape = true +unicode = false +utf8 = false + +[[tests]] +name = "dotstar-prefix-unicode" +regex = 'a' +input = '\xFFa' +matches = [[1, 2]] +unescape = true +unicode = true +utf8 = false + +[[tests]] +name = "null-bytes" +regex = '(?P<cstr>[^\x00]+)\x00' +input = 'foo\x00' +captures = [ + [[0, 4], [0, 3]], +] +unescape = true +unicode = false +utf8 = false + +[[tests]] +name = "invalid-utf8-anchor-100" +regex = '\xCC?^' +input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' +matches = [[0, 0]] +unescape = true +unicode = false +utf8 = false + +[[tests]] +name = "invalid-utf8-anchor-200" +regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$' +input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' +matches = [[22, 22]] +unescape = true +unicode = false +utf8 = false + +[[tests]] +name = "invalid-utf8-anchor-300" +regex = '^|ddp\xff\xffdddddlQd@\x80' +input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4' +matches = [[0, 0]] +unescape = true +unicode = false +utf8 = false + +[[tests]] +name = "word-boundary-ascii-100" +regex = '\Bx\B' +input = "áxβ" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "word-boundary-ascii-200" +regex = '\B' +input = "0\U0007EF5E" +matches = [[2, 2], [3, 3], [4, 4], [5, 5]] +unicode = false +utf8 = false diff --git a/vendor/regex-automata-0.2.0/tests/data/crazy.toml b/vendor/regex-automata-0.2.0/tests/data/crazy.toml new file mode 100644 index 000000000..549b86cca --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/crazy.toml @@ -0,0 +1,302 @@ +# TODO: There are still a couple of manually written tests in crazy.rs. 
+ +[[tests]] +name = "ranges" +regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b' +input = "num: 255" +matches = [[5, 8]] + +[[tests]] +name = "ranges-not" +regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b' +input = "num: 256" +matches = [] + +[[tests]] +name = "float1" +regex = '[-+]?[0-9]*\.?[0-9]+' +input = "0.1" +matches = [[0, 3]] + +[[tests]] +name = "float2" +regex = '[-+]?[0-9]*\.?[0-9]+' +input = "0.1.2" +matches = [[0, 3]] +match_limit = 1 + +[[tests]] +name = "float3" +regex = '[-+]?[0-9]*\.?[0-9]+' +input = "a1.2" +matches = [[1, 4]] + +[[tests]] +name = "float4" +regex = '[-+]?[0-9]*\.?[0-9]+' +input = "1.a" +matches = [[0, 1]] + +[[tests]] +name = "float5" +regex = '^[-+]?[0-9]*\.?[0-9]+$' +input = "1.a" +matches = [] + +[[tests]] +name = "email" +regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b' +input = "mine is jam.slam@gmail.com " +matches = [[8, 26]] + +[[tests]] +name = "email-not" +regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b' +input = "mine is jam.slam@gmail " +matches = [] + +[[tests]] +name = "email-big" +regex = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''' +input = "mine is jam.slam@gmail.com " +matches = [[8, 26]] + +[[tests]] +name = "date1" +regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' +input = "1900-01-01" +matches = [[0, 10]] + +[[tests]] +name = "date2" +regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' +input = "1900-00-01" +matches = [] + +[[tests]] +name = "date3" +regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$' +input = "1900-13-01" +matches = [] + +[[tests]] +name = "start-end-empty" +regex = '^$' +input = "" +matches = [[0, 0]] + +[[tests]] +name = "start-end-empty-rev" +regex = '$^' +input = "" +matches = [[0, 0]] + +[[tests]] +name = "start-end-empty-many-1" +regex = '^$^$^$' +input = "" +matches = [[0, 0]] + +[[tests]] +name = "start-end-empty-many-2" +regex = '^^^$$$' +input = "" +matches = [[0, 0]] + +[[tests]] +name = "start-end-empty-rep" +regex = '(?:^$)*' +input = "a\nb\nc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] + +[[tests]] +name = "start-end-empty-rep-rev" +regex = '(?:$^)*' +input = "a\nb\nc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] + +[[tests]] +name = "neg-class-letter" +regex = '[^ac]' +input = "acx" +matches = [[2, 3]] + +[[tests]] +name = "neg-class-letter-comma" +regex = '[^a,]' +input = "a,x" +matches = [[2, 3]] + +[[tests]] +name = "neg-class-letter-space" +regex = '[^a[:space:]]' +input = "a x" +matches = [[2, 3]] + +[[tests]] +name = "neg-class-comma" +regex = '[^,]' +input = ",,x" +matches = [[2, 3]] + +[[tests]] +name = "neg-class-space" +regex = '[^[:space:]]' +input = " a" +matches = [[1, 2]] + +[[tests]] +name = "neg-class-space-comma" +regex = '[^,[:space:]]' +input = ", a" +matches = [[2, 3]] + +[[tests]] +name = "neg-class-comma-space" +regex = '[^[:space:],]' +input = " ,a" +matches = [[2, 3]] + +[[tests]] +name = "neg-class-ascii" +regex = '[^[:alpha:]Z]' +input = "A1" +matches = [[1, 2]] + +[[tests]] +name = "lazy-many-many" +regex = '((?:.*)*?)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "lazy-many-optional" +regex = '((?:.?)*?)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "lazy-one-many-many" +regex = '((?:.*)+?)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "lazy-one-many-optional" +regex 
= '((?:.?)+?)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "lazy-range-min-many" +regex = '((?:.*){1,}?)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "lazy-range-many" +regex = '((?:.*){1,2}?)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "greedy-many-many" +regex = '((?:.*)*)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "greedy-many-optional" +regex = '((?:.?)*)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "greedy-one-many-many" +regex = '((?:.*)+)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "greedy-one-many-optional" +regex = '((?:.?)+)=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "greedy-range-min-many" +regex = '((?:.*){1,})=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "greedy-range-many" +regex = '((?:.*){1,2})=' +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "empty1" +regex = '' +input = "" +matches = [[0, 0]] + +[[tests]] +name = "empty2" +regex = '' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty3" +regex = '()' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty4" +regex = '()*' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty5" +regex = '()+' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty6" +regex = '()?' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty7" +regex = '()()' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty8" +regex = '()+|z' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty9" +regex = 'z|()+' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty10" +regex = '()+|b' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty11" +regex = 'b|()+' +input = "abc" +matches = [[0, 0], [1, 2], [3, 3]] diff --git a/vendor/regex-automata-0.2.0/tests/data/earliest.toml b/vendor/regex-automata-0.2.0/tests/data/earliest.toml new file mode 100644 index 000000000..6714a850b --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/earliest.toml @@ -0,0 +1,48 @@ +[[tests]] +name = "no-greedy-100" +regex = 'a+' +input = "aaa" +matches = [[0, 1], [1, 2], [2, 3]] +search_kind = "earliest" + +[[tests]] +name = "no-greedy-200" +regex = 'abc+' +input = "zzzabccc" +matches = [[3, 6]] +search_kind = "earliest" + +[[tests]] +name = "is-ungreedy" +regex = 'a+?' 
+input = "aaa" +matches = [[0, 1], [1, 2], [2, 3]] +search_kind = "earliest" + +[[tests]] +name = "look-start-test" +regex = '^(abc|a)' +input = "abc" +matches = [[0, 1]] +search_kind = "earliest" + +[[tests]] +name = "look-end-test" +regex = '(abc|a)$' +input = "abc" +matches = [[0, 3]] +search_kind = "earliest" + +[[tests]] +name = "no-leftmost-first-100" +regex = 'abc|a' +input = "abc" +matches = [[0, 1]] +search_kind = "earliest" + +[[tests]] +name = "no-leftmost-first-200" +regex = 'aba|a' +input = "aba" +matches = [[0, 1], [2, 3]] +search_kind = "earliest" diff --git a/vendor/regex-automata-0.2.0/tests/data/empty.toml b/vendor/regex-automata-0.2.0/tests/data/empty.toml new file mode 100644 index 000000000..ad703e601 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/empty.toml @@ -0,0 +1,113 @@ +[[tests]] +name = "100" +regex = "|b" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "110" +regex = "b|" +input = "abc" +matches = [[0, 0], [1, 2], [3, 3]] + +[[tests]] +name = "120" +regex = "|z" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "130" +regex = "z|" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "200" +regex = "|" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "210" +regex = "||" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "220" +regex = "||b" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "230" +regex = "b||" +input = "abc" +matches = [[0, 0], [1, 2], [3, 3]] + +[[tests]] +name = "240" +regex = "||z" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "300" +regex = "(?:)|b" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "310" +regex = "b|(?:)" +input = "abc" +matches = [[0, 0], [1, 2], [3, 3]] + +[[tests]] +name = "320" +regex = "(?:|)" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "330" +regex = "(?:|)|z" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "400" +regex = "a(?:)|b" +input = "abc" +matches = [[0, 1], [1, 2]] + +[[tests]] +name = "500" +regex = "" +input = "" +matches = [[0, 0]] + +[[tests]] +name = "510" +regex = "" +input = "a" +matches = [[0, 0], [1, 1]] + +[[tests]] +name = "520" +regex = "" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "600" +regex = '(|a)*' +input = "aaa" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "610" +regex = '(|a)+' +input = "aaa" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] diff --git a/vendor/regex-automata-0.2.0/tests/data/expensive.toml b/vendor/regex-automata-0.2.0/tests/data/expensive.toml new file mode 100644 index 000000000..e062e3902 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/expensive.toml @@ -0,0 +1,12 @@ +# These represent tests that may be expensive to run on some regex engines. For +# example, tests that build a full DFA ahead of time and minimize it can take a +# horrendously long time on regexes that are large (or result in an explosion +# in the number of states). We group these tests together so that such engines +# can simply skip these tests. 
+ +# See: https://github.com/rust-lang/regex/issues/98 +[[tests]] +name = "regression-many-repeat-no-stack-overflow" +regex = '^.{1,2500}' +input = "a" +matches = [[0, 1]] diff --git a/vendor/regex-automata-0.2.0/tests/data/flags.toml b/vendor/regex-automata-0.2.0/tests/data/flags.toml new file mode 100644 index 000000000..2b631ef23 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/flags.toml @@ -0,0 +1,67 @@ +[[tests]] +name = "1" +regex = "(?i)abc" +input = "ABC" +matches = [[0, 3]] + +[[tests]] +name = "2" +regex = "(?i)a(?-i)bc" +input = "Abc" +matches = [[0, 3]] + +[[tests]] +name = "3" +regex = "(?i)a(?-i)bc" +input = "ABC" +matches = [] + +[[tests]] +name = "4" +regex = "(?is)a." +input = "A\n" +matches = [[0, 2]] + +[[tests]] +name = "5" +regex = "(?is)a.(?-is)a." +input = "A\nab" +matches = [[0, 4]] + +[[tests]] +name = "6" +regex = "(?is)a.(?-is)a." +input = "A\na\n" +matches = [] + +[[tests]] +name = "7" +regex = "(?is)a.(?-is:a.)?" +input = "A\na\n" +matches = [[0, 2]] +match_limit = 1 + +[[tests]] +name = "8" +regex = "(?U)a+" +input = "aa" +matches = [[0, 1]] +match_limit = 1 + +[[tests]] +name = "9" +regex = "(?U)a+?" +input = "aa" +matches = [[0, 2]] + +[[tests]] +name = "10" +regex = "(?U)(?-U)a+" +input = "aa" +matches = [[0, 2]] + +[[tests]] +name = "11" +regex = '(?m)(?:^\d+$\n?)+' +input = "123\n456\n789" +matches = [[0, 11]] diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/basic.toml b/vendor/regex-automata-0.2.0/tests/data/fowler/basic.toml new file mode 100644 index 000000000..c965f26ff --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/basic.toml @@ -0,0 +1,1638 @@ +# !!! DO NOT EDIT !!! +# Automatically generated by scripts/fowler-to-toml. +# Numbers in the test names correspond to the line number of the test from +# the original dat file. 
+ +[[tests]] +name = "basic3" +regex = '''abracadabra$''' +input = '''abracadabracadabra''' +captures = [[[7, 18]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic4" +regex = '''a...b''' +input = '''abababbb''' +captures = [[[2, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic5" +regex = '''XXXXXX''' +input = '''..XXXXXX''' +captures = [[[2, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic6" +regex = '''\)''' +input = '''()''' +captures = [[[1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic7" +regex = '''a]''' +input = '''a]a''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic9" +regex = '''\}''' +input = '''}''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic10" +regex = '''\]''' +input = ''']''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic12" +regex = ''']''' +input = ''']''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic15" +regex = '''^a''' +input = '''ax''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic16" +regex = '''\^a''' +input = '''a^a''' +captures = [[[1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic17" +regex = '''a\^''' +input = '''a^''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic18" +regex = '''a$''' +input = '''aa''' +captures = [[[1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic19" +regex = '''a\$''' +input = '''a$''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic20" +regex = '''^$''' +input = '''''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic21" +regex = '''$^''' +input = '''''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic22" +regex = '''a($)''' +input = '''aa''' +captures = [[[1, 2], [2, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic23" +regex = '''a*(^a)''' +input = '''aa''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic24" +regex = '''(..)*(...)*''' +input = '''a''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic25" +regex = '''(..)*(...)*''' +input = '''abcd''' +captures = [[[0, 4], [2, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic26" +regex = '''(ab|a)(bc|c)''' +input = '''abc''' +captures = [[[0, 3], [0, 2], [2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic27" +regex = '''(ab)c|abc''' +input = '''abc''' +captures = [[[0, 3], [0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic28" +regex = '''a{0}b''' +input = '''ab''' +captures = [[[1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic29" +regex = '''(a*)(b?)(b+)b{3}''' +input = '''aaabbbbbbb''' +captures = [[[0, 10], [0, 3], [3, 4], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic30" +regex = '''(a*)(b{0,1})(b{1,})b{3}''' +input = '''aaabbbbbbb''' +captures = [[[0, 10], [0, 3], [3, 4], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic32" +regex = '''((a|a)|a)''' +input = '''a''' +captures = [[[0, 1], [0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic33" +regex = '''(a*)(a|aa)''' +input = '''aaaa''' +captures = [[[0, 4], [0, 3], [3, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic34" +regex = 
'''a*(a.|aa)''' +input = '''aaaa''' +captures = [[[0, 4], [2, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic35" +regex = '''a(b)|c(d)|a(e)f''' +input = '''aef''' +captures = [[[0, 3], [], [], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic36" +regex = '''(a|b)?.*''' +input = '''b''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic37" +regex = '''(a|b)c|a(b|c)''' +input = '''ac''' +captures = [[[0, 2], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic38" +regex = '''(a|b)c|a(b|c)''' +input = '''ab''' +captures = [[[0, 2], [], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic39" +regex = '''(a|b)*c|(a|ab)*c''' +input = '''abc''' +captures = [[[0, 3], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic40" +regex = '''(a|b)*c|(a|ab)*c''' +input = '''xc''' +captures = [[[1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic41" +regex = '''(.a|.b).*|.*(.a|.b)''' +input = '''xa''' +captures = [[[0, 2], [0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic42" +regex = '''a?(ab|ba)ab''' +input = '''abab''' +captures = [[[0, 4], [0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic43" +regex = '''a?(ac{0}b|ba)ab''' +input = '''abab''' +captures = [[[0, 4], [0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic44" +regex = '''ab|abab''' +input = '''abbabab''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic45" +regex = '''aba|bab|bba''' +input = '''baaabbbaba''' +captures = [[[5, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic46" +regex = '''aba|bab''' +input = '''baaabbbaba''' +captures = [[[6, 9]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic47" +regex = '''(aa|aaa)*|(a|aaaaa)''' +input = '''aa''' +captures = [[[0, 2], [0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic48" +regex = '''(a.|.a.)*|(a|.a...)''' +input = '''aa''' +captures = [[[0, 2], [0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic49" +regex = '''ab|a''' +input = '''xabc''' +captures = [[[1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic50" +regex = '''ab|a''' +input = '''xxabc''' +captures = [[[2, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic51" +regex = '''(Ab|cD)*''' +input = '''aBcD''' +captures = [[[0, 4], [2, 4]]] +match_limit = 1 +unescape = true +case_insensitive = true + +[[tests]] +name = "basic52" +regex = '''[^-]''' +input = '''--a''' +captures = [[[2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic53" +regex = '''[a-]*''' +input = '''--a''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic54" +regex = '''[a-m-]*''' +input = '''--amoma--''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic55" +regex = ''':::1:::0:|:::1:1:0:''' +input = ''':::0:::1:::1:::0:''' +captures = [[[8, 17]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic56" +regex = ''':::1:::0:|:::1:1:1:''' +input = ''':::0:::1:::1:::0:''' +captures = [[[8, 17]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic57" +regex = '''[[:upper:]]''' +input = '''A''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic58" +regex = '''[[:lower:]]+''' +input = '''`az{''' +captures = [[[1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic59" +regex = 
'''[[:upper:]]+''' +input = '''@AZ[''' +captures = [[[1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic65" +regex = '''\n''' +input = '''\n''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic66" +regex = '''\n''' +input = '''\n''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic67" +regex = '''[^a]''' +input = '''\n''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic68" +regex = '''\na''' +input = '''\na''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic69" +regex = '''(a)(b)(c)''' +input = '''abc''' +captures = [[[0, 3], [0, 1], [1, 2], [2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic70" +regex = '''xxx''' +input = '''xxx''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic71" +regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' +input = '''feb 6,''' +captures = [[[0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic72" +regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' +input = '''2/7''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic73" +regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' +input = '''feb 1,Feb 6''' +captures = [[[5, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic74" +regex = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))''' +input = '''x''' +captures = [[[0, 1], [0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic75" +regex = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*''' +input = '''xx''' +captures = [[[0, 2], [1, 2], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic76" +regex = '''a?(ab|ba)*''' +input = '''ababababababababababababababababababababababababababababababababababababababababa''' +captures = [[[0, 81], [79, 81]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic77" +regex = '''abaa|abbaa|abbbaa|abbbbaa''' +input = '''ababbabbbabbbabbbbabbbbaa''' +captures = [[[18, 25]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic78" +regex = '''abaa|abbaa|abbbaa|abbbbaa''' +input = '''ababbabbbabbbabbbbabaa''' +captures = [[[18, 22]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic79" +regex = '''aaac|aabc|abac|abbc|baac|babc|bbac|bbbc''' +input = '''baaabbbabac''' +captures = [[[7, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic80" +regex = '''.*''' +input = '''\x01\x7f''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic81" +regex = '''aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll''' +input = '''XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa''' +captures = [[[53, 57]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic83" +regex = '''a*a*a*a*a*b''' +input = '''aaaaaaaaab''' +captures = [[[0, 10]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic84" +regex = '''^''' +input = '''''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic85" +regex = '''$''' +input = '''''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic86" +regex = '''^$''' +input = '''''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic87" +regex = '''^a$''' +input = '''a''' +captures = [[[0, 1]]] +match_limit = 1 
+unescape = true + +[[tests]] +name = "basic88" +regex = '''abc''' +input = '''abc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic89" +regex = '''abc''' +input = '''xabcy''' +captures = [[[1, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic90" +regex = '''abc''' +input = '''ababc''' +captures = [[[2, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic91" +regex = '''ab*c''' +input = '''abc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic92" +regex = '''ab*bc''' +input = '''abc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic93" +regex = '''ab*bc''' +input = '''abbc''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic94" +regex = '''ab*bc''' +input = '''abbbbc''' +captures = [[[0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic95" +regex = '''ab+bc''' +input = '''abbc''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic96" +regex = '''ab+bc''' +input = '''abbbbc''' +captures = [[[0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic97" +regex = '''ab?bc''' +input = '''abbc''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic98" +regex = '''ab?bc''' +input = '''abc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic99" +regex = '''ab?c''' +input = '''abc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic100" +regex = '''^abc$''' +input = '''abc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic101" +regex = '''^abc''' +input = '''abcc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic102" +regex = '''abc$''' +input = '''aabc''' +captures = [[[1, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic103" +regex = '''^''' +input = '''abc''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic104" +regex = '''$''' +input = '''abc''' +captures = [[[3, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic105" +regex = '''a.c''' +input = '''abc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic106" +regex = '''a.c''' +input = '''axc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic107" +regex = '''a.*c''' +input = '''axyzc''' +captures = [[[0, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic108" +regex = '''a[bc]d''' +input = '''abd''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic109" +regex = '''a[b-d]e''' +input = '''ace''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic110" +regex = '''a[b-d]''' +input = '''aac''' +captures = [[[1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic111" +regex = '''a[-b]''' +input = '''a-''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic112" +regex = '''a[b-]''' +input = '''a-''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic113" +regex = '''a]''' +input = '''a]''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic114" +regex = '''a[]]b''' +input = '''a]b''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic115" +regex = '''a[^bc]d''' +input = '''aed''' 
+captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic116" +regex = '''a[^-b]c''' +input = '''adc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic117" +regex = '''a[^]b]c''' +input = '''adc''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic118" +regex = '''ab|cd''' +input = '''abc''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic119" +regex = '''ab|cd''' +input = '''abcd''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic120" +regex = '''a\(b''' +input = '''a(b''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic121" +regex = '''a\(*b''' +input = '''ab''' +captures = [[[0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic122" +regex = '''a\(*b''' +input = '''a((b''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic123" +regex = '''((a))''' +input = '''abc''' +captures = [[[0, 1], [0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic124" +regex = '''(a)b(c)''' +input = '''abc''' +captures = [[[0, 3], [0, 1], [2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic125" +regex = '''a+b+c''' +input = '''aabbabc''' +captures = [[[4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic126" +regex = '''a*''' +input = '''aaa''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic128" +regex = '''(a*)*''' +input = '''-''' +captures = [[[0, 0], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic129" +regex = '''(a*)+''' +input = '''-''' +captures = [[[0, 0], [0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic131" +regex = '''(a*|b)*''' +input = '''-''' +captures = [[[0, 0], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic132" +regex = '''(a+|b)*''' +input = '''ab''' +captures = [[[0, 2], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic133" +regex = '''(a+|b)+''' +input = '''ab''' +captures = [[[0, 2], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic134" +regex = '''(a+|b)?''' +input = '''ab''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic135" +regex = '''[^ab]*''' +input = '''cde''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic137" +regex = '''(^)*''' +input = '''-''' +captures = [[[0, 0], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic138" +regex = '''a*''' +input = '''''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic139" +regex = '''([abc])*d''' +input = '''abbbcd''' +captures = [[[0, 6], [4, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic140" +regex = '''([abc])*bcd''' +input = '''abcd''' +captures = [[[0, 4], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic141" +regex = '''a|b|c|d|e''' +input = '''e''' +captures = [[[0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic142" +regex = '''(a|b|c|d|e)f''' +input = '''ef''' +captures = [[[0, 2], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic144" +regex = '''((a*|b))*''' +input = '''-''' +captures = [[[0, 0], [], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic145" +regex = '''abcd*efg''' +input = '''abcdefg''' +captures = [[[0, 7]]] +match_limit = 1 +unescape = true + 
+[[tests]] +name = "basic146" +regex = '''ab*''' +input = '''xabyabbbz''' +captures = [[[1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic147" +regex = '''ab*''' +input = '''xayabbbz''' +captures = [[[1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic148" +regex = '''(ab|cd)e''' +input = '''abcde''' +captures = [[[2, 5], [2, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic149" +regex = '''[abhgefdc]ij''' +input = '''hij''' +captures = [[[0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic150" +regex = '''(a|b)c*d''' +input = '''abcd''' +captures = [[[1, 4], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic151" +regex = '''(ab|ab*)bc''' +input = '''abc''' +captures = [[[0, 3], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic152" +regex = '''a([bc]*)c*''' +input = '''abc''' +captures = [[[0, 3], [1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic153" +regex = '''a([bc]*)(c*d)''' +input = '''abcd''' +captures = [[[0, 4], [1, 3], [3, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic154" +regex = '''a([bc]+)(c*d)''' +input = '''abcd''' +captures = [[[0, 4], [1, 3], [3, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic155" +regex = '''a([bc]*)(c+d)''' +input = '''abcd''' +captures = [[[0, 4], [1, 2], [2, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic156" +regex = '''a[bcd]*dcdcde''' +input = '''adcdcde''' +captures = [[[0, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic157" +regex = '''(ab|a)b*c''' +input = '''abc''' +captures = [[[0, 3], [0, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic158" +regex = '''((a)(b)c)(d)''' +input = '''abcd''' +captures = [[[0, 4], [0, 3], [0, 1], [1, 2], [3, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic159" +regex = '''[A-Za-z_][A-Za-z0-9_]*''' +input = '''alpha''' +captures = [[[0, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic160" +regex = '''^a(bc+|b[eh])g|.h$''' +input = '''abh''' +captures = [[[1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic161" +regex = '''(bc+d$|ef*g.|h?i(j|k))''' +input = '''effgz''' +captures = [[[0, 5], [0, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic162" +regex = '''(bc+d$|ef*g.|h?i(j|k))''' +input = '''ij''' +captures = [[[0, 2], [0, 2], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic163" +regex = '''(bc+d$|ef*g.|h?i(j|k))''' +input = '''reffgz''' +captures = [[[1, 6], [1, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic164" +regex = '''(((((((((a)))))))))''' +input = '''a''' +captures = [[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic165" +regex = '''multiple words''' +input = '''multiple words yeah''' +captures = [[[0, 14]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic166" +regex = '''(.*)c(.*)''' +input = '''abcde''' +captures = [[[0, 5], [0, 2], [3, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic167" +regex = '''abcd''' +input = '''abcd''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic168" +regex = '''a(bc)d''' +input = '''abcd''' +captures = [[[0, 4], [1, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic169" +regex = '''a[\x01-\x03]?c''' +input = '''a\x02c''' +captures = [[[0, 3]]] +match_limit = 1 
+unescape = true + +[[tests]] +name = "basic170" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Qaddafi''' +captures = [[[0, 15], [], [10, 12]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic171" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Mo'ammar Gadhafi''' +captures = [[[0, 16], [], [11, 13]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic172" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Kaddafi''' +captures = [[[0, 15], [], [10, 12]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic173" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Qadhafi''' +captures = [[[0, 15], [], [10, 12]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic174" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Gadafi''' +captures = [[[0, 14], [], [10, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic175" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Mu'ammar Qadafi''' +captures = [[[0, 15], [], [11, 12]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic176" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Moamar Gaddafi''' +captures = [[[0, 14], [], [9, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic177" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Mu'ammar Qadhdhafi''' +captures = [[[0, 18], [], [13, 15]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic178" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Khaddafi''' +captures = [[[0, 16], [], [11, 13]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic179" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Ghaddafy''' +captures = [[[0, 16], [], [11, 13]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic180" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Ghadafi''' +captures = [[[0, 15], [], [11, 12]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic181" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Ghaddafi''' +captures = [[[0, 16], [], [11, 13]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic182" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muamar Kaddafi''' +captures = [[[0, 14], [], [9, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic183" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Quathafi''' +captures = [[[0, 16], [], [11, 13]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic184" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Gheddafi''' +captures = [[[0, 16], [], [11, 13]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic185" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Moammar Khadafy''' +captures = [[[0, 15], [], [11, 12]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic186" +regex = '''M[ou]'?am+[ae]r .*([AEae]l[- 
])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Moammar Qudhafi''' +captures = [[[0, 15], [], [10, 12]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic187" +regex = '''a+(b|c)*d+''' +input = '''aabcdd''' +captures = [[[0, 6], [3, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic188" +regex = '''^.+$''' +input = '''vivi''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic189" +regex = '''^(.+)$''' +input = '''vivi''' +captures = [[[0, 4], [0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic190" +regex = '''^([^!.]+).att.com!(.+)$''' +input = '''gryphon.att.com!eby''' +captures = [[[0, 19], [0, 7], [16, 19]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic191" +regex = '''^([^!]+!)?([^!]+)$''' +input = '''bas''' +captures = [[[0, 3], [], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic192" +regex = '''^([^!]+!)?([^!]+)$''' +input = '''bar!bas''' +captures = [[[0, 7], [0, 4], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic193" +regex = '''^([^!]+!)?([^!]+)$''' +input = '''foo!bas''' +captures = [[[0, 7], [0, 4], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic194" +regex = '''^.+!([^!]+!)([^!]+)$''' +input = '''foo!bar!bas''' +captures = [[[0, 11], [4, 8], [8, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic195" +regex = '''((foo)|(bar))!bas''' +input = '''bar!bas''' +captures = [[[0, 7], [0, 3], [], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic196" +regex = '''((foo)|(bar))!bas''' +input = '''foo!bar!bas''' +captures = [[[4, 11], [4, 7], [], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic197" +regex = '''((foo)|(bar))!bas''' +input = '''foo!bas''' +captures = [[[0, 7], [0, 3], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic198" +regex = '''((foo)|bar)!bas''' +input = '''bar!bas''' +captures = [[[0, 7], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic199" +regex = '''((foo)|bar)!bas''' +input = '''foo!bar!bas''' +captures = [[[4, 11], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic200" +regex = '''((foo)|bar)!bas''' +input = '''foo!bas''' +captures = [[[0, 7], [0, 3], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic201" +regex = '''(foo|(bar))!bas''' +input = '''bar!bas''' +captures = [[[0, 7], [0, 3], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic202" +regex = '''(foo|(bar))!bas''' +input = '''foo!bar!bas''' +captures = [[[4, 11], [4, 7], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic203" +regex = '''(foo|(bar))!bas''' +input = '''foo!bas''' +captures = [[[0, 7], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic204" +regex = '''(foo|bar)!bas''' +input = '''bar!bas''' +captures = [[[0, 7], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic205" +regex = '''(foo|bar)!bas''' +input = '''foo!bar!bas''' +captures = [[[4, 11], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic206" +regex = '''(foo|bar)!bas''' +input = '''foo!bas''' +captures = [[[0, 7], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic207" +regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''foo!bar!bas''' +captures = [[[0, 11], [0, 11], [], [], [4, 8], [8, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic208" +regex = 
'''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' +input = '''bas''' +captures = [[[0, 3], [], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic209" +regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' +input = '''bar!bas''' +captures = [[[0, 7], [0, 4], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic210" +regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' +input = '''foo!bar!bas''' +captures = [[[0, 11], [], [], [4, 8], [8, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic211" +regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' +input = '''foo!bas''' +captures = [[[0, 7], [0, 4], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic212" +regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''bas''' +captures = [[[0, 3], [0, 3], [], [0, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic213" +regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''bar!bas''' +captures = [[[0, 7], [0, 7], [0, 4], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic214" +regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''foo!bar!bas''' +captures = [[[0, 11], [0, 11], [], [], [4, 8], [8, 11]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic215" +regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''foo!bas''' +captures = [[[0, 7], [0, 7], [0, 4], [4, 7]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic216" +regex = '''.*(/XXX).*''' +input = '''/XXX''' +captures = [[[0, 4], [0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic217" +regex = '''.*(\\XXX).*''' +input = '''\\XXX''' +captures = [[[0, 4], [0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic218" +regex = '''\\XXX''' +input = '''\\XXX''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic219" +regex = '''.*(/000).*''' +input = '''/000''' +captures = [[[0, 4], [0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic220" +regex = '''.*(\\000).*''' +input = '''\\000''' +captures = [[[0, 4], [0, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "basic221" +regex = '''\\000''' +input = '''\\000''' +captures = [[[0, 4]]] +match_limit = 1 +unescape = true + diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/dat/README b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/README new file mode 100644 index 000000000..e70072500 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/README @@ -0,0 +1,24 @@ +Test data was taken from the Go distribution, which was in turn taken from the +testregex test suite: + + http://www2.research.att.com/~astopen/testregex/testregex.html + +Unfortunately, the above link is now dead, but the test data lives on. + +The LICENSE in this directory corresponds to the LICENSE that the data was +originally released under. + +The tests themselves were modified for RE2/Go. A couple (only in +repetition.dat) were modified further by me (Andrew Gallant) so that RE2/Go +would pass them. (Yes, it seems like RE2/Go includes failing test cases.) This +may or may not have been a bad idea, but I think being consistent with an +established regex library is worth something. + +After some number of years, these tests were transformed into a TOML format +using the fowler-to-toml script in the 'scripts' directory.
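 + +Each entry in a dat file becomes a single TOML test, and the number in the +test name is the line number of the entry in the dat file. For example, the +basic.dat line + + E (ab|cd)e abcde (2,5)(2,4) + +is rendered as + + [[tests]] + name = "basic148" + regex = '''(ab|cd)e''' + input = '''abcde''' + captures = [[[2, 5], [2, 4]]] + match_limit = 1 + unescape = true + +where each (start,end) pair in the dat entry becomes a [start, end] span in +the captures array.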
To re-generate the +TOML files, run the following from the root of this repository: + + ./scripts/fowler-to-toml tests/data/fowler tests/data/fowler/dat/*.dat + +which brings them into a sensible structured format in which other tests can be written. diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/dat/basic.dat b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/basic.dat new file mode 100644 index 000000000..e55efaeec --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/basic.dat @@ -0,0 +1,221 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa (0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) +E (a.|.a.)*|(a|.a...)
aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[<element>]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\x7f (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) +BE ^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +#E (a*)* - (0,0)(0,0) +E (a*)* - (0,0)(?,?) RE2/Go +E (a*)+ - (0,0)(0,0) +#E (a*|b)* - (0,0)(0,0) +E (a*|b)* - (0,0)(?,?) RE2/Go +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +#E (^)* - (0,0)(0,0) +E (^)* - (0,0)(?,?) RE2/Go +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +#E ((a*|b))* - (0,0)(0,0)(0,0) +E ((a*|b))* - (0,0)(?,?)(?,?) 
RE2/Go +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas 
(0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/dat/nullsubexpr.dat b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/nullsubexpr.dat new file mode 100644 index 000000000..2e18fbb91 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/nullsubexpr.dat @@ -0,0 +1,79 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +#E SAME b (0,0)(0,0) +E SAME b (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +#E SAME aaaaaa (0,0)(0,0) +E SAME aaaaaa (0,0)(?,?) RE2/Go +E ([^ab]*)* ccccxx (0,6)(0,6) +#E SAME ababab (0,0)(0,0) +E SAME ababab (0,0)(?,?) RE2/Go + +E ((z)+|a)* zabcde (0,2)(1,2) + +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops +#E (a) aaa (0,1)(0,1) +#E (a*?) aaa (0,0)(0,0) +#E (a)*? aaa (0,0) +#E (a*?)*? aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +#E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/dat/repetition-expensive.dat b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/repetition-expensive.dat new file mode 100644 index 000000000..c91580236 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/repetition-expensive.dat @@ -0,0 +1,85 @@ +NOTE implicit vs. 
explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). + +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) + +# The above worked on Linux/GLIBC but the following often fail. 
+# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/dat/repetition.dat b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/repetition.dat new file mode 100644 index 000000000..2dac0823f --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/dat/repetition.dat @@ -0,0 +1,83 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) 
+#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) + +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/nullsubexpr.toml b/vendor/regex-automata-0.2.0/tests/data/fowler/nullsubexpr.toml new file mode 100644 index 000000000..55d1d5b43 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/nullsubexpr.toml @@ -0,0 +1,405 @@ +# !!! DO NOT EDIT !!! +# Automatically generated by scripts/fowler-to-toml. +# Numbers in the test names correspond to the line number of the test from +# the original dat file. + +[[tests]] +name = "nullsubexpr3" +regex = '''(a*)*''' +input = '''a''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr5" +regex = '''(a*)*''' +input = '''x''' +captures = [[[0, 0], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr6" +regex = '''(a*)*''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr7" +regex = '''(a*)*''' +input = '''aaaaaax''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr8" +regex = '''(a*)+''' +input = '''a''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr9" +regex = '''(a*)+''' +input = '''x''' +captures = [[[0, 0], [0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr10" +regex = '''(a*)+''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr11" +regex = '''(a*)+''' +input = '''aaaaaax''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr12" +regex = '''(a+)*''' +input = '''a''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr13" +regex = '''(a+)*''' +input = '''x''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr14" +regex = '''(a+)*''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr15" +regex = '''(a+)*''' +input = '''aaaaaax''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr16" +regex = '''(a+)+''' +input = '''a''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr17" +regex = '''(a+)+''' +input = '''x''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr18" +regex = '''(a+)+''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 6]]] +match_limit 
= 1 +unescape = true + +[[tests]] +name = "nullsubexpr19" +regex = '''(a+)+''' +input = '''aaaaaax''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr21" +regex = '''([a]*)*''' +input = '''a''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr23" +regex = '''([a]*)*''' +input = '''x''' +captures = [[[0, 0], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr24" +regex = '''([a]*)*''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr25" +regex = '''([a]*)*''' +input = '''aaaaaax''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr26" +regex = '''([a]*)+''' +input = '''a''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr27" +regex = '''([a]*)+''' +input = '''x''' +captures = [[[0, 0], [0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr28" +regex = '''([a]*)+''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr29" +regex = '''([a]*)+''' +input = '''aaaaaax''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr30" +regex = '''([^b]*)*''' +input = '''a''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr32" +regex = '''([^b]*)*''' +input = '''b''' +captures = [[[0, 0], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr33" +regex = '''([^b]*)*''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr34" +regex = '''([^b]*)*''' +input = '''aaaaaab''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr35" +regex = '''([ab]*)*''' +input = '''a''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr36" +regex = '''([ab]*)*''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr37" +regex = '''([ab]*)*''' +input = '''ababab''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr38" +regex = '''([ab]*)*''' +input = '''bababa''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr39" +regex = '''([ab]*)*''' +input = '''b''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr40" +regex = '''([ab]*)*''' +input = '''bbbbbb''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr41" +regex = '''([ab]*)*''' +input = '''aaaabcde''' +captures = [[[0, 5], [0, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr42" +regex = '''([^a]*)*''' +input = '''b''' +captures = [[[0, 1], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr43" +regex = '''([^a]*)*''' +input = '''bbbbbb''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr45" +regex = '''([^a]*)*''' +input = '''aaaaaa''' +captures = [[[0, 0], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr46" +regex = '''([^ab]*)*''' +input = '''ccccxx''' +captures = [[[0, 6], [0, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr48" 
+regex = '''([^ab]*)*''' +input = '''ababab''' +captures = [[[0, 0], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr50" +regex = '''((z)+|a)*''' +input = '''zabcde''' +captures = [[[0, 2], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr69" +regex = '''(a*)*(x)''' +input = '''x''' +captures = [[[0, 1], [], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr70" +regex = '''(a*)*(x)''' +input = '''ax''' +captures = [[[0, 2], [0, 1], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr71" +regex = '''(a*)*(x)''' +input = '''axa''' +captures = [[[0, 2], [0, 1], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr73" +regex = '''(a*)+(x)''' +input = '''x''' +captures = [[[0, 1], [0, 0], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr74" +regex = '''(a*)+(x)''' +input = '''ax''' +captures = [[[0, 2], [0, 1], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr75" +regex = '''(a*)+(x)''' +input = '''axa''' +captures = [[[0, 2], [0, 1], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr77" +regex = '''(a*){2}(x)''' +input = '''x''' +captures = [[[0, 1], [0, 0], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr78" +regex = '''(a*){2}(x)''' +input = '''ax''' +captures = [[[0, 2], [1, 1], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "nullsubexpr79" +regex = '''(a*){2}(x)''' +input = '''axa''' +captures = [[[0, 2], [1, 1], [1, 2]]] +match_limit = 1 +unescape = true + diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/repetition-expensive.toml b/vendor/regex-automata-0.2.0/tests/data/fowler/repetition-expensive.toml new file mode 100644 index 000000000..81a896452 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/repetition-expensive.toml @@ -0,0 +1,341 @@ +# !!! DO NOT EDIT !!! +# Automatically generated by scripts/fowler-to-toml. +# Numbers in the test names correspond to the line number of the test from +# the original dat file. 
+ +[[tests]] +name = "repetition-expensive12" +regex = '''X(.?){0,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive13" +regex = '''X(.?){1,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive14" +regex = '''X(.?){2,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive15" +regex = '''X(.?){3,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive16" +regex = '''X(.?){4,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive17" +regex = '''X(.?){5,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive18" +regex = '''X(.?){6,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive19" +regex = '''X(.?){7,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive20" +regex = '''X(.?){8,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive22" +regex = '''X(.?){0,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive24" +regex = '''X(.?){1,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive26" +regex = '''X(.?){2,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive28" +regex = '''X(.?){3,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive30" +regex = '''X(.?){4,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive32" +regex = '''X(.?){5,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive34" +regex = '''X(.?){6,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive36" +regex = '''X(.?){7,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive37" +regex = '''X(.?){8,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive48" +regex = '''(a|ab|c|bcd){0,}(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive49" +regex = '''(a|ab|c|bcd){1,}(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive50" +regex = '''(a|ab|c|bcd){2,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [3, 6], [6, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive51" +regex = '''(a|ab|c|bcd){3,}(d*)''' +input = 
'''ababcd''' +captures = [[[0, 6], [3, 6], [6, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive52" +regex = '''(a|ab|c|bcd){4,}(d*)''' +input = '''ababcd''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive53" +regex = '''(a|ab|c|bcd){0,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive54" +regex = '''(a|ab|c|bcd){1,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive55" +regex = '''(a|ab|c|bcd){2,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [3, 6], [6, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive56" +regex = '''(a|ab|c|bcd){3,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [3, 6], [6, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive57" +regex = '''(a|ab|c|bcd){4,10}(d*)''' +input = '''ababcd''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive58" +regex = '''(a|ab|c|bcd)*(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive59" +regex = '''(a|ab|c|bcd)+(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive65" +regex = '''(ab|a|c|bcd){0,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive67" +regex = '''(ab|a|c|bcd){1,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive69" +regex = '''(ab|a|c|bcd){2,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive71" +regex = '''(ab|a|c|bcd){3,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive72" +regex = '''(ab|a|c|bcd){4,}(d*)''' +input = '''ababcd''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive74" +regex = '''(ab|a|c|bcd){0,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive76" +regex = '''(ab|a|c|bcd){1,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive78" +regex = '''(ab|a|c|bcd){2,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive80" +regex = '''(ab|a|c|bcd){3,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive81" +regex = '''(ab|a|c|bcd){4,10}(d*)''' +input = '''ababcd''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive83" +regex = '''(ab|a|c|bcd)*(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-expensive85" +regex = '''(ab|a|c|bcd)+(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + diff --git 
a/vendor/regex-automata-0.2.0/tests/data/fowler/repetition-long.toml b/vendor/regex-automata-0.2.0/tests/data/fowler/repetition-long.toml new file mode 100644 index 000000000..fa24c834a --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/repetition-long.toml @@ -0,0 +1,341 @@ +# !!! DO NOT EDIT !!! +# Automatically generated by scripts/fowler-to-toml. +# Numbers in the test names correspond to the line number of the test from +# the original dat file. + +[[tests]] +name = "repetition-long12" +regex = '''X(.?){0,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long13" +regex = '''X(.?){1,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long14" +regex = '''X(.?){2,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long15" +regex = '''X(.?){3,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long16" +regex = '''X(.?){4,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long17" +regex = '''X(.?){5,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long18" +regex = '''X(.?){6,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long19" +regex = '''X(.?){7,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [7, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long20" +regex = '''X(.?){8,}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long22" +regex = '''X(.?){0,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long24" +regex = '''X(.?){1,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long26" +regex = '''X(.?){2,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long28" +regex = '''X(.?){3,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long30" +regex = '''X(.?){4,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long32" +regex = '''X(.?){5,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long34" +regex = '''X(.?){6,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long36" +regex = '''X(.?){7,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long37" +regex = '''X(.?){8,8}Y''' +input = '''X1234567Y''' +captures = [[[0, 9], [8, 8]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long48" +regex = '''(a|ab|c|bcd){0,}(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long49" +regex = '''(a|ab|c|bcd){1,}(d*)''' 
+input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long50" +regex = '''(a|ab|c|bcd){2,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [3, 6], [6, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long51" +regex = '''(a|ab|c|bcd){3,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [3, 6], [6, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long52" +regex = '''(a|ab|c|bcd){4,}(d*)''' +input = '''ababcd''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long53" +regex = '''(a|ab|c|bcd){0,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long54" +regex = '''(a|ab|c|bcd){1,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long55" +regex = '''(a|ab|c|bcd){2,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [3, 6], [6, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long56" +regex = '''(a|ab|c|bcd){3,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [3, 6], [6, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long57" +regex = '''(a|ab|c|bcd){4,10}(d*)''' +input = '''ababcd''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long58" +regex = '''(a|ab|c|bcd)*(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long59" +regex = '''(a|ab|c|bcd)+(d*)''' +input = '''ababcd''' +captures = [[[0, 1], [0, 1], [1, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long65" +regex = '''(ab|a|c|bcd){0,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long67" +regex = '''(ab|a|c|bcd){1,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long69" +regex = '''(ab|a|c|bcd){2,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long71" +regex = '''(ab|a|c|bcd){3,}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long72" +regex = '''(ab|a|c|bcd){4,}(d*)''' +input = '''ababcd''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long74" +regex = '''(ab|a|c|bcd){0,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long76" +regex = '''(ab|a|c|bcd){1,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long78" +regex = '''(ab|a|c|bcd){2,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long80" +regex = '''(ab|a|c|bcd){3,10}(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long81" +regex = '''(ab|a|c|bcd){4,10}(d*)''' +input = '''ababcd''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long83" +regex = '''(ab|a|c|bcd)*(d*)''' +input = '''ababcd''' 
+captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition-long85" +regex = '''(ab|a|c|bcd)+(d*)''' +input = '''ababcd''' +captures = [[[0, 6], [4, 5], [5, 6]]] +match_limit = 1 +unescape = true + diff --git a/vendor/regex-automata-0.2.0/tests/data/fowler/repetition.toml b/vendor/regex-automata-0.2.0/tests/data/fowler/repetition.toml new file mode 100644 index 000000000..fc8da8df4 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/fowler/repetition.toml @@ -0,0 +1,397 @@ +# !!! DO NOT EDIT !!! +# Automatically generated by scripts/fowler-to-toml. +# Numbers in the test names correspond to the line number of the test from +# the original dat file. + +[[tests]] +name = "repetition10" +regex = '''((..)|(.))''' +input = '''''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition11" +regex = '''((..)|(.))((..)|(.))''' +input = '''''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition12" +regex = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition14" +regex = '''((..)|(.)){1}''' +input = '''''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition15" +regex = '''((..)|(.)){2}''' +input = '''''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition16" +regex = '''((..)|(.)){3}''' +input = '''''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition18" +regex = '''((..)|(.))*''' +input = '''''' +captures = [[[0, 0]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition20" +regex = '''((..)|(.))''' +input = '''a''' +captures = [[[0, 1], [0, 1], [], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition21" +regex = '''((..)|(.))((..)|(.))''' +input = '''a''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition22" +regex = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''a''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition24" +regex = '''((..)|(.)){1}''' +input = '''a''' +captures = [[[0, 1], [0, 1], [], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition25" +regex = '''((..)|(.)){2}''' +input = '''a''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition26" +regex = '''((..)|(.)){3}''' +input = '''a''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition28" +regex = '''((..)|(.))*''' +input = '''a''' +captures = [[[0, 1], [0, 1], [], [0, 1]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition30" +regex = '''((..)|(.))''' +input = '''aa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition31" +regex = '''((..)|(.))((..)|(.))''' +input = '''aa''' +captures = [[[0, 2], [0, 1], [], [0, 1], [1, 2], [], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition32" +regex = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aa''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition34" +regex = '''((..)|(.)){1}''' +input = '''aa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition35" +regex = '''((..)|(.)){2}''' +input = '''aa''' +captures = [[[0, 2], [1, 2], [], [1, 2]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition36" +regex = '''((..)|(.)){3}''' +input = 
'''aa''' +captures = [] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition38" +regex = '''((..)|(.))*''' +input = '''aa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition40" +regex = '''((..)|(.))''' +input = '''aaa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition41" +regex = '''((..)|(.))((..)|(.))''' +input = '''aaa''' +captures = [[[0, 3], [0, 2], [0, 2], [], [2, 3], [], [2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition42" +regex = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aaa''' +captures = [[[0, 3], [0, 1], [], [0, 1], [1, 2], [], [1, 2], [2, 3], [], [2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition44" +regex = '''((..)|(.)){1}''' +input = '''aaa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition46" +regex = '''((..)|(.)){2}''' +input = '''aaa''' +captures = [[[0, 3], [2, 3], [0, 2], [2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition47" +regex = '''((..)|(.)){3}''' +input = '''aaa''' +captures = [[[0, 3], [2, 3], [], [2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition50" +regex = '''((..)|(.))*''' +input = '''aaa''' +captures = [[[0, 3], [2, 3], [0, 2], [2, 3]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition52" +regex = '''((..)|(.))''' +input = '''aaaa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition53" +regex = '''((..)|(.))((..)|(.))''' +input = '''aaaa''' +captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition54" +regex = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aaaa''' +captures = [[[0, 4], [0, 2], [0, 2], [], [2, 3], [], [2, 3], [3, 4], [], [3, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition56" +regex = '''((..)|(.)){1}''' +input = '''aaaa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition57" +regex = '''((..)|(.)){2}''' +input = '''aaaa''' +captures = [[[0, 4], [2, 4], [2, 4], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition59" +regex = '''((..)|(.)){3}''' +input = '''aaaa''' +captures = [[[0, 4], [3, 4], [0, 2], [3, 4]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition61" +regex = '''((..)|(.))*''' +input = '''aaaa''' +captures = [[[0, 4], [2, 4], [2, 4], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition63" +regex = '''((..)|(.))''' +input = '''aaaaa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition64" +regex = '''((..)|(.))((..)|(.))''' +input = '''aaaaa''' +captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition65" +regex = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aaaaa''' +captures = [[[0, 5], [0, 2], [0, 2], [], [2, 4], [2, 4], [], [4, 5], [], [4, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition67" +regex = '''((..)|(.)){1}''' +input = '''aaaaa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition68" +regex = '''((..)|(.)){2}''' +input = '''aaaaa''' +captures = [[[0, 4], [2, 4], [2, 4], []]] +match_limit = 1 +unescape = true + 
+[[tests]] +name = "repetition70" +regex = '''((..)|(.)){3}''' +input = '''aaaaa''' +captures = [[[0, 5], [4, 5], [2, 4], [4, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition73" +regex = '''((..)|(.))*''' +input = '''aaaaa''' +captures = [[[0, 5], [4, 5], [2, 4], [4, 5]]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition75" +regex = '''((..)|(.))''' +input = '''aaaaaa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition76" +regex = '''((..)|(.))((..)|(.))''' +input = '''aaaaaa''' +captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition77" +regex = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aaaaaa''' +captures = [[[0, 6], [0, 2], [0, 2], [], [2, 4], [2, 4], [], [4, 6], [4, 6], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition79" +regex = '''((..)|(.)){1}''' +input = '''aaaaaa''' +captures = [[[0, 2], [0, 2], [0, 2], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition80" +regex = '''((..)|(.)){2}''' +input = '''aaaaaa''' +captures = [[[0, 4], [2, 4], [2, 4], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition81" +regex = '''((..)|(.)){3}''' +input = '''aaaaaa''' +captures = [[[0, 6], [4, 6], [4, 6], []]] +match_limit = 1 +unescape = true + +[[tests]] +name = "repetition83" +regex = '''((..)|(.))*''' +input = '''aaaaaa''' +captures = [[[0, 6], [4, 6], [4, 6], []]] +match_limit = 1 +unescape = true + diff --git a/vendor/regex-automata-0.2.0/tests/data/iter.toml b/vendor/regex-automata-0.2.0/tests/data/iter.toml new file mode 100644 index 000000000..6c0539fd4 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/iter.toml @@ -0,0 +1,119 @@ +[[tests]] +name = "1" +regex = "a" +input = "aaa" +matches = [[0, 1], [1, 2], [2, 3]] + +[[tests]] +name = "2" +regex = "a" +input = "aba" +matches = [[0, 1], [2, 3]] + +[[tests]] +name = "empty1" +regex = '' +input = '' +matches = [[0, 0]] + +[[tests]] +name = "empty2" +regex = '' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty3" +regex = '()' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty4" +regex = '()*' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty5" +regex = '()+' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty6" +regex = '()?' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty7" +regex = '()()' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty8" +regex = '()+|z' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty9" +regex = 'z|()+' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty10" +regex = '()+|b' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "empty11" +regex = 'b|()+' +input = 'abc' +matches = [[0, 0], [1, 2], [3, 3]] + +[[tests]] +name = "start1" +regex = "^a" +input = "a" +matches = [[0, 1]] + +[[tests]] +name = "start2" +regex = "^a" +input = "aa" +matches = [[0, 1]] + +[[tests]] +name = "anchored1" +regex = "a" +input = "a" +matches = [[0, 1]] +anchored = true + +# This test is pretty subtle. It demonstrates the crucial difference between +# '^a' and 'a' compiled in 'anchored' mode. The former regex exclusively +# matches at the start of a haystack and nowhere else. 
The latter regex has +# no such restriction, but its automaton is constructed such that it lacks a +# `.*?` prefix. So it can actually produce matches at multiple locations. +# The anchored3 test drives this point home. +[[tests]] +name = "anchored2" +regex = "a" +input = "aa" +matches = [[0, 1], [1, 2]] +anchored = true + +# Unlike anchored2, this test stops matching anything after it sees `b` +# since it lacks a `.*?` prefix. Since it is looking for 'a' but sees 'b', it +# determines that there are no remaining matches. +[[tests]] +name = "anchored3" +regex = "a" +input = "aaba" +matches = [[0, 1], [1, 2]] +anchored = true diff --git a/vendor/regex-automata-0.2.0/tests/data/misc.toml b/vendor/regex-automata-0.2.0/tests/data/misc.toml new file mode 100644 index 000000000..c05418dd6 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/misc.toml @@ -0,0 +1,99 @@ +[[tests]] +name = "ascii-literal" +regex = "a" +input = "a" +matches = [[0, 1]] + +[[tests]] +name = "ascii-literal-not" +regex = "a" +input = "z" +matches = [] + +[[tests]] +name = "ascii-literal-anchored" +regex = "a" +input = "a" +matches = [[0, 1]] +anchored = true + +[[tests]] +name = "ascii-literal-anchored-not" +regex = "a" +input = "z" +matches = [] +anchored = true + +[[tests]] +name = "anchor-start-end-line" +regex = '(?m)^bar$' +input = "foo\nbar\nbaz" +matches = [[4, 7]] + +[[tests]] +name = "prefix-literal-match" +regex = '^abc' +input = "abc" +matches = [[0, 3]] + +[[tests]] +name = "prefix-literal-match-ascii" +regex = '^abc' +input = "abc" +matches = [[0, 3]] +unicode = false +utf8 = false + +[[tests]] +name = "prefix-literal-no-match" +regex = '^abc' +input = "zabc" +matches = [] + +[[tests]] +name = "one-literal-edge" +regex = 'abc' +input = "xxxxxab" +matches = [] + +[[tests]] +name = "terminates" +regex = 'a$' +input = "a" +matches = [[0, 1]] + +[[tests]] +name = "suffix-100" +regex = '.*abcd' +input = "abcd" +matches = [[0, 4]] + +[[tests]] +name = "suffix-200" +regex = '.*(?:abcd)+' +input = "abcd" +matches = [[0, 4]] + +[[tests]] +name = "suffix-300" +regex = '.*(?:abcd)+' +input = "abcdabcd" +matches = [[0, 8]] + +[[tests]] +name = "suffix-400" +regex = '.*(?:abcd)+' +input = "abcdxabcd" +matches = [[0, 9]] + +[[tests]] +name = "suffix-500" +regex = '.*x(?:abcd)+' +input = "abcdxabcd" +matches = [[0, 9]] + +[[tests]] +name = "suffix-600" +regex = '[^abcd]*x(?:abcd)+' +input = "abcdxabcd" +matches = [[4, 9]] diff --git a/vendor/regex-automata-0.2.0/tests/data/multiline.toml b/vendor/regex-automata-0.2.0/tests/data/multiline.toml new file mode 100644 index 000000000..cefdb2629 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/multiline.toml @@ -0,0 +1,275 @@ +[[tests]] +name = "basic1" +regex = '(?m)^[a-z]+$' +input = "abc\ndef\nxyz" +matches = [[0, 3], [4, 7], [8, 11]] + +[[tests]] +name = "basic2" +regex = '(?m)^$' +input = "abc\ndef\nxyz" +matches = [] + +[[tests]] +name = "basic3" +regex = '(?m)^' +input = "abc\ndef\nxyz" +matches = [[0, 0], [4, 4], [8, 8]] + +[[tests]] +name = "basic4" +regex = '(?m)$' +input = "abc\ndef\nxyz" +matches = [[3, 3], [7, 7], [11, 11]] + +[[tests]] +name = "basic5" +regex = '(?m)^[a-z]' +input = "abc\ndef\nxyz" +matches = [[0, 1], [4, 5], [8, 9]] + +[[tests]] +name = "basic6" +regex = '(?m)[a-z]^' +input = "abc\ndef\nxyz" +matches = [] + +[[tests]] +name = "basic7" +regex = '(?m)[a-z]$' +input = "abc\ndef\nxyz" +matches = [[2, 3], [6, 7], [10, 11]] + +[[tests]] +name = "basic8" +regex = '(?m)$[a-z]' +input = "abc\ndef\nxyz" +matches = [] + +[[tests]] +name = 
"basic9" +regex = '(?m)^$' +input = "" +matches = [[0, 0]] + +[[tests]] +name = "repeat1" +regex = '(?m)(?:^$)*' +input = "a\nb\nc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] + +[[tests]] +name = "repeat1-no-multi" +regex = '(?:^$)*' +input = "a\nb\nc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] + +[[tests]] +name = "repeat2" +regex = '(?m)(?:^|a)+' +input = "a\naaa\n" +matches = [[0, 0], [2, 2], [3, 5], [6, 6]] + +[[tests]] +name = "repeat100" +regex = '(?m)(?:^|a)+' +input = "a\naaa\n" +matches = [[0, 0], [2, 2], [3, 5], [6, 6]] + +[[tests]] +name = "repeat2-no-multi" +regex = '(?:^|a)+' +input = "a\naaa\n" +matches = [[0, 0], [2, 5]] + +[[tests]] +name = "repeat3" +regex = '(?m)(?:^|a)*' +input = "a\naaa\n" +matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]] + +[[tests]] +name = "repeat3-no-multi" +regex = '(?:^|a)*' +input = "a\naaa\n" +matches = [[0, 0], [1, 1], [2, 5], [6, 6]] + +[[tests]] +name = "repeat4" +regex = '(?m)(?:^|a+)' +input = "a\naaa\n" +matches = [[0, 0], [2, 2], [3, 5], [6, 6]] + +[[tests]] +name = "repeat4-no-multi" +regex = '(?:^|a+)' +input = "a\naaa\n" +matches = [[0, 0], [2, 5]] + +[[tests]] +name = "repeat5" +regex = '(?m)(?:^|a*)' +input = "a\naaa\n" +matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]] + +[[tests]] +name = "repeat5-no-multi" +regex = '(?:^|a*)' +input = "a\naaa\n" +matches = [[0, 0], [1, 1], [2, 5], [6, 6]] + +[[tests]] +name = "repeat6" +regex = '(?m)(?:^[a-z])+' +input = "abc\ndef\nxyz" +matches = [[0, 1], [4, 5], [8, 9]] + +[[tests]] +name = "repeat6-no-multi" +regex = '(?:^[a-z])+' +input = "abc\ndef\nxyz" +matches = [[0, 1]] + +[[tests]] +name = "repeat7" +regex = '(?m)(?:^[a-z]{3}\n?)+' +input = "abc\ndef\nxyz" +matches = [[0, 11]] + +[[tests]] +name = "repeat7-no-multi" +regex = '(?:^[a-z]{3}\n?)+' +input = "abc\ndef\nxyz" +matches = [[0, 4]] + +[[tests]] +name = "repeat8" +regex = '(?m)(?:^[a-z]{3}\n?)*' +input = "abc\ndef\nxyz" +matches = [[0, 11]] + +[[tests]] +name = "repeat8-no-multi" +regex = '(?:^[a-z]{3}\n?)*' +input = "abc\ndef\nxyz" +matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]] + +[[tests]] +name = "repeat9" +regex = '(?m)(?:\n?[a-z]{3}$)+' +input = "abc\ndef\nxyz" +matches = [[0, 11]] + +[[tests]] +name = "repeat9-no-multi" +regex = '(?:\n?[a-z]{3}$)+' +input = "abc\ndef\nxyz" +matches = [[7, 11]] + +[[tests]] +name = "repeat10" +regex = '(?m)(?:\n?[a-z]{3}$)*' +input = "abc\ndef\nxyz" +matches = [[0, 11]] + +[[tests]] +name = "repeat10-no-multi" +regex = '(?:\n?[a-z]{3}$)*' +input = "abc\ndef\nxyz" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]] + +[[tests]] +name = "repeat11" +regex = '(?m)^*' +input = "\naa\n" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] + +[[tests]] +name = "repeat11-no-multi" +regex = '^*' +input = "\naa\n" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] + +[[tests]] +name = "repeat12" +regex = '(?m)^+' +input = "\naa\n" +matches = [[0, 0], [1, 1], [4, 4]] + +[[tests]] +name = "repeat12-no-multi" +regex = '^+' +input = "\naa\n" +matches = [[0, 0]] + +[[tests]] +name = "repeat13" +regex = '(?m)$*' +input = "\naa\n" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] + +[[tests]] +name = "repeat13-no-multi" +regex = '$*' +input = "\naa\n" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] + +[[tests]] +name = "repeat14" +regex = '(?m)$+' +input = "\naa\n" +matches = [[0, 0], [3, 3], [4, 4]] + +[[tests]] +name = "repeat14-no-multi" +regex = '$+' +input = "\naa\n" +matches = [[4, 4]] + +[[tests]] +name = 
"repeat15" +regex = '(?m)(?:$\n)+' +input = "\n\naaa\n\n" +matches = [[0, 2], [5, 7]] + +[[tests]] +name = "repeat15-no-multi" +regex = '(?:$\n)+' +input = "\n\naaa\n\n" +matches = [] + +[[tests]] +name = "repeat16" +regex = '(?m)(?:$\n)*' +input = "\n\naaa\n\n" +matches = [[0, 2], [3, 3], [4, 4], [5, 7]] + +[[tests]] +name = "repeat16-no-multi" +regex = '(?:$\n)*' +input = "\n\naaa\n\n" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]] + +[[tests]] +name = "repeat17" +regex = '(?m)(?:$\n^)+' +input = "\n\naaa\n\n" +matches = [[0, 2], [5, 7]] + +[[tests]] +name = "repeat17-no-multi" +regex = '(?:$\n^)+' +input = "\n\naaa\n\n" +matches = [] + +[[tests]] +name = "repeat18" +regex = '(?m)(?:^|$)+' +input = "\n\naaa\n\n" +matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]] + +[[tests]] +name = "repeat18-no-multi" +regex = '(?:^|$)+' +input = "\n\naaa\n\n" +matches = [[0, 0], [7, 7]] diff --git a/vendor/regex-automata-0.2.0/tests/data/no-unicode.toml b/vendor/regex-automata-0.2.0/tests/data/no-unicode.toml new file mode 100644 index 000000000..c7fc9664f --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/no-unicode.toml @@ -0,0 +1,158 @@ +[[tests]] +name = "invalid-utf8-literal1" +regex = '\xFF' +input = '\xFF' +matches = [[0, 1]] +unicode = false +utf8 = false +unescape = true + + +[[tests]] +name = "mixed" +regex = '(.+)(?-u)(.+)' +input = '\xCE\x93\xCE\x94\xFF' +matches = [[0, 5]] +utf8 = false +unescape = true + + +[[tests]] +name = "case1" +regex = "a" +input = "A" +matches = [[0, 1]] +case_insensitive = true +unicode = false + +[[tests]] +name = "case2" +regex = "[a-z]+" +input = "AaAaA" +matches = [[0, 5]] +case_insensitive = true +unicode = false + +[[tests]] +name = "case3" +regex = "[a-z]+" +input = "aA\u212AaA" +matches = [[0, 7]] +case_insensitive = true + +[[tests]] +name = "case4" +regex = "[a-z]+" +input = "aA\u212AaA" +matches = [[0, 2], [5, 7]] +case_insensitive = true +unicode = false + + +[[tests]] +name = "negate1" +regex = "[^a]" +input = "δ" +matches = [[0, 2]] + +[[tests]] +name = "negate2" +regex = "[^a]" +input = "δ" +matches = [[0, 1], [1, 2]] +unicode = false +utf8 = false + + +[[tests]] +name = "dotstar-prefix1" +regex = "a" +input = '\xFFa' +matches = [[1, 2]] +unicode = false +utf8 = false +unescape = true + +[[tests]] +name = "dotstar-prefix2" +regex = "a" +input = '\xFFa' +matches = [[1, 2]] +utf8 = false +unescape = true + + +[[tests]] +name = "null-bytes1" +regex = '[^\x00]+\x00' +input = 'foo\x00' +matches = [[0, 4]] +unicode = false +utf8 = false +unescape = true + + +[[tests]] +name = "word-ascii" +regex = '\w+' +input = "aδ" +matches = [[0, 1]] +unicode = false + +[[tests]] +name = "word-unicode" +regex = '\w+' +input = "aδ" +matches = [[0, 3]] + +[[tests]] +name = "decimal-ascii" +regex = '\d+' +input = "1२३9" +matches = [[0, 1], [7, 8]] +unicode = false + +[[tests]] +name = "decimal-unicode" +regex = '\d+' +input = "1२३9" +matches = [[0, 8]] + +[[tests]] +name = "space-ascii" +regex = '\s+' +input = " \u1680" +matches = [[0, 1]] +unicode = false + +[[tests]] +name = "space-unicode" +regex = '\s+' +input = " \u1680" +matches = [[0, 4]] + + +[[tests]] +# See: https://github.com/rust-lang/regex/issues/484 +name = "iter1-bytes" +regex = '' +input = "☃" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] +utf8 = false + +[[tests]] +# See: https://github.com/rust-lang/regex/issues/484 +name = "iter1-utf8" +regex = '' +input = "☃" +matches = [[0, 0], [3, 3]] + +[[tests]] +# See: https://github.com/rust-lang/regex/issues/484 +# 
Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8. +name = "iter2-bytes" +regex = '' +input = 'b\xFFr' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] +unescape = true +utf8 = false diff --git a/vendor/regex-automata-0.2.0/tests/data/overlapping.toml b/vendor/regex-automata-0.2.0/tests/data/overlapping.toml new file mode 100644 index 000000000..6662876b4 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/overlapping.toml @@ -0,0 +1,126 @@ +[[tests]] +name = "repetition-plus-leftmost-first-100" +regex = 'a+' +input = "aaa" +matches = [[0, 1], [0, 2], [0, 3]] +match_kind = "leftmost-first" +search_kind = "overlapping" + +[[tests]] +name = "repetition-plus-all-100" +regex = 'a+' +input = "aaa" +matches = [[0, 1], [0, 2], [0, 3]] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "repetition-plus-leftmost-first-200" +regex = '(abc)+' +input = "zzabcabczzabc" +matches = [[2, 5], [2, 8]] +match_kind = "leftmost-first" +search_kind = "overlapping" + +[[tests]] +name = "repetition-plus-all-200" +regex = '(abc)+' +input = "zzabcabczzabc" +matches = [[2, 5], [2, 8], [10, 13]] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "repetition-star-leftmost-first-100" +regex = 'a*' +input = "aaa" +matches = [[0, 0], [0, 1], [0, 2], [0, 3]] +match_kind = "leftmost-first" +search_kind = "overlapping" + +[[tests]] +name = "repetition-star-all-100" +regex = 'a*' +input = "aaa" +matches = [[0, 0], [0, 1], [0, 2], [0, 3]] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "repetition-star-leftmost-first-200" +regex = '(abc)*' +input = "zzabcabczzabc" +matches = [[0, 0]] +match_kind = "leftmost-first" +search_kind = "overlapping" + +[[tests]] +name = "repetition-star-all-200" +regex = '(abc)*' +input = "zzabcabczzabc" +matches = [ + [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], + [2, 5], + [6, 6], [7, 7], + [2, 8], + [9, 9], [10, 10], [11, 11], [12, 12], + [10, 13], +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "start-end-rep-leftmost-first" +regex = '(^$)*' +input = "abc" +matches = [[0, 0]] +match_kind = "leftmost-first" +search_kind = "overlapping" + +[[tests]] +name = "start-end-rep-all" +regex = '(^$)*' +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "alt-leftmost-first-100" +regex = 'abc|a' +input = "zzabcazzaabc" +matches = [[2, 3], [2, 5]] +match_kind = "leftmost-first" +search_kind = "overlapping" + +[[tests]] +name = "alt-all-100" +regex = 'abc|a' +input = "zzabcazzaabc" +matches = [[2, 3], [2, 5], [5, 6], [8, 9], [9, 10], [9, 12]] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty-000" +regex = "" +input = "abc" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty-alt-000" +regex = "|b" +input = "abc" +matches = [[0, 0], [1, 1], [1, 2], [3, 3]] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty-alt-010" +regex = "b|" +input = "abc" +matches = [[0, 0], [1, 1], [1, 2], [3, 3]] +match_kind = "all" +search_kind = "overlapping" diff --git a/vendor/regex-automata-0.2.0/tests/data/regression.toml b/vendor/regex-automata-0.2.0/tests/data/regression.toml new file mode 100644 index 000000000..6a4dbb151 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/regression.toml @@ -0,0 +1,423 @@ +# See: https://github.com/rust-lang/regex/issues/48 +[[tests]] +name = "invalid-regex-no-crash-100" +regex = '(*)' 
+input = "" +matches = [] +compiles = false + +# See: https://github.com/rust-lang/regex/issues/48 +[[tests]] +name = "invalid-regex-no-crash-200" +regex = '(?:?)' +input = "" +matches = [] +compiles = false + +# See: https://github.com/rust-lang/regex/issues/48 +[[tests]] +name = "invalid-regex-no-crash-300" +regex = '(?)' +input = "" +matches = [] +compiles = false + +# See: https://github.com/rust-lang/regex/issues/48 +[[tests]] +name = "invalid-regex-no-crash-400" +regex = '*' +input = "" +matches = [] +compiles = false + +# See: https://github.com/rust-lang/regex/issues/75 +[[tests]] +name = "unsorted-binary-search-100" +regex = '(?i-u)[a_]+' +input = "A_" +matches = [[0, 2]] + +# See: https://github.com/rust-lang/regex/issues/75 +[[tests]] +name = "unsorted-binary-search-200" +regex = '(?i-u)[A_]+' +input = "a_" +matches = [[0, 2]] + +# See: https://github.com/rust-lang/regex/issues/76 +[[tests]] +name = "unicode-case-lower-nocase-flag" +regex = '(?i)\p{Ll}+' +input = "ΛΘΓΔα" +matches = [[0, 10]] + +# See: https://github.com/rust-lang/regex/issues/99 +[[tests]] +name = "negated-char-class-100" +regex = '(?i)[^x]' +input = "x" +matches = [] + +# See: https://github.com/rust-lang/regex/issues/99 +[[tests]] +name = "negated-char-class-200" +regex = '(?i)[^x]' +input = "X" +matches = [] + +# See: https://github.com/rust-lang/regex/issues/101 +[[tests]] +name = "ascii-word-underscore" +regex = '[[:word:]]' +input = "_" +matches = [[0, 1]] + +# See: https://github.com/rust-lang/regex/issues/129 +[[tests]] +name = "captures-repeat" +regex = '([a-f]){2}(?P<foo>[x-z])' +input = "abx" +captures = [ + [[0, 3], [0, 2], [2, 3]], +] + +# See: https://github.com/rust-lang/regex/issues/153 +[[tests]] +name = "alt-in-alt-100" +regex = 'ab?|$' +input = "az" +matches = [[0, 1], [2, 2]] + +# See: https://github.com/rust-lang/regex/issues/153 +[[tests]] +name = "alt-in-alt-200" +regex = '^(.*?)(\n|\r\n?|$)' +input = "ab\rcd" +matches = [[0, 3]] + +# See: https://github.com/rust-lang/regex/issues/169 +[[tests]] +name = "leftmost-first-prefix" +regex = 'z*azb' +input = "azb" +matches = [[0, 3]] + +# See: https://github.com/rust-lang/regex/issues/191 +[[tests]] +name = "many-alternates" +regex = '1|2|3|4|5|6|7|8|9|10|int' +input = "int" +matches = [[0, 3]] + +# See: https://github.com/rust-lang/regex/issues/204 +[[tests]] +name = "word-boundary-alone-100" +regex = '\b' +input = "Should this (work?)" +matches = [[0, 0], [6, 6], [7, 7], [11, 11], [13, 13], [17, 17]] + +# See: https://github.com/rust-lang/regex/issues/204 +[[tests]] +name = "word-boundary-alone-200" +regex = '\b' +input = "a b c" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] + +# See: https://github.com/rust-lang/regex/issues/264 +[[tests]] +name = "word-boundary-ascii-no-capture" +regex = '\B' +input = "\U00028F3E" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] +unicode = false +utf8 = false + +# See: https://github.com/rust-lang/regex/issues/264 +[[tests]] +name = "word-boundary-ascii-capture" +regex = '(\B)' +input = "\U00028F3E" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] +unicode = false +utf8 = false + +# See: https://github.com/rust-lang/regex/issues/268 +[[tests]] +name = "partial-anchor" +regex = '^a|b' +input = "ba" +matches = [[0, 1]] + +# See: https://github.com/rust-lang/regex/issues/271 +[[tests]] +name = "endl-or-word-boundary" +regex = '(?m:$)|(?-u:\b)' +input = "\U0006084E" +matches = [[4, 4]] + +# See: https://github.com/rust-lang/regex/issues/271 +[[tests]] +name = "zero-or-end" +regex = 
'(?i-u:\x00)|$' +input = "\U000E682F" +matches = [[4, 4]] + +# See: https://github.com/rust-lang/regex/issues/271 +[[tests]] +name = "y-or-endl" +regex = '(?i-u:y)|(?m:$)' +input = "\U000B4331" +matches = [[4, 4]] + +# See: https://github.com/rust-lang/regex/issues/271 +[[tests]] +name = "word-boundary-start-x" +regex = '(?u:\b)^(?-u:X)' +input = "X" +matches = [[0, 1]] + +# See: https://github.com/rust-lang/regex/issues/271 +[[tests]] +name = "word-boundary-ascii-start-x" +regex = '(?-u:\b)^(?-u:X)' +input = "X" +matches = [[0, 1]] + +# See: https://github.com/rust-lang/regex/issues/271 +[[tests]] +name = "end-not-word-boundary" +regex = '$\B' +input = "\U0005C124\U000B576C" +matches = [[8, 8]] +unicode = false +utf8 = false + +# See: https://github.com/rust-lang/regex/issues/280 +[[tests]] +name = "partial-anchor-alternate-begin" +regex = '^a|z' +input = "yyyyya" +matches = [] + +# See: https://github.com/rust-lang/regex/issues/280 +[[tests]] +name = "partial-anchor-alternate-end" +regex = 'a$|z' +input = "ayyyyy" +matches = [] + +# See: https://github.com/rust-lang/regex/issues/289 +[[tests]] +name = "lits-unambiguous-100" +regex = '(ABC|CDA|BC)X' +input = "CDAX" +matches = [[0, 4]] + +# See: https://github.com/rust-lang/regex/issues/291 +[[tests]] +name = "lits-unambiguous-200" +regex = '((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$' +input = "CIMG2341" +captures = [ + [[0, 8], [0, 4], [], [0, 4], [4, 8]], +] + +# See: https://github.com/rust-lang/regex/issues/303 +[[tests]] +name = "negated-full-byte-range" +regex = '[^\x00-\xFF]' +input = "" +matches = [] +compiles = false +unicode = false +utf8 = false + +# See: https://github.com/rust-lang/regex/issues/321 +[[tests]] +name = "strange-anchor-non-complete-prefix" +regex = 'a^{2}' +input = "" +matches = [] + +# See: https://github.com/rust-lang/regex/issues/321 +[[tests]] +name = "strange-anchor-non-complete-suffix" +regex = '${2}a' +input = "" +matches = [] + +# See: https://github.com/rust-lang/regex/issues/334 +# See: https://github.com/rust-lang/regex/issues/557 +[[tests]] +name = "captures-after-dfa-premature-end-100" +regex = 'a(b*(X|$))?' +input = "abcbX" +captures = [ + [[0, 1], [], []], +] + +# See: https://github.com/rust-lang/regex/issues/334 +# See: https://github.com/rust-lang/regex/issues/557 +[[tests]] +name = "captures-after-dfa-premature-end-200" +regex = 'a(bc*(X|$))?' +input = "abcbX" +captures = [ + [[0, 1], [], []], +] + +# See: https://github.com/rust-lang/regex/issues/334 +# See: https://github.com/rust-lang/regex/issues/557 +[[tests]] +name = "captures-after-dfa-premature-end-300" +regex = '(aa$)?' 
+input = "aaz" +captures = [ + [[0, 0]], + [[1, 1]], + [[2, 2]], + [[3, 3]], +] + +# See: https://github.com/rust-lang/regex/issues/437 +[[tests]] +name = "literal-panic" +regex = 'typename type\-parameter\-[0-9]+\-[0-9]+::.+' +input = "test" +matches = [] + +# See: https://github.com/rust-lang/regex/issues/527 +[[tests]] +name = "empty-flag-expr" +regex = '(((?x)))' +input = "" +matches = [[0, 0]] + +# See: https://github.com/rust-lang/regex/issues/533 +[[tests]] +name = "blank-matches-nothing-between-space-and-tab" +regex = '[[:blank:]]' +input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F' +match = false +unescape = true + +# See: https://github.com/rust-lang/regex/issues/533 +[[tests]] +name = "blank-matches-nothing-between-space-and-tab-inverted" +regex = '^[[:^blank:]]+$' +input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F' +match = true +unescape = true + +# See: https://github.com/rust-lang/regex/issues/555 +[[tests]] +name = "invalid-repetition" +regex = '(?m){1,1}' +input = "" +matches = [] +compiles = false + +# See: https://github.com/rust-lang/regex/issues/640 +[[tests]] +name = "flags-are-unset" +regex = '((?i)foo)|Bar' +input = "foo Foo bar Bar" +matches = [[0, 3], [4, 7], [12, 15]] + +# Note that 'Ј' is not 'j', but cyrillic Je +# https://en.wikipedia.org/wiki/Je_(Cyrillic) +# +# See: https://github.com/rust-lang/regex/issues/659 +[[tests]] +name = "empty-group-with-unicode" +regex = '()Ј01' +input = 'zЈ01' +matches = [[1, 5]] + +# See: https://github.com/rust-lang/regex/issues/579 +[[tests]] +name = "word-boundary-weird" +regex = '\b..\b' +input = "I have 12, he has 2!" +matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]] + +# See: https://github.com/rust-lang/regex/issues/579 +[[tests]] +name = "word-boundary-weird-ascii" +regex = '\b..\b' +input = "I have 12, he has 2!" +matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]] +unicode = false +utf8 = false + +# See: https://github.com/rust-lang/regex/issues/579 +[[tests]] +name = "word-boundary-weird-minimal-ascii" +regex = '\b..\b' +input = "az,,b" +matches = [[0, 2], [2, 4]] +unicode = false +utf8 = false + +# See: https://github.com/BurntSushi/ripgrep/issues/1203 +[[tests]] +name = "reverse-suffix-100" +regex = '[0-4][0-4][0-4]000' +input = "153.230000" +matches = [[4, 10]] + +# See: https://github.com/BurntSushi/ripgrep/issues/1203 +[[tests]] +name = "reverse-suffix-200" +regex = '[0-9][0-9][0-9]000' +input = "153.230000\n" +matches = [[4, 10]] + +# See: https://github.com/BurntSushi/ripgrep/issues/1247 +[[tests]] +name = "stops" +regex = '\bs(?:[ab])' +input = 's\xE4' +matches = [] +unescape = true + +# See: https://github.com/BurntSushi/ripgrep/issues/1247 +[[tests]] +name = "stops-ascii" +regex = '(?-u:\b)s(?:[ab])' +input = 's\xE4' +matches = [] +unescape = true + +# There is no issue for this bug. +[[tests]] +name = "anchored-prefix-100" +regex = '^a[[:^space:]]' +input = "a " +matches = [] + +# There is no issue for this bug. +[[tests]] +name = "anchored-prefix-200" +regex = '^a[[:^space:]]' +input = "foo boo a" +matches = [] + +# There is no issue for this bug. +[[tests]] +name = "anchored-prefix-300" +regex = '^-[a-z]' +input = "r-f" +matches = [] + +# Tests that a possible Aho-Corasick optimization works correctly. It only +# kicks in when we have a lot of literals. By "works correctly," we mean that +# leftmost-first match semantics are properly respected. That is, samwise +# should match, not sam. 
+# +# There is no issue for this bug. +[[tests]] +name = "aho-corasick-100" +regex = 'samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z' +input = "samwise" +matches = [[0, 7]] diff --git a/vendor/regex-automata-0.2.0/tests/data/set.toml b/vendor/regex-automata-0.2.0/tests/data/set.toml new file mode 100644 index 000000000..e0eb0583e --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/set.toml @@ -0,0 +1,523 @@ +[[tests]] +name = "basic10" +regexes = ["a", "a"] +input = "a" +matches = [ + { id = 0, offsets = [0, 1] }, + { id = 1, offsets = [0, 1] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic10-leftmost-first" +regexes = ["a", "a"] +input = "a" +matches = [ + { id = 0, offsets = [0, 1] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "basic20" +regexes = ["a", "a"] +input = "ba" +matches = [ + { id = 0, offsets = [1, 2] }, + { id = 1, offsets = [1, 2] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic30" +regexes = ["a", "b"] +input = "a" +matches = [ + { id = 0, offsets = [0, 1] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic40" +regexes = ["a", "b"] +input = "b" +matches = [ + { id = 1, offsets = [0, 1] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic50" +regexes = ["a|b", "b|a"] +input = "b" +matches = [ + { id = 0, offsets = [0, 1] }, + { id = 1, offsets = [0, 1] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic60" +regexes = ["foo", "oo"] +input = "foo" +matches = [ + { id = 0, offsets = [0, 3] }, + { id = 1, offsets = [1, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic60-leftmost-first" +regexes = ["foo", "oo"] +input = "foo" +matches = [ + { id = 0, offsets = [0, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "basic61" +regexes = ["oo", "foo"] +input = "foo" +matches = [ + { id = 1, offsets = [0, 3] }, + { id = 0, offsets = [1, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic61-leftmost-first" +regexes = ["oo", "foo"] +input = "foo" +matches = [ + { id = 1, offsets = [0, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "basic70" +regexes = ["abcd", "bcd", "cd", "d"] +input = "abcd" +matches = [ + { id = 0, offsets = [0, 4] }, + { id = 1, offsets = [1, 4] }, + { id = 2, offsets = [2, 4] }, + { id = 3, offsets = [3, 4] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic71" +regexes = ["bcd", "cd", "d", "abcd"] +input = "abcd" +matches = [ + { id = 3, offsets = [0, 4] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "basic80" +regexes = ["^foo", "bar$"] +input = "foo" +matches = [ + { id = 0, offsets = [0, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic81" +regexes = ["^foo", "bar$"] +input = "foo bar" +matches = [ + { id = 0, offsets = [0, 3] }, + { id = 1, offsets = [4, 7] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic82" +regexes = ["^foo", "bar$"] +input = "bar" +matches = [ + { id = 1, offsets = [0, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic90" +regexes = ["[a-z]+$", "foo"] +input = "01234 foo" +matches = [ + { id = 0, offsets = [6, 9] }, + { id = 1, offsets = [6, 9] }, +] 
+match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic91" +regexes = ["[a-z]+$", "foo"] +input = "foo 01234" +matches = [ + { id = 1, offsets = [0, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic100" +regexes = [".*?", "a"] +input = "zzza" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [0, 1] }, + { id = 0, offsets = [0, 2] }, + { id = 0, offsets = [0, 3] }, + { id = 0, offsets = [0, 4] }, + { id = 1, offsets = [3, 4] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic101" +regexes = [".*", "a"] +input = "zzza" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [0, 1] }, + { id = 0, offsets = [0, 2] }, + { id = 0, offsets = [0, 3] }, + { id = 0, offsets = [0, 4] }, + { id = 1, offsets = [3, 4] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic102" +regexes = [".*", "a"] +input = "zzz" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [0, 1] }, + { id = 0, offsets = [0, 2] }, + { id = 0, offsets = [0, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic110" +regexes = ['\ba\b'] +input = "hello a bye" +matches = [ + { id = 0, offsets = [6, 7] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic111" +regexes = ['\ba\b', '\be\b'] +input = "hello a bye e" +matches = [ + { id = 0, offsets = [6, 7] }, + { id = 1, offsets = [12, 13] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic120" +regexes = ["a"] +input = "a" +matches = [ + { id = 0, offsets = [0, 1] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic121" +regexes = [".*a"] +input = "a" +matches = [ + { id = 0, offsets = [0, 1] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic122" +regexes = [".*a", "β"] +input = "β" +matches = [ + { id = 1, offsets = [0, 2] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "basic130" +regexes = ["ab", "b"] +input = "ba" +matches = [ + { id = 1, offsets = [0, 1] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty10" +regexes = ["", "a"] +input = "abc" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 1, offsets = [0, 1] }, + { id = 0, offsets = [1, 1] }, + { id = 0, offsets = [2, 2] }, + { id = 0, offsets = [3, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty10-leftmost-first" +regexes = ["", "a"] +input = "abc" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [1, 1] }, + { id = 0, offsets = [2, 2] }, + { id = 0, offsets = [3, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "empty11" +regexes = ["a", ""] +input = "abc" +matches = [ + { id = 1, offsets = [0, 0] }, + { id = 0, offsets = [0, 1] }, + { id = 1, offsets = [1, 1] }, + { id = 1, offsets = [2, 2] }, + { id = 1, offsets = [3, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty11-leftmost-first" +regexes = ["a", ""] +input = "abc" +matches = [ + { id = 0, offsets = [0, 1] }, + { id = 1, offsets = [2, 2] }, + { id = 1, offsets = [3, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "empty20" +regexes = ["", "b"] +input = "abc" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [1, 1] }, + { id = 1, offsets = [1, 2] }, + { id = 0, offsets = [2, 2] }, + { id = 0, offsets = [3, 3] }, +] 
+match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty20-leftmost-first" +regexes = ["", "b"] +input = "abc" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [1, 1] }, + { id = 0, offsets = [2, 2] }, + { id = 0, offsets = [3, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "empty21" +regexes = ["b", ""] +input = "abc" +matches = [ + { id = 1, offsets = [0, 0] }, + { id = 1, offsets = [1, 1] }, + { id = 0, offsets = [1, 2] }, + { id = 1, offsets = [2, 2] }, + { id = 1, offsets = [3, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty21-leftmost-first" +regexes = ["b", ""] +input = "abc" +matches = [ + { id = 1, offsets = [0, 0] }, + { id = 0, offsets = [1, 2] }, + { id = 1, offsets = [3, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "empty22" +regexes = ["(?:)", "b"] +input = "abc" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [1, 1] }, + { id = 1, offsets = [1, 2] }, + { id = 0, offsets = [2, 2] }, + { id = 0, offsets = [3, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty23" +regexes = ["b", "(?:)"] +input = "abc" +matches = [ + { id = 1, offsets = [0, 0] }, + { id = 1, offsets = [1, 1] }, + { id = 0, offsets = [1, 2] }, + { id = 1, offsets = [2, 2] }, + { id = 1, offsets = [3, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty30" +regexes = ["", "z"] +input = "abc" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [1, 1] }, + { id = 0, offsets = [2, 2] }, + { id = 0, offsets = [3, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty30-leftmost-first" +regexes = ["", "z"] +input = "abc" +matches = [ + { id = 0, offsets = [0, 0] }, + { id = 0, offsets = [1, 1] }, + { id = 0, offsets = [2, 2] }, + { id = 0, offsets = [3, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "empty31" +regexes = ["z", ""] +input = "abc" +matches = [ + { id = 1, offsets = [0, 0] }, + { id = 1, offsets = [1, 1] }, + { id = 1, offsets = [2, 2] }, + { id = 1, offsets = [3, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty31-leftmost-first" +regexes = ["z", ""] +input = "abc" +matches = [ + { id = 1, offsets = [0, 0] }, + { id = 1, offsets = [1, 1] }, + { id = 1, offsets = [2, 2] }, + { id = 1, offsets = [3, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "empty40" +regexes = ["c(?:)", "b"] +input = "abc" +matches = [ + { id = 1, offsets = [1, 2] }, + { id = 0, offsets = [2, 3] }, +] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "empty40-leftmost-first" +regexes = ["c(?:)", "b"] +input = "abc" +matches = [ + { id = 1, offsets = [1, 2] }, + { id = 0, offsets = [2, 3] }, +] +match_kind = "leftmost-first" +search_kind = "leftmost" + +[[tests]] +name = "nomatch10" +regexes = ["a", "a"] +input = "b" +matches = [] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "nomatch20" +regexes = ["^foo", "bar$"] +input = "bar foo" +matches = [] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "nomatch30" +regexes = [] +input = "a" +matches = [] +match_kind = "all" +search_kind = "overlapping" + +[[tests]] +name = "nomatch40" +regexes = ["^rooted$", '\.log$'] +input = "notrooted" +matches = [] +match_kind = "all" +search_kind = "overlapping" diff --git 
a/vendor/regex-automata-0.2.0/tests/data/unicode.toml b/vendor/regex-automata-0.2.0/tests/data/unicode.toml new file mode 100644 index 000000000..016bbfd9b --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/unicode.toml @@ -0,0 +1,514 @@ +# Basic Unicode literal support. +[[tests]] +name = "literal1" +regex = '☃' +input = "☃" +matches = [[0, 3]] + +[[tests]] +name = "literal2" +regex = '☃+' +input = "☃" +matches = [[0, 3]] + +[[tests]] +name = "literal3" +regex = '(?i)☃+' +input = "☃" +matches = [[0, 3]] + +[[tests]] +name = "literal4" +regex = '(?i)Δ' +input = "δ" +matches = [[0, 2]] + +# Unicode word boundaries. +[[tests]] +name = "wb-100" +regex = '\d\b' +input = "6δ" +matches = [] + +[[tests]] +name = "wb-200" +regex = '\d\b' +input = "6 " +matches = [[0, 1]] + +[[tests]] +name = "wb-300" +regex = '\d\B' +input = "6δ" +matches = [[0, 1]] + +[[tests]] +name = "wb-400" +regex = '\d\B' +input = "6 " +matches = [] + +# Unicode character class support. +[[tests]] +name = "class1" +regex = '[☃Ⅰ]+' +input = "☃" +matches = [[0, 3]] + +[[tests]] +name = "class2" +regex = '\pN' +input = "Ⅰ" +matches = [[0, 3]] + +[[tests]] +name = "class3" +regex = '\pN+' +input = "Ⅰ1Ⅱ2" +matches = [[0, 8]] + +[[tests]] +name = "class4" +regex = '\PN+' +input = "abⅠ" +matches = [[0, 2]] + +[[tests]] +name = "class5" +regex = '[\PN]+' +input = "abⅠ" +matches = [[0, 2]] + +[[tests]] +name = "class6" +regex = '[^\PN]+' +input = "abⅠ" +matches = [[2, 5]] + +[[tests]] +name = "class7" +regex = '\p{Lu}+' +input = "ΛΘΓΔα" +matches = [[0, 8]] + +[[tests]] +name = "class8" +regex = '(?i)\p{Lu}+' +input = "ΛΘΓΔα" +matches = [[0, 10]] + +[[tests]] +name = "class9" +regex = '\pL+' +input = "ΛΘΓΔα" +matches = [[0, 10]] + +[[tests]] +name = "class10" +regex = '\p{Ll}+' +input = "ΛΘΓΔα" +matches = [[8, 10]] + +# Unicode aware "Perl" character classes. +[[tests]] +name = "perl1" +regex = '\w+' +input = "dδd" +matches = [[0, 4]] + +[[tests]] +name = "perl2" +regex = '\w+' +input = "⥡" +matches = [] + +[[tests]] +name = "perl3" +regex = '\W+' +input = "⥡" +matches = [[0, 3]] + +[[tests]] +name = "perl4" +regex = '\d+' +input = "1२३9" +matches = [[0, 8]] + +[[tests]] +name = "perl5" +regex = '\d+' +input = "Ⅱ" +matches = [] + +[[tests]] +name = "perl6" +regex = '\D+' +input = "Ⅱ" +matches = [[0, 3]] + +[[tests]] +name = "perl7" +regex = '\s+' +input = " " +matches = [[0, 3]] + +[[tests]] +name = "perl8" +regex = '\s+' +input = "☃" +matches = [] + +[[tests]] +name = "perl9" +regex = '\S+' +input = "☃" +matches = [[0, 3]] + +# Specific tests for Unicode general category classes. 
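One detail worth keeping in mind for all of the tests in this file: `matches` entries are byte offsets into the UTF-8 encoding of the input, not character indices, which is why a match of a single codepoint can span 2, 3, or 4 bytes. A small standalone illustration (plain std, no extra dependencies assumed):

```rust
// UTF-8 byte lengths of single codepoints, matching the span widths in the
// tests around this point (literal4 reports [[0, 2]], literal1 [[0, 3]],
// and class-gencat7 [[0, 4]]).
fn main() {
    assert_eq!("Δ".len(), 2); // U+0394, 2 bytes in UTF-8
    assert_eq!("☃".len(), 3); // U+2603, 3 bytes
    assert_eq!("𑓙".len(), 4); // the class-gencat7 input below, 4 bytes
}
```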
+[[tests]] +name = "class-gencat1" +regex = '\p{Cased_Letter}' +input = "A" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat2" +regex = '\p{Close_Punctuation}' +input = "❯" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat3" +regex = '\p{Connector_Punctuation}' +input = "⁀" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat4" +regex = '\p{Control}' +input = "\u009F" +matches = [[0, 2]] + +[[tests]] +name = "class-gencat5" +regex = '\p{Currency_Symbol}' +input = "£" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat6" +regex = '\p{Dash_Punctuation}' +input = "〰" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat7" +regex = '\p{Decimal_Number}' +input = "𑓙" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat8" +regex = '\p{Enclosing_Mark}' +input = "\uA672" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat9" +regex = '\p{Final_Punctuation}' +input = "⸡" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat10" +regex = '\p{Format}' +input = "\U000E007F" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat11" +regex = '\p{Initial_Punctuation}' +input = "⸜" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat12" +regex = '\p{Letter}' +input = "Έ" +matches = [[0, 2]] + +[[tests]] +name = "class-gencat13" +regex = '\p{Letter_Number}' +input = "ↂ" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat14" +regex = '\p{Line_Separator}' +input = "\u2028" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat15" +regex = '\p{Lowercase_Letter}' +input = "ϛ" +matches = [[0, 2]] + +[[tests]] +name = "class-gencat16" +regex = '\p{Mark}' +input = "\U000E01EF" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat17" +regex = '\p{Math}' +input = "⋿" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat18" +regex = '\p{Modifier_Letter}' +input = "𖭃" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat19" +regex = '\p{Modifier_Symbol}' +input = "🏿" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat20" +regex = '\p{Nonspacing_Mark}' +input = "\U0001E94A" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat21" +regex = '\p{Number}' +input = "⓿" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat22" +regex = '\p{Open_Punctuation}' +input = "⦅" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat23" +regex = '\p{Other}' +input = "\u0BC9" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat24" +regex = '\p{Other_Letter}' +input = "ꓷ" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat25" +regex = '\p{Other_Number}' +input = "㉏" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat26" +regex = '\p{Other_Punctuation}' +input = "𞥞" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat27" +regex = '\p{Other_Symbol}' +input = "⅌" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat28" +regex = '\p{Paragraph_Separator}' +input = "\u2029" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat29" +regex = '\p{Private_Use}' +input = "\U0010FFFD" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat30" +regex = '\p{Punctuation}' +input = "𑁍" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat31" +regex = '\p{Separator}' +input = "\u3000" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat32" +regex = '\p{Space_Separator}' +input = "\u205F" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat33" +regex = '\p{Spacing_Mark}' +input = "\U00016F7E" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat34" +regex = '\p{Symbol}' +input = "⯈" +matches = [[0, 3]] + +[[tests]] +name = "class-gencat35" +regex = '\p{Titlecase_Letter}' +input = "ῼ" +matches = [[0, 3]] + 
+[[tests]] +name = "class-gencat36" +regex = '\p{Unassigned}' +input = "\U0010FFFF" +matches = [[0, 4]] + +[[tests]] +name = "class-gencat37" +regex = '\p{Uppercase_Letter}' +input = "Ꝋ" +matches = [[0, 3]] + + +# Tests for Unicode emoji properties. +[[tests]] +name = "class-emoji1" +regex = '\p{Emoji}' +input = "\u23E9" +matches = [[0, 3]] + +[[tests]] +name = "class-emoji2" +regex = '\p{emoji}' +input = "\U0001F21A" +matches = [[0, 4]] + +[[tests]] +name = "class-emoji3" +regex = '\p{extendedpictographic}' +input = "\U0001FA6E" +matches = [[0, 4]] + +[[tests]] +name = "class-emoji4" +regex = '\p{extendedpictographic}' +input = "\U0001FFFD" +matches = [[0, 4]] + + +# Tests for Unicode grapheme cluster properties. +[[tests]] +name = "class-gcb1" +regex = '\p{grapheme_cluster_break=prepend}' +input = "\U00011D46" +matches = [[0, 4]] + +[[tests]] +name = "class-gcb2" +regex = '\p{gcb=regional_indicator}' +input = "\U0001F1E6" +matches = [[0, 4]] + +[[tests]] +name = "class-gcb3" +regex = '\p{gcb=ri}' +input = "\U0001F1E7" +matches = [[0, 4]] + +[[tests]] +name = "class-gcb4" +regex = '\p{regionalindicator}' +input = "\U0001F1FF" +matches = [[0, 4]] + +[[tests]] +name = "class-gcb5" +regex = '\p{gcb=lvt}' +input = "\uC989" +matches = [[0, 3]] + +[[tests]] +name = "class-gcb6" +regex = '\p{gcb=zwj}' +input = "\u200D" +matches = [[0, 3]] + +# Tests for Unicode word boundary properties. +[[tests]] +name = "class-word-break1" +regex = '\p{word_break=Hebrew_Letter}' +input = "\uFB46" +matches = [[0, 3]] + +[[tests]] +name = "class-word-break2" +regex = '\p{wb=hebrewletter}' +input = "\uFB46" +matches = [[0, 3]] + +[[tests]] +name = "class-word-break3" +regex = '\p{wb=ExtendNumLet}' +input = "\uFF3F" +matches = [[0, 3]] + +[[tests]] +name = "class-word-break4" +regex = '\p{wb=WSegSpace}' +input = "\u3000" +matches = [[0, 3]] + +[[tests]] +name = "class-word-break5" +regex = '\p{wb=numeric}' +input = "\U0001E950" +matches = [[0, 4]] + +# Tests for Unicode sentence boundary properties. +[[tests]] +name = "class-sentence-break1" +regex = '\p{sentence_break=Lower}' +input = "\u0469" +matches = [[0, 2]] + +[[tests]] +name = "class-sentence-break2" +regex = '\p{sb=lower}' +input = "\u0469" +matches = [[0, 2]] + +[[tests]] +name = "class-sentence-break3" +regex = '\p{sb=Close}' +input = "\uFF60" +matches = [[0, 3]] + +[[tests]] +name = "class-sentence-break4" +regex = '\p{sb=Close}' +input = "\U0001F677" +matches = [[0, 4]] + +[[tests]] +name = "class-sentence-break5" +regex = '\p{sb=SContinue}' +input = "\uFF64" +matches = [[0, 3]] diff --git a/vendor/regex-automata-0.2.0/tests/data/word-boundary.toml b/vendor/regex-automata-0.2.0/tests/data/word-boundary.toml new file mode 100644 index 000000000..e84b25c2a --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/data/word-boundary.toml @@ -0,0 +1,771 @@ +# Some of these are cribbed from RE2's test suite. + +# These test \b. Below are tests for \B. 
+[[tests]] +name = "wb1" +regex = '\b' +input = "" +matches = [] +unicode = false + +[[tests]] +name = "wb2" +regex = '\b' +input = "a" +matches = [[0, 0], [1, 1]] +unicode = false + +[[tests]] +name = "wb3" +regex = '\b' +input = "ab" +matches = [[0, 0], [2, 2]] +unicode = false + +[[tests]] +name = "wb4" +regex = '^\b' +input = "ab" +matches = [[0, 0]] +unicode = false + +[[tests]] +name = "wb5" +regex = '\b$' +input = "ab" +matches = [[2, 2]] +unicode = false + +[[tests]] +name = "wb6" +regex = '^\b$' +input = "ab" +matches = [] +unicode = false + +[[tests]] +name = "wb7" +regex = '\bbar\b' +input = "nobar bar foo bar" +matches = [[6, 9], [14, 17]] +unicode = false + +[[tests]] +name = "wb8" +regex = 'a\b' +input = "faoa x" +matches = [[3, 4]] +unicode = false + +[[tests]] +name = "wb9" +regex = '\bbar' +input = "bar x" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb10" +regex = '\bbar' +input = "foo\nbar x" +matches = [[4, 7]] +unicode = false + +[[tests]] +name = "wb11" +regex = 'bar\b' +input = "foobar" +matches = [[3, 6]] +unicode = false + +[[tests]] +name = "wb12" +regex = 'bar\b' +input = "foobar\nxxx" +matches = [[3, 6]] +unicode = false + +[[tests]] +name = "wb13" +regex = '(foo|bar|[A-Z])\b' +input = "foo" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb14" +regex = '(foo|bar|[A-Z])\b' +input = "foo\n" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb15" +regex = '\b(foo|bar|[A-Z])' +input = "foo" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb16" +regex = '\b(foo|bar|[A-Z])\b' +input = "X" +matches = [[0, 1]] +unicode = false + +[[tests]] +name = "wb17" +regex = '\b(foo|bar|[A-Z])\b' +input = "XY" +matches = [] +unicode = false + +[[tests]] +name = "wb18" +regex = '\b(foo|bar|[A-Z])\b' +input = "bar" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb19" +regex = '\b(foo|bar|[A-Z])\b' +input = "foo" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb20" +regex = '\b(foo|bar|[A-Z])\b' +input = "foo\n" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb21" +regex = '\b(foo|bar|[A-Z])\b' +input = "ffoo bbar N x" +matches = [[10, 11]] +unicode = false + +[[tests]] +name = "wb22" +regex = '\b(fo|foo)\b' +input = "fo" +matches = [[0, 2]] +unicode = false + +[[tests]] +name = "wb23" +regex = '\b(fo|foo)\b' +input = "foo" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb24" +regex = '\b\b' +input = "" +matches = [] +unicode = false + +[[tests]] +name = "wb25" +regex = '\b\b' +input = "a" +matches = [[0, 0], [1, 1]] +unicode = false + +[[tests]] +name = "wb26" +regex = '\b$' +input = "" +matches = [] +unicode = false + +[[tests]] +name = "wb27" +regex = '\b$' +input = "x" +matches = [[1, 1]] +unicode = false + +[[tests]] +name = "wb28" +regex = '\b$' +input = "y x" +matches = [[3, 3]] +unicode = false + +[[tests]] +name = "wb29" +regex = '(?-u:\b).$' +input = "x" +matches = [[0, 1]] + +[[tests]] +name = "wb30" +regex = '^\b(fo|foo)\b' +input = "fo" +matches = [[0, 2]] +unicode = false + +[[tests]] +name = "wb31" +regex = '^\b(fo|foo)\b' +input = "foo" +matches = [[0, 3]] +unicode = false + +[[tests]] +name = "wb32" +regex = '^\b$' +input = "" +matches = [] +unicode = false + +[[tests]] +name = "wb33" +regex = '^\b$' +input = "x" +matches = [] +unicode = false + +[[tests]] +name = "wb34" +regex = '^(?-u:\b).$' +input = "x" +matches = [[0, 1]] + +[[tests]] +name = "wb35" +regex = '^(?-u:\b).(?-u:\b)$' +input = "x" +matches = [[0, 1]] + +[[tests]] +name = "wb36" +regex = '^^^^^\b$$$$$' +input = 
"" +matches = [] +unicode = false + +[[tests]] +name = "wb37" +regex = '^^^^^(?-u:\b).$$$$$' +input = "x" +matches = [[0, 1]] + +[[tests]] +name = "wb38" +regex = '^^^^^\b$$$$$' +input = "x" +matches = [] +unicode = false + +[[tests]] +name = "wb39" +regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$' +input = "x" +matches = [[0, 1]] + +[[tests]] +name = "wb40" +regex = '(?-u:\b).+(?-u:\b)' +input = "$$abc$$" +matches = [[2, 5]] + +[[tests]] +name = "wb41" +regex = '\b' +input = "a b c" +matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] +unicode = false + +[[tests]] +name = "wb42" +regex = '\bfoo\b' +input = "zzz foo zzz" +matches = [[4, 7]] +unicode = false + +[[tests]] +name = "wb43" +regex = '\b^' +input = "ab" +matches = [[0, 0]] +unicode = false + +[[tests]] +name = "wb44" +regex = '$\b' +input = "ab" +matches = [[2, 2]] +unicode = false + + +# Tests for \B. Note that \B is not allowed if UTF-8 mode is enabled, so we +# have to disable it for most of these tests. This is because \B can match at +# non-UTF-8 boundaries. +[[tests]] +name = "nb1" +regex = '\Bfoo\B' +input = "n foo xfoox that" +matches = [[7, 10]] +unicode = false +utf8 = false + +[[tests]] +name = "nb2" +regex = 'a\B' +input = "faoa x" +matches = [[1, 2]] +unicode = false +utf8 = false + +[[tests]] +name = "nb3" +regex = '\Bbar' +input = "bar x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb4" +regex = '\Bbar' +input = "foo\nbar x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb5" +regex = 'bar\B' +input = "foobar" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb6" +regex = 'bar\B' +input = "foobar\nxxx" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb7" +regex = '(foo|bar|[A-Z])\B' +input = "foox" +matches = [[0, 3]] +unicode = false +utf8 = false + +[[tests]] +name = "nb8" +regex = '(foo|bar|[A-Z])\B' +input = "foo\n" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb9" +regex = '\B' +input = "" +matches = [[0, 0]] +unicode = false +utf8 = false + +[[tests]] +name = "nb10" +regex = '\B' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb11" +regex = '\B(foo|bar|[A-Z])' +input = "foo" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb12" +regex = '\B(foo|bar|[A-Z])\B' +input = "xXy" +matches = [[1, 2]] +unicode = false +utf8 = false + +[[tests]] +name = "nb13" +regex = '\B(foo|bar|[A-Z])\B' +input = "XY" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb14" +regex = '\B(foo|bar|[A-Z])\B' +input = "XYZ" +matches = [[1, 2]] +unicode = false +utf8 = false + +[[tests]] +name = "nb15" +regex = '\B(foo|bar|[A-Z])\B' +input = "abara" +matches = [[1, 4]] +unicode = false +utf8 = false + +[[tests]] +name = "nb16" +regex = '\B(foo|bar|[A-Z])\B' +input = "xfoo_" +matches = [[1, 4]] +unicode = false +utf8 = false + +[[tests]] +name = "nb17" +regex = '\B(foo|bar|[A-Z])\B' +input = "xfoo\n" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb18" +regex = '\B(foo|bar|[A-Z])\B' +input = "foo bar vNX" +matches = [[9, 10]] +unicode = false +utf8 = false + +[[tests]] +name = "nb19" +regex = '\B(fo|foo)\B' +input = "xfoo" +matches = [[1, 3]] +unicode = false +utf8 = false + +[[tests]] +name = "nb20" +regex = '\B(foo|fo)\B' +input = "xfooo" +matches = [[1, 4]] +unicode = false +utf8 = false + +[[tests]] +name = "nb21" +regex = '\B\B' +input = "" +matches = [[0, 0]] +unicode = false +utf8 = false + +[[tests]] +name = "nb22" +regex = '\B\B' +input = 
"x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb23" +regex = '\B$' +input = "" +matches = [[0, 0]] +unicode = false +utf8 = false + +[[tests]] +name = "nb24" +regex = '\B$' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb25" +regex = '\B$' +input = "y x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb26" +regex = '\B.$' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb27" +regex = '^\B(fo|foo)\B' +input = "fo" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb28" +regex = '^\B(fo|foo)\B' +input = "fo" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb29" +regex = '^\B' +input = "" +matches = [[0, 0]] +unicode = false +utf8 = false + +[[tests]] +name = "nb30" +regex = '^\B' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb31" +regex = '^\B\B' +input = "" +matches = [[0, 0]] +unicode = false +utf8 = false + +[[tests]] +name = "nb32" +regex = '^\B\B' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb33" +regex = '^\B$' +input = "" +matches = [[0, 0]] +unicode = false +utf8 = false + +[[tests]] +name = "nb34" +regex = '^\B$' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb35" +regex = '^\B.$' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb36" +regex = '^\B.\B$' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb37" +regex = '^^^^^\B$$$$$' +input = "" +matches = [[0, 0]] +unicode = false +utf8 = false + +[[tests]] +name = "nb38" +regex = '^^^^^\B.$$$$$' +input = "x" +matches = [] +unicode = false +utf8 = false + +[[tests]] +name = "nb39" +regex = '^^^^^\B$$$$$' +input = "x" +matches = [] +unicode = false +utf8 = false + + +# unicode1* and unicode2* work for both Unicode and ASCII because all matches +# are reported as byte offsets, and « and » do not correspond to word +# boundaries at either the character or byte level. +[[tests]] +name = "unicode1" +regex = '\bx\b' +input = "«x" +matches = [[2, 3]] + +[[tests]] +name = "unicode1-only-ascii" +regex = '\bx\b' +input = "«x" +matches = [[2, 3]] +unicode = false + +[[tests]] +name = "unicode2" +regex = '\bx\b' +input = "x»" +matches = [[0, 1]] + +[[tests]] +name = "unicode2-only-ascii" +regex = '\bx\b' +input = "x»" +matches = [[0, 1]] +unicode = false + +# ASCII word boundaries are completely oblivious to Unicode characters, so +# even though β is a character, an ASCII \b treats it as a word boundary +# when it is adjacent to another ASCII character. (The ASCII \b only looks +# at the leading byte of β.) For Unicode \b, the tests are precisely inverted. +[[tests]] +name = "unicode3" +regex = '\bx\b' +input = 'áxβ' +matches = [] + +[[tests]] +name = "unicode3-only-ascii" +regex = '\bx\b' +input = 'áxβ' +matches = [[2, 3]] +unicode = false + +[[tests]] +name = "unicode4" +regex = '\Bx\B' +input = 'áxβ' +matches = [[2, 3]] + +[[tests]] +name = "unicode4-only-ascii" +regex = '\Bx\B' +input = 'áxβ' +matches = [] +unicode = false +utf8 = false + +# The same as above, but with \b instead of \B as a sanity check. 
+[[tests]] +name = "unicode5" +regex = '\b' +input = "0\U0007EF5E" +matches = [[0, 0], [1, 1]] + +[[tests]] +name = "unicode5-only-ascii" +regex = '\b' +input = "0\U0007EF5E" +matches = [[0, 0], [1, 1]] +unicode = false +utf8 = false + +[[tests]] +name = "unicode5-noutf8" +regex = '\b' +input = '0\xFF\xFF\xFF\xFF' +matches = [[0, 0], [1, 1]] +unescape = true +utf8 = false + +[[tests]] +name = "unicode5-noutf8-only-ascii" +regex = '\b' +input = '0\xFF\xFF\xFF\xFF' +matches = [[0, 0], [1, 1]] +unescape = true +unicode = false +utf8 = false + +# Weird special case to ensure that ASCII \B treats each individual code unit +# as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary +# codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the +# \w character class.) +[[tests]] +name = "unicode5-not" +regex = '\B' +input = "0\U0007EF5E" +matches = [[5, 5]] + +[[tests]] +name = "unicode5-not-only-ascii" +regex = '\B' +input = "0\U0007EF5E" +matches = [[2, 2], [3, 3], [4, 4], [5, 5]] +unicode = false +utf8 = false + +# This gets no matches since \B only matches in the presence of valid UTF-8 +# when Unicode is enabled, even when UTF-8 mode is disabled. +[[tests]] +name = "unicode5-not-noutf8" +regex = '\B' +input = '0\xFF\xFF\xFF\xFF' +matches = [] +unescape = true +utf8 = false + +# But this DOES get matches since \B in ASCII mode only looks at individual +# bytes. +[[tests]] +name = "unicode5-not-noutf8-only-ascii" +regex = '\B' +input = '0\xFF\xFF\xFF\xFF' +matches = [[2, 2], [3, 3], [4, 4], [5, 5]] +unescape = true +unicode = false +utf8 = false + +# Some tests of no particular significance. +[[tests]] +name = "unicode6" +regex = '\b[0-9]+\b' +input = "foo 123 bar 456 quux 789" +matches = [[4, 7], [12, 15], [21, 24]] + +[[tests]] +name = "unicode7" +regex = '\b[0-9]+\b' +input = "foo 123 bar a456 quux 789" +matches = [[4, 7], [22, 25]] + +[[tests]] +name = "unicode8" +regex = '\b[0-9]+\b' +input = "foo 123 bar 456a quux 789" +matches = [[4, 7], [22, 25]] diff --git a/vendor/regex-automata-0.2.0/tests/dfa/api.rs b/vendor/regex-automata-0.2.0/tests/dfa/api.rs new file mode 100644 index 000000000..80d7d704c --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/dfa/api.rs @@ -0,0 +1,133 @@ +use std::error::Error; + +use regex_automata::{ + dfa::{dense, regex::Regex, Automaton, OverlappingState}, + nfa::thompson, + HalfMatch, MatchError, MatchKind, MultiMatch, +}; + +use crate::util::{BunkPrefilter, SubstringPrefilter}; + +// Tests that quit bytes in the forward direction work correctly. +#[test] +fn quit_fwd() -> Result<(), Box<dyn Error>> { + let dfa = dense::Builder::new() + .configure(dense::Config::new().quit(b'x', true)) + .build("[[:word:]]+$")?; + + assert_eq!( + dfa.find_earliest_fwd(b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_leftmost_fwd(b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_overlapping_fwd(b"abcxyz", &mut OverlappingState::start()), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + + Ok(()) +} + +// Tests that quit bytes in the reverse direction work correctly. 
+#[test] +fn quit_rev() -> Result<(), Box<dyn Error>> { + let dfa = dense::Builder::new() + .configure(dense::Config::new().quit(b'x', true)) + .thompson(thompson::Config::new().reverse(true)) + .build("^[[:word:]]+")?; + + assert_eq!( + dfa.find_earliest_rev(b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_leftmost_rev(b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + + Ok(()) +} + +// Tests that if we heuristically enable Unicode word boundaries but then +// instruct that a non-ASCII byte should NOT be a quit byte, then the builder +// will panic. +#[test] +#[should_panic] +fn quit_panics() { + dense::Config::new().unicode_word_boundary(true).quit(b'\xFF', false); +} + +// Tests that if we attempt an overlapping search using a regex without a +// reverse DFA compiled with 'starts_for_each_pattern', then we get a panic. +#[test] +#[should_panic] +fn incorrect_config_overlapping_search_panics() { + let forward = dense::DFA::new(r"abca").unwrap(); + let reverse = dense::Builder::new() + .configure( + dense::Config::new() + .anchored(true) + .match_kind(MatchKind::All) + .starts_for_each_pattern(false), + ) + .thompson(thompson::Config::new().reverse(true)) + .build(r"abca") + .unwrap(); + + let re = Regex::builder().build_from_dfas(forward, reverse); + let haystack = "bar abcabcabca abca foo".as_bytes(); + re.find_overlapping(haystack, &mut OverlappingState::start()); +} + +// This tests an interesting case where even if the Unicode word boundary option +// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode +// word boundaries to be enabled. +#[test] +fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { + let mut config = dense::Config::new(); + for b in 0x80..=0xFF { + config = config.quit(b, true); + } + let dfa = dense::Builder::new().configure(config).build(r"\b")?; + let expected = HalfMatch::must(0, 1); + assert_eq!(dfa.find_leftmost_fwd(b" a"), Ok(Some(expected))); + Ok(()) +} + +// Tests that we can provide a prefilter to a Regex, and the search reports +// correct results. +#[test] +fn prefilter_works() -> Result<(), Box<dyn Error>> { + let re = Regex::new(r"a[0-9]+") + .unwrap() + .with_prefilter(SubstringPrefilter::new("a")); + let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; + let matches: Vec<(usize, usize)> = + re.find_leftmost_iter(text).map(|m| (m.start(), m.end())).collect(); + assert_eq!( + matches, + vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] + ); + Ok(()) +} + +// This test confirms that a prefilter is active by using a prefilter that +// reports false negatives. +#[test] +fn prefilter_is_active() -> Result<(), Box<dyn Error>> { + let text = b"za123"; + let re = Regex::new(r"a[0-9]+") + .unwrap() + .with_prefilter(SubstringPrefilter::new("a")); + assert_eq!(re.find_leftmost(b"za123"), Some(MultiMatch::must(0, 1, 5))); + assert_eq!(re.find_leftmost(b"a123"), Some(MultiMatch::must(0, 0, 4))); + let re = re.with_prefilter(BunkPrefilter::new()); + assert_eq!(re.find_leftmost(b"za123"), None); + // This checks that the prefilter is used when first starting the search, + // instead of waiting until at least one transition has occurred.
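+ // (If the prefilter were consulted only after at least one transition, + // the "a123" haystack below, which begins at a real match of 'a[0-9]+', + // would still be found; with BunkPrefilter honored from the start, the + // search must report nothing.)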
+ assert_eq!(re.find_leftmost(b"a123"), None); + Ok(()) +} diff --git a/vendor/regex-automata-0.2.0/tests/dfa/mod.rs b/vendor/regex-automata-0.2.0/tests/dfa/mod.rs new file mode 100644 index 000000000..f4299510c --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/dfa/mod.rs @@ -0,0 +1,2 @@ +mod api; +mod suite; diff --git a/vendor/regex-automata-0.2.0/tests/dfa/suite.rs b/vendor/regex-automata-0.2.0/tests/dfa/suite.rs new file mode 100644 index 000000000..426ae346d --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/dfa/suite.rs @@ -0,0 +1,280 @@ +use regex_automata::{ + dfa::{self, dense, regex::Regex, sparse, Automaton}, + nfa::thompson, + MatchKind, SyntaxConfig, +}; +use regex_syntax as syntax; + +use regex_test::{ + bstr::{BString, ByteSlice}, + CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, + SearchKind as TestSearchKind, TestResult, TestRunner, +}; + +use crate::{suite, Result}; + +/// Runs the test suite with the default configuration. +#[test] +fn unminimized_default() -> Result<()> { + let builder = Regex::builder(); + TestRunner::new()? + .test_iter(suite()?.iter(), dense_compiler(builder)) + .assert(); + Ok(()) +} + +/// Runs the test suite with byte classes disabled. +#[test] +fn unminimized_no_byte_class() -> Result<()> { + let mut builder = Regex::builder(); + builder.dense(dense::Config::new().byte_classes(false)); + + TestRunner::new()? + .test_iter(suite()?.iter(), dense_compiler(builder)) + .assert(); + Ok(()) +} + +/// Runs the test suite with NFA shrinking disabled. +#[test] +fn unminimized_no_nfa_shrink() -> Result<()> { + let mut builder = Regex::builder(); + builder.thompson(thompson::Config::new().shrink(false)); + + TestRunner::new()? + .test_iter(suite()?.iter(), dense_compiler(builder)) + .assert(); + Ok(()) +} + +/// Runs the test suite on a minimized DFA with an otherwise default +/// configuration. +#[test] +fn minimized_default() -> Result<()> { + let mut builder = Regex::builder(); + builder.dense(dense::Config::new().minimize(true)); + TestRunner::new()? + // These regexes tend to be too big. Minimization takes... forever. + .blacklist("expensive") + .test_iter(suite()?.iter(), dense_compiler(builder)) + .assert(); + Ok(()) +} + +/// Runs the test suite on a minimized DFA with byte classes disabled. +#[test] +fn minimized_no_byte_class() -> Result<()> { + let mut builder = Regex::builder(); + builder.dense(dense::Config::new().minimize(true).byte_classes(false)); + + TestRunner::new()? + // These regexes tend to be too big. Minimization takes... forever. + .blacklist("expensive") + .test_iter(suite()?.iter(), dense_compiler(builder)) + .assert(); + Ok(()) +} + +/// Runs the test suite on a sparse unminimized DFA. +#[test] +fn sparse_unminimized_default() -> Result<()> { + let builder = Regex::builder(); + TestRunner::new()? + .test_iter(suite()?.iter(), sparse_compiler(builder)) + .assert(); + Ok(()) +} + +/// Another basic sanity test that checks we can serialize and then deserialize +/// a regex, and that the resulting regex can be used for searching correctly. 
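+/// (Here "serialize" means dumping the DFA's raw transition tables with +/// 'to_bytes_native_endian' and reconstituting them with 'from_bytes'; both +/// the forward and reverse DFAs must survive the round trip for the regex to +/// search correctly.)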
+#[test] +fn serialization_unminimized_default() -> Result<()> { + let builder = Regex::builder(); + let my_compiler = |builder| { + compiler(builder, |builder, re| { + let builder = builder.clone(); + let (fwd_bytes, _) = re.forward().to_bytes_native_endian(); + let (rev_bytes, _) = re.reverse().to_bytes_native_endian(); + Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { + let fwd: dense::DFA<&[u32]> = + dense::DFA::from_bytes(&fwd_bytes).unwrap().0; + let rev: dense::DFA<&[u32]> = + dense::DFA::from_bytes(&rev_bytes).unwrap().0; + let re = builder.build_from_dfas(fwd, rev); + + run_test(&re, test) + })) + }) + }; + TestRunner::new()? + .test_iter(suite()?.iter(), my_compiler(builder)) + .assert(); + Ok(()) +} + +/// A basic sanity test that checks we can serialize and then deserialize a +/// regex using sparse DFAs, and that the resulting regex can be used for +/// searching correctly. +#[test] +fn sparse_serialization_unminimized_default() -> Result<()> { + let builder = Regex::builder(); + let my_compiler = |builder| { + compiler(builder, |builder, re| { + let builder = builder.clone(); + let fwd_bytes = re.forward().to_sparse()?.to_bytes_native_endian(); + let rev_bytes = re.reverse().to_sparse()?.to_bytes_native_endian(); + Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { + let fwd: sparse::DFA<&[u8]> = + sparse::DFA::from_bytes(&fwd_bytes).unwrap().0; + let rev: sparse::DFA<&[u8]> = + sparse::DFA::from_bytes(&rev_bytes).unwrap().0; + let re = builder.build_from_dfas(fwd, rev); + run_test(&re, test) + })) + }) + }; + TestRunner::new()? + .test_iter(suite()?.iter(), my_compiler(builder)) + .assert(); + Ok(()) +} + +fn dense_compiler( + builder: dfa::regex::Builder, +) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { + compiler(builder, |_, re| { + Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { + run_test(&re, test) + })) + }) +} + +fn sparse_compiler( + builder: dfa::regex::Builder, +) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { + compiler(builder, |builder, re| { + let fwd = re.forward().to_sparse()?; + let rev = re.reverse().to_sparse()?; + let re = builder.build_from_dfas(fwd, rev); + Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { + run_test(&re, test) + })) + }) +} + +fn compiler( + mut builder: dfa::regex::Builder, + mut create_matcher: impl FnMut( + &dfa::regex::Builder, + Regex, + ) -> Result<CompiledRegex>, +) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { + move |test, regexes| { + let regexes = regexes + .iter() + .map(|r| r.to_str().map(|s| s.to_string())) + .collect::<std::result::Result<Vec<String>, _>>()?; + + // Check if our regex contains things that aren't supported by DFAs. + // That is, Unicode word boundaries when searching non-ASCII text. + let mut thompson = thompson::Builder::new(); + thompson.configure(config_thompson(test)); + // TODO: Modify Hir to report facts like this, instead of needing to + // build an NFA to do it. + if let Ok(nfa) = thompson.build_many(&regexes) { + let non_ascii = test.input().iter().any(|&b| !b.is_ascii()); + if nfa.has_word_boundary_unicode() && non_ascii { + return Ok(CompiledRegex::skip()); + } + } + if !configure_regex_builder(test, &mut builder) { + return Ok(CompiledRegex::skip()); + } + create_matcher(&builder, builder.build_many(&regexes)?)
+ } +} + +fn run_test<A: Automaton>(re: &Regex<A>, test: &RegexTest) -> Vec<TestResult> { + let is_match = if re.is_match(test.input()) { + TestResult::matched() + } else { + TestResult::no_match() + }; + let is_match = is_match.name("is_match"); + + let find_matches = match test.search_kind() { + TestSearchKind::Earliest => { + let it = re + .find_earliest_iter(test.input()) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|m| Match { + id: m.pattern().as_usize(), + start: m.start(), + end: m.end(), + }); + TestResult::matches(it).name("find_earliest_iter") + } + TestSearchKind::Leftmost => { + let it = re + .find_leftmost_iter(test.input()) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|m| Match { + id: m.pattern().as_usize(), + start: m.start(), + end: m.end(), + }); + TestResult::matches(it).name("find_leftmost_iter") + } + TestSearchKind::Overlapping => { + let it = re + .find_overlapping_iter(test.input()) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|m| Match { + id: m.pattern().as_usize(), + start: m.start(), + end: m.end(), + }); + TestResult::matches(it).name("find_overlapping_iter") + } + }; + + vec![is_match, find_matches] +} + +/// Configures the given regex builder with all relevant settings on the given +/// regex test. +/// +/// If the regex test has a setting that is unsupported, then this returns +/// false (implying the test should be skipped). +fn configure_regex_builder( + test: &RegexTest, + builder: &mut dfa::regex::Builder, +) -> bool { + let match_kind = match test.match_kind() { + TestMatchKind::All => MatchKind::All, + TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst, + TestMatchKind::LeftmostLongest => return false, + }; + + let syntax_config = SyntaxConfig::new() + .case_insensitive(test.case_insensitive()) + .unicode(test.unicode()) + .utf8(test.utf8()); + let dense_config = dense::Config::new() + .anchored(test.anchored()) + .match_kind(match_kind) + .unicode_word_boundary(true); + let regex_config = Regex::config().utf8(test.utf8()); + + builder + .configure(regex_config) + .syntax(syntax_config) + .thompson(config_thompson(test)) + .dense(dense_config); + true +} + +/// Configuration of a Thompson NFA compiler from a regex test. +fn config_thompson(test: &RegexTest) -> thompson::Config { + thompson::Config::new().utf8(test.utf8()) +} diff --git a/vendor/regex-automata-0.2.0/tests/hybrid/api.rs b/vendor/regex-automata-0.2.0/tests/hybrid/api.rs new file mode 100644 index 000000000..9a834dbb8 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/hybrid/api.rs @@ -0,0 +1,195 @@ +use std::error::Error; + +use regex_automata::{ + hybrid::{ + dfa::{self, DFA}, + regex::Regex, + OverlappingState, + }, + nfa::thompson, + HalfMatch, MatchError, MatchKind, MultiMatch, +}; + +use crate::util::{BunkPrefilter, SubstringPrefilter}; + +// Tests that too many cache resets cause the lazy DFA to quit. +// +// We only test this on 64-bit because the test is gingerly crafted based on +// implementation details of cache sizes. It's not a great test because of +// that, but it does check some interesting properties around how positions are +// reported when a search "gives up." +#[test] +#[cfg(target_pointer_width = "64")] +fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> { + // This is a carefully chosen regex. The idea is to pick one that requires + // some decent number of states (hence the bounded repetition). 
But we + // specifically choose to create a class with an ASCII letter and a + // non-ASCII letter so that we can check that no new states are created + // once the cache is full. Namely, if we fill up the cache on a haystack + // of 'a's, then in order to match one 'β', a new state will need to be + // created since a 'β' is encoded with multiple bytes. Since there's no + // room for this state, the search should quit at the very first position. + let pattern = r"[aβ]{100}"; + let dfa = DFA::builder() + .configure( + // Configure it so that we have the minimum cache capacity + // possible. And that if any resets occur, the search quits. + DFA::config() + .skip_cache_capacity_check(true) + .cache_capacity(0) + .minimum_cache_clear_count(Some(0)), + ) + .build(pattern)?; + let mut cache = dfa.create_cache(); + + let haystack = "a".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 25 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone())); + assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone())); + assert_eq!( + dfa.find_overlapping_fwd( + &mut cache, + &haystack, + &mut OverlappingState::start() + ), + Err(err.clone()) + ); + + let haystack = "β".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 0 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + // no need to test that other find routines quit, since we did that above + + // OK, if we reset the cache, then we should be able to create more states + // and make more progress with searching for betas. + cache.reset(&dfa); + let err = MatchError::GaveUp { offset: 26 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + + // ... switching back to ASCII still makes progress since it just needs to + // set transitions on existing states! + let haystack = "a".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 13 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + + Ok(()) +} + +// Tests that quit bytes in the forward direction work correctly. +#[test] +fn quit_fwd() -> Result<(), Box<dyn Error>> { + let dfa = DFA::builder() + .configure(DFA::config().quit(b'x', true)) + .build("[[:word:]]+$")?; + let mut cache = dfa.create_cache(); + + assert_eq!( + dfa.find_earliest_fwd(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_leftmost_fwd(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_overlapping_fwd( + &mut cache, + b"abcxyz", + &mut OverlappingState::start() + ), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + + Ok(()) +} + +// Tests that quit bytes in the reverse direction work correctly. +#[test] +fn quit_rev() -> Result<(), Box<dyn Error>> { + let dfa = DFA::builder() + .configure(DFA::config().quit(b'x', true)) + .thompson(thompson::Config::new().reverse(true)) + .build("^[[:word:]]+")?; + let mut cache = dfa.create_cache(); + + assert_eq!( + dfa.find_earliest_rev(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_leftmost_rev(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + + Ok(()) +} + +// Tests that if we heuristically enable Unicode word boundaries but then +// instruct that a non-ASCII byte should NOT be a quit byte, then the builder +// will panic. 
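+// (The panic is deliberate: the lazy DFA supports Unicode word boundaries +// only heuristically, by treating every non-ASCII byte as a quit byte, so +// un-setting one of those quit bytes would silently permit incorrect +// matches.)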
+#[test] +#[should_panic] +fn quit_panics() { + DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); +} + +// This tests an interesting case where even if the Unicode word boundary option +// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode +// word boundaries to be enabled. +#[test] +fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { + let mut config = DFA::config(); + for b in 0x80..=0xFF { + config = config.quit(b, true); + } + let dfa = DFA::builder().configure(config).build(r"\b")?; + let mut cache = dfa.create_cache(); + let expected = HalfMatch::must(0, 1); + assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected))); + Ok(()) +} + +// Tests that we can provide a prefilter to a Regex, and the search reports +// correct results. +#[test] +fn prefilter_works() -> Result<(), Box<dyn Error>> { + let mut re = Regex::new(r"a[0-9]+").unwrap(); + re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); + let mut cache = re.create_cache(); + + let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; + let matches: Vec<(usize, usize)> = re + .find_leftmost_iter(&mut cache, text) + .map(|m| (m.start(), m.end())) + .collect(); + assert_eq!( + matches, + vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] + ); + Ok(()) +} + +// This test confirms that a prefilter is active by using a prefilter that +// reports false negatives. +#[test] +fn prefilter_is_active() -> Result<(), Box<dyn Error>> { + let text = b"za123"; + let mut re = Regex::new(r"a[0-9]+").unwrap(); + let mut cache = re.create_cache(); + + re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); + assert_eq!( + re.find_leftmost(&mut cache, b"za123"), + Some(MultiMatch::must(0, 1, 5)) + ); + assert_eq!( + re.find_leftmost(&mut cache, b"a123"), + Some(MultiMatch::must(0, 0, 4)) + ); + re.set_prefilter(Some(Box::new(BunkPrefilter::new()))); + assert_eq!(re.find_leftmost(&mut cache, b"za123"), None); + // This checks that the prefilter is used when first starting the search, + // instead of waiting until at least one transition has occurred. + assert_eq!(re.find_leftmost(&mut cache, b"a123"), None); + Ok(()) +} diff --git a/vendor/regex-automata-0.2.0/tests/hybrid/mod.rs b/vendor/regex-automata-0.2.0/tests/hybrid/mod.rs new file mode 100644 index 000000000..f4299510c --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/hybrid/mod.rs @@ -0,0 +1,2 @@ +mod api; +mod suite; diff --git a/vendor/regex-automata-0.2.0/tests/hybrid/suite.rs b/vendor/regex-automata-0.2.0/tests/hybrid/suite.rs new file mode 100644 index 000000000..d60570d84 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/hybrid/suite.rs @@ -0,0 +1,212 @@ +use regex_automata::{ + hybrid::{ + dfa::DFA, + regex::{self, Regex}, + }, + nfa::thompson, + MatchKind, SyntaxConfig, +}; +use regex_syntax as syntax; + +use regex_test::{ + bstr::{BString, ByteSlice}, + CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, + SearchKind as TestSearchKind, TestResult, TestRunner, +}; + +use crate::{suite, Result}; + +/// Tests the default configuration of the hybrid NFA/DFA. +#[test] +fn default() -> Result<()> { + let builder = Regex::builder(); + TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); + Ok(()) +} + +/// Tests the hybrid NFA/DFA with NFA shrinking disabled. +/// +/// This is actually the typical configuration one wants for a lazy DFA.
NFA +/// shrinking is mostly only advantageous when building a full DFA since it +/// can sharply decrease the amount of time determinization takes. But NFA +/// shrinking is itself otherwise fairly expensive. Since a lazy DFA has +/// no compilation time (other than for building the NFA of course) before +/// executing a search, it's usually worth it to forgo NFA shrinking. +#[test] +fn no_nfa_shrink() -> Result<()> { + let mut builder = Regex::builder(); + builder.thompson(thompson::Config::new().shrink(false)); + TestRunner::new()? + // Without NFA shrinking, this test blows the default cache capacity. + .blacklist("expensive/regression-many-repeat-no-stack-overflow") + .test_iter(suite()?.iter(), compiler(builder)) + .assert(); + Ok(()) +} + +/// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled. +#[test] +fn starts_for_each_pattern() -> Result<()> { + let mut builder = Regex::builder(); + builder.dfa(DFA::config().starts_for_each_pattern(true)); + TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); + Ok(()) +} + +/// Tests the hybrid NFA/DFA when byte classes are disabled. +/// +/// N.B. Disabling byte classes doesn't avoid any indirection at search time. +/// All it does is cause every byte value to be its own distinct equivalence +/// class. +#[test] +fn no_byte_classes() -> Result<()> { + let mut builder = Regex::builder(); + builder.dfa(DFA::config().byte_classes(false)); + TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); + Ok(()) +} + +/// Tests that hybrid NFA/DFA never clears its cache for any test with the +/// default capacity. +/// +/// N.B. If a regex suite test is added that causes the cache to be cleared, +/// then this should just skip that test. (Which can be done by calling the +/// 'blacklist' method on 'TestRunner'.) +#[test] +fn no_cache_clearing() -> Result<()> { + let mut builder = Regex::builder(); + builder.dfa(DFA::config().minimum_cache_clear_count(Some(0))); + TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); + Ok(()) +} + +/// Tests the hybrid NFA/DFA when the minimum cache capacity is set. +#[test] +fn min_cache_capacity() -> Result<()> { + let mut builder = Regex::builder(); + builder + .dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true)); + TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); + Ok(()) +} + +fn compiler( + mut builder: regex::Builder, +) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { + move |test, regexes| { + let regexes = regexes + .iter() + .map(|r| r.to_str().map(|s| s.to_string())) + .collect::<std::result::Result<Vec<String>, _>>()?; + + // Check if our regex contains things that aren't supported by DFAs. + // That is, Unicode word boundaries when searching non-ASCII text. 
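+ // (The DFA only supports Unicode \b heuristically, by quitting when it + // sees a non-ASCII byte; on a non-ASCII haystack such a search would + // give up rather than answer, so the harness builds a throwaway NFA just + // to detect that combination and skips those tests.)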
+ let mut thompson = thompson::Builder::new(); + thompson.syntax(config_syntax(test)).configure(config_thompson(test)); + if let Ok(nfa) = thompson.build_many(&regexes) { + let non_ascii = test.input().iter().any(|&b| !b.is_ascii()); + if nfa.has_word_boundary_unicode() && non_ascii { + return Ok(CompiledRegex::skip()); + } + } + if !configure_regex_builder(test, &mut builder) { + return Ok(CompiledRegex::skip()); + } + let re = builder.build_many(&regexes)?; + let mut cache = re.create_cache(); + Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { + run_test(&re, &mut cache, test) + })) + } +} + +fn run_test( + re: &Regex, + cache: &mut regex::Cache, + test: &RegexTest, +) -> Vec<TestResult> { + let is_match = if re.is_match(cache, test.input()) { + TestResult::matched() + } else { + TestResult::no_match() + }; + let is_match = is_match.name("is_match"); + + let find_matches = match test.search_kind() { + TestSearchKind::Earliest => { + let it = re + .find_earliest_iter(cache, test.input()) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|m| Match { + id: m.pattern().as_usize(), + start: m.start(), + end: m.end(), + }); + TestResult::matches(it).name("find_earliest_iter") + } + TestSearchKind::Leftmost => { + let it = re + .find_leftmost_iter(cache, test.input()) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|m| Match { + id: m.pattern().as_usize(), + start: m.start(), + end: m.end(), + }); + TestResult::matches(it).name("find_leftmost_iter") + } + TestSearchKind::Overlapping => { + let it = re + .find_overlapping_iter(cache, test.input()) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|m| Match { + id: m.pattern().as_usize(), + start: m.start(), + end: m.end(), + }); + TestResult::matches(it).name("find_overlapping_iter") + } + }; + vec![is_match, find_matches] +} + +/// Configures the given regex builder with all relevant settings on the given +/// regex test. +/// +/// If the regex test has a setting that is unsupported, then this returns +/// false (implying the test should be skipped). +fn configure_regex_builder( + test: &RegexTest, + builder: &mut regex::Builder, +) -> bool { + let match_kind = match test.match_kind() { + TestMatchKind::All => MatchKind::All, + TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst, + TestMatchKind::LeftmostLongest => return false, + }; + + let dense_config = DFA::config() + .anchored(test.anchored()) + .match_kind(match_kind) + .unicode_word_boundary(true); + let regex_config = Regex::config().utf8(test.utf8()); + builder + .configure(regex_config) + .syntax(config_syntax(test)) + .thompson(config_thompson(test)) + .dfa(dense_config); + true +} + +/// Configuration of a Thompson NFA compiler from a regex test. +fn config_thompson(test: &RegexTest) -> thompson::Config { + thompson::Config::new().utf8(test.utf8()) +} + +/// Configuration of the regex parser from a regex test.
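+/// +/// (Each test independently toggles case folding, Unicode mode, and UTF-8 +/// mode, which is why a fresh syntax configuration is built for every test +/// rather than shared across them.)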
+fn config_syntax(test: &RegexTest) -> SyntaxConfig { + SyntaxConfig::new() + .case_insensitive(test.case_insensitive()) + .unicode(test.unicode()) + .utf8(test.utf8()) +} diff --git a/vendor/regex-automata-0.2.0/tests/nfa/mod.rs b/vendor/regex-automata-0.2.0/tests/nfa/mod.rs new file mode 100644 index 000000000..326862147 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/nfa/mod.rs @@ -0,0 +1 @@ +mod thompson; diff --git a/vendor/regex-automata-0.2.0/tests/nfa/thompson/mod.rs b/vendor/regex-automata-0.2.0/tests/nfa/thompson/mod.rs new file mode 100644 index 000000000..3a03f52ce --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/nfa/thompson/mod.rs @@ -0,0 +1 @@ +mod pikevm; diff --git a/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/api.rs b/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/api.rs new file mode 100644 index 000000000..c8199f709 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/api.rs @@ -0,0 +1,191 @@ +/* +use std::error::Error; + +use regex_automata::{ + hybrid::{ + dfa::{self, DFA}, + regex::Regex, + OverlappingState, + }, + nfa::thompson, + HalfMatch, MatchError, MatchKind, MultiMatch, +}; + +use crate::util::{BunkPrefilter, SubstringPrefilter}; + +// Tests that too many cache resets cause the lazy DFA to quit. +#[test] +fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> { + // This is a carefully chosen regex. The idea is to pick one that requires + // some decent number of states (hence the bounded repetition). But we + // specifically choose to create a class with an ASCII letter and a + // non-ASCII letter so that we can check that no new states are created + // once the cache is full. Namely, if we fill up the cache on a haystack + // of 'a's, then in order to match one 'β', a new state will need to be + // created since a 'β' is encoded with multiple bytes. Since there's no + // room for this state, the search should quit at the very first position. + let pattern = r"[aβ]{100}"; + let dfa = DFA::builder() + .configure( + // Configure it so that we have the minimum cache capacity + // possible. And that if any resets occur, the search quits. + DFA::config() + .skip_cache_capacity_check(true) + .cache_capacity(0) + .minimum_cache_clear_count(Some(0)), + ) + .build(pattern)?; + let mut cache = dfa.create_cache(); + + let haystack = "a".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 25 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone())); + assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone())); + assert_eq!( + dfa.find_overlapping_fwd( + &mut cache, + &haystack, + &mut OverlappingState::start() + ), + Err(err.clone()) + ); + + let haystack = "β".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 0 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + // no need to test that other find routines quit, since we did that above + + // OK, if we reset the cache, then we should be able to create more states + // and make more progress with searching for betas. + cache.reset(&dfa); + let err = MatchError::GaveUp { offset: 26 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + + // ... switching back to ASCII still makes progress since it just needs to + // set transitions on existing states! 
+ let haystack = "a".repeat(101).into_bytes(); + let err = MatchError::GaveUp { offset: 13 }; + assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err)); + + Ok(()) +} + +// Tests that quit bytes in the forward direction work correctly. +#[test] +fn quit_fwd() -> Result<(), Box<dyn Error>> { + let dfa = DFA::builder() + .configure(DFA::config().quit(b'x', true)) + .build("[[:word:]]+$")?; + let mut cache = dfa.create_cache(); + + assert_eq!( + dfa.find_earliest_fwd(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_leftmost_fwd(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_overlapping_fwd( + &mut cache, + b"abcxyz", + &mut OverlappingState::start() + ), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + + Ok(()) +} + +// Tests that quit bytes in the reverse direction work correctly. +#[test] +fn quit_rev() -> Result<(), Box<dyn Error>> { + let dfa = DFA::builder() + .configure(DFA::config().quit(b'x', true)) + .thompson(thompson::Config::new().reverse(true)) + .build("^[[:word:]]+")?; + let mut cache = dfa.create_cache(); + + assert_eq!( + dfa.find_earliest_rev(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + assert_eq!( + dfa.find_leftmost_rev(&mut cache, b"abcxyz"), + Err(MatchError::Quit { byte: b'x', offset: 3 }) + ); + + Ok(()) +} + +// Tests that if we heuristically enable Unicode word boundaries but then +// instruct that a non-ASCII byte should NOT be a quit byte, then the builder +// will panic. +#[test] +#[should_panic] +fn quit_panics() { + DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); +} + +// This tests an intesting case where even if the Unicode word boundary option +// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode +// word boundaries to be enabled. +#[test] +fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> { + let mut config = DFA::config(); + for b in 0x80..=0xFF { + config = config.quit(b, true); + } + let dfa = DFA::builder().configure(config).build(r"\b")?; + let mut cache = dfa.create_cache(); + let expected = HalfMatch::must(0, 1); + assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected))); + Ok(()) +} + +// Tests that we can provide a prefilter to a Regex, and the search reports +// correct results. +#[test] +fn prefilter_works() -> Result<(), Box<dyn Error>> { + let mut re = Regex::new(r"a[0-9]+").unwrap(); + re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); + let mut cache = re.create_cache(); + + let text = b"foo abc foo a1a2a3 foo a123 bar aa456"; + let matches: Vec<(usize, usize)> = re + .find_leftmost_iter(&mut cache, text) + .map(|m| (m.start(), m.end())) + .collect(); + assert_eq!( + matches, + vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] + ); + Ok(()) +} + +// This test confirms that a prefilter is active by using a prefilter that +// reports false negatives. 
+#[test] +fn prefilter_is_active() -> Result<(), Box<dyn Error>> { + let text = b"za123"; + let mut re = Regex::new(r"a[0-9]+").unwrap(); + let mut cache = re.create_cache(); + + re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a")))); + assert_eq!( + re.find_leftmost(&mut cache, b"za123"), + Some(MultiMatch::must(0, 1, 5)) + ); + assert_eq!( + re.find_leftmost(&mut cache, b"a123"), + Some(MultiMatch::must(0, 0, 4)) + ); + re.set_prefilter(Some(Box::new(BunkPrefilter::new()))); + assert_eq!(re.find_leftmost(&mut cache, b"za123"), None); + // This checks that the prefilter is used when first starting the search, + // instead of waiting until at least one transition has occurred. + assert_eq!(re.find_leftmost(&mut cache, b"a123"), None); + Ok(()) +} +*/ diff --git a/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/mod.rs b/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/mod.rs new file mode 100644 index 000000000..f4299510c --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/mod.rs @@ -0,0 +1,2 @@ +mod api; +mod suite; diff --git a/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/suite.rs b/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/suite.rs new file mode 100644 index 000000000..e5505d59a --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/nfa/thompson/pikevm/suite.rs @@ -0,0 +1,109 @@ +use regex_automata::{ + nfa::thompson::{ + self, + pikevm::{self, PikeVM}, + }, + MatchKind, SyntaxConfig, +}; +use regex_syntax as syntax; + +use regex_test::{ + bstr::{BString, ByteSlice}, + CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests, + SearchKind as TestSearchKind, TestResult, TestRunner, +}; + +use crate::{suite, Result}; + +/// Tests the default configuration of the PikeVM. +#[test] +fn default() -> Result<()> { + let builder = PikeVM::builder(); + TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert(); + Ok(()) +} + +fn compiler( + mut builder: pikevm::Builder, +) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> { + move |test, regexes| { + let regexes = regexes + .iter() + .map(|r| r.to_str().map(|s| s.to_string())) + .collect::<std::result::Result<Vec<String>, _>>()?; + if !configure_pikevm_builder(test, &mut builder) { + return Ok(CompiledRegex::skip()); + } + let re = builder.build_many(&regexes)?; + let mut cache = re.create_cache(); + Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> { + run_test(&re, &mut cache, test) + })) + } +} + +fn run_test( + re: &PikeVM, + cache: &mut pikevm::Cache, + test: &RegexTest, +) -> Vec<TestResult> { + // let is_match = if re.is_match(cache, test.input()) { + // TestResult::matched() + // } else { + // TestResult::no_match() + // }; + // let is_match = is_match.name("is_match"); + + let find_matches = match test.search_kind() { + TestSearchKind::Earliest => { + TestResult::skip().name("find_earliest_iter") + } + TestSearchKind::Leftmost => { + let it = re + .find_leftmost_iter(cache, test.input()) + .take(test.match_limit().unwrap_or(std::usize::MAX)) + .map(|m| Match { + id: m.pattern().as_usize(), + start: m.start(), + end: m.end(), + }); + TestResult::matches(it).name("find_leftmost_iter") + } + TestSearchKind::Overlapping => { + TestResult::skip().name("find_overlapping_iter") + } + }; + // vec![is_match, find_matches] + vec![find_matches] +} + +/// Configures the given regex builder with all relevant settings on the given +/// regex test.
+/// +/// If the regex test has a setting that is unsupported, then this returns +/// false (implying the test should be skipped). +fn configure_pikevm_builder( + test: &RegexTest, + builder: &mut pikevm::Builder, +) -> bool { + let pikevm_config = + PikeVM::config().anchored(test.anchored()).utf8(test.utf8()); + builder + .configure(pikevm_config) + .syntax(config_syntax(test)) + .thompson(config_thompson(test)); + true +} + +/// Configuration of a Thompson NFA compiler from a regex test. +fn config_thompson(test: &RegexTest) -> thompson::Config { + thompson::Config::new().utf8(test.utf8()) +} + +/// Configuration of the regex parser from a regex test. +fn config_syntax(test: &RegexTest) -> SyntaxConfig { + SyntaxConfig::new() + .case_insensitive(test.case_insensitive()) + .unicode(test.unicode()) + .utf8(test.utf8()) +} diff --git a/vendor/regex-automata-0.2.0/tests/regression.rs b/vendor/regex-automata-0.2.0/tests/regression.rs new file mode 100644 index 000000000..e5355fed7 --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/regression.rs @@ -0,0 +1,44 @@ +use regex_automata::{ + dfa::{dense, Automaton}, + MatchError, +}; + +// A regression test for checking that minimization correctly translates +// whether a state is a match state or not. Previously, it was possible for +// minimization to mark a non-matching state as matching. +#[test] +fn minimize_sets_correct_match_states() { + let pattern = + // This is a subset of the grapheme matching regex. I couldn't seem + // to get a repro any smaller than this unfortunately. + r"(?x) + (?: + \p{gcb=Prepend}* + (?: + (?: + (?: + \p{gcb=L}* + (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT}) + \p{gcb=T}* + ) + | + \p{gcb=L}+ + | + \p{gcb=T}+ + ) + | + \p{Extended_Pictographic} + (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})* + | + [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}] + ) + [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]* + ) + "; + + let dfa = dense::Builder::new() + .configure(dense::Config::new().anchored(true).minimize(true)) + .build(pattern) + .unwrap(); + assert_eq!(Ok(None), dfa.find_leftmost_fwd(b"\xE2")); +} diff --git a/vendor/regex-automata-0.2.0/tests/tests.rs b/vendor/regex-automata-0.2.0/tests/tests.rs new file mode 100644 index 000000000..e4728470c --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/tests.rs @@ -0,0 +1,44 @@ +#![allow(warnings)] + +use regex_test::RegexTests; + +mod dfa; +mod hybrid; +mod nfa; +mod regression; +mod util; + +type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>; + +fn suite() -> Result<RegexTests> { + let mut tests = RegexTests::new(); + macro_rules! 
load { + ($name:expr) => {{ + const DATA: &[u8] = + include_bytes!(concat!("data/", $name, ".toml")); + tests.load_slice($name, DATA)?; + }}; + } + + load!("bytes"); + load!("crazy"); + load!("earliest"); + load!("empty"); + load!("expensive"); + load!("flags"); + load!("iter"); + load!("misc"); + load!("multiline"); + load!("no-unicode"); + load!("overlapping"); + load!("regression"); + load!("set"); + load!("unicode"); + load!("word-boundary"); + load!("fowler/basic"); + load!("fowler/nullsubexpr"); + load!("fowler/repetition"); + load!("fowler/repetition-expensive"); + + Ok(tests) +} diff --git a/vendor/regex-automata-0.2.0/tests/util.rs b/vendor/regex-automata-0.2.0/tests/util.rs new file mode 100644 index 000000000..499aa8c6d --- /dev/null +++ b/vendor/regex-automata-0.2.0/tests/util.rs @@ -0,0 +1,57 @@ +use regex_automata::util::prefilter::{self, Candidate, Prefilter}; + +#[derive(Clone, Debug)] +pub struct SubstringPrefilter(bstr::Finder<'static>); + +impl SubstringPrefilter { + pub fn new<B: AsRef<[u8]>>(needle: B) -> SubstringPrefilter { + SubstringPrefilter(bstr::Finder::new(needle.as_ref()).into_owned()) + } +} + +impl Prefilter for SubstringPrefilter { + #[inline] + fn next_candidate( + &self, + state: &mut prefilter::State, + haystack: &[u8], + at: usize, + ) -> Candidate { + self.0 + .find(&haystack[at..]) + .map(|i| Candidate::PossibleStartOfMatch(at + i)) + .unwrap_or(Candidate::None) + } + + fn heap_bytes(&self) -> usize { + self.0.needle().len() + } +} + +/// A prefilter that always returns `Candidate::None`, even if it's a false +/// negative. This is useful for confirming that a prefilter is actually +/// active by asserting an incorrect result. +#[derive(Clone, Debug)] +pub struct BunkPrefilter(()); + +impl BunkPrefilter { + pub fn new() -> BunkPrefilter { + BunkPrefilter(()) + } +} + +impl Prefilter for BunkPrefilter { + #[inline] + fn next_candidate( + &self, + _state: &mut prefilter::State, + _haystack: &[u8], + _at: usize, + ) -> Candidate { + Candidate::None + } + + fn heap_bytes(&self) -> usize { + 0 + } +} |