diff options
Diffstat (limited to 'vendor/regex-automata')
52 files changed, 15262 insertions, 0 deletions
diff --git a/vendor/regex-automata/.cargo-checksum.json b/vendor/regex-automata/.cargo-checksum.json new file mode 100644 index 000000000..a8c689c8b --- /dev/null +++ b/vendor/regex-automata/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"674fda607d585e7a9d1d07e6fee2807e6a1a3709ca8d5a507dac051cac84dcf1","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"34ebd8d165fbd934198653a6d619d62788ff72f0e058139459d4369683423551","TODO":"daea9f7378f543311d657e6ef3d2a09d51e82b9e70d0026140130862c32b3c08","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","data/fowler-tests/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","data/fowler-tests/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","data/fowler-tests/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","data/fowler-tests/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","data/fowler-tests/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","data/tests/crazy.toml":"b6e644a74b990a4344b15e7366da36e5b3f73a183944e249082f74c23ff01e5f","data/tests/flags.toml":"aefd9483c1c9c52c3669a9f2e88cd494c293f2e14c59aecb1d94dbb82546a705","data/tests/fowler/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","data/tests/fowler/README":"e9f049297023d5a81c5c600280016fe0271e7d0eda898c41399eb61431820404","data/tests/fowler/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","data/tests/fowler/basic.toml":"7b043231ca8c89dbd10cef0de3b0be18c9ae442be1e99a657cd412b8b7edec21","data/tests/fowler/fowler-to-toml":"5bb78b924f3b6b1c27278b37baae556115fe03c864c1d33a7c53718b99885515","data/tests/fowler/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","data/tests/fowler/nullsubexpr.
toml":"7e4bf9fec1c4a8aca04cc96e74b3f51ed6b8c3f85e4bfc7acc9c74ab95166976","data/tests/fowler/repetition-long.dat":"040b869c8c3a02ed0497943af1860094fd8437892811180fb9624a41eb491f23","data/tests/fowler/repetition-long.toml":"3eb7199d936b3f7eb9863ebc3b0c94648cfc32192f626dcfa33ddf352918c1c0","data/tests/fowler/repetition.dat":"d8fa959c2be524c90f2092f7c41ff3a7b48232b4378cb3b090625e74e76fc625","data/tests/fowler/repetition.toml":"ccf21430a325c4e1dae4eb6c52e3cea5d3c1847559ba6e75466bdb6bbd98204d","data/tests/iter.toml":"99adc397fe0a00c759eb659531d3e69445b43f5ecd5771c549117933b73bd43e","data/tests/no-unicode.toml":"f329ee939c2d07a17e51f0090d9f2431395e47dac8e0b982fb5e16e0555b75e3","data/tests/unicode.toml":"0ff418de5bc238e4595956b66981fe02018938d57d76d11cab840606b9da60ba","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/byteorder.rs":"0827852aa563e3c5b3ffaf484ce8a34537e82719a3606d4b948bc8a1e21d8b18","src/classes.rs":"706c8a8a9bf70260b9c92ff865891fc26de0453495afca7b325afdf5e6a3e242","src/codegen.rs":"5686b97fec69158c7264183a71ad9a1ff8e74db02fa0fcfccaa0a516cbfc7d1d","src/dense.rs":"7561f35019b20642f2ee75fd20365e21a4c8260deb7cee84fa3f8264b9fd9a4b","src/determinize.rs":"876c844d0470854dbbe3eb4386611fd57d95a5a4ae38ee937fbb14676f0a383a","src/dfa.rs":"032f09d187ec8dd06ef09940515690af045ca9f7ef7f819c31a97607df1432e5","src/error.rs":"d07ecdc617e243a43a99e911398b9c37721afd2b9548153c5f359b8c4605c749","src/lib.rs":"520781bdd60d425b16ef72f03330362e7c2aec274338e73f309d730bea4d7ab0","src/minimize.rs":"dfa7b6a6f36bb2dedaee8bfc5c4bb943f59e0cf98cde5358822e70cbdb284a7e","src/nfa/compiler.rs":"f43901929f44efa420e441cbff8687e05059ceae88492a2ed6c49fdd5a6a6b04","src/nfa/map.rs":"b7e2e561d6fe5775716e27eded1ae3e2277a50073a2e182f3dabedcda5c30d27","src/nfa/mod.rs":"93e7dee804751fcf66d48ca48b3467a4ab5155063461e69c428e46bcf977711d","src/nfa/range_trie.rs":"3a3d2853987619688ab5b61acef575f216d5bdd7b9e15fa508e0ba6f29c641a9","src/regex.rs":"2f3868a3fa52b2a040fd0fb9f12386b
1af1f0f650d948e821c7ba83f087826f0","src/sparse.rs":"976540bcd134a225e5d39e1aef688f63b02b3d745249a3a95fec387a7ffb88cc","src/sparse_set.rs":"81bef5057781e26da39855b0f38b02ddfd09183bc62d30cf454ec706885e3a70","src/state_id.rs":"44c4bf1a5d091b97e8c1ce872bafe45d806905b07a73a6f82b1655b7897e7b5f","src/transducer.rs":"28c728ef45a3f6177d5a3ac589f166764c11d6c66bd5d916bcf30ad2be187a0c","tests/collection.rs":"2907cc0a32e5e59ceca4b34fe582f9275c12ee1a8d6e73d689056bdfd5357b9a","tests/regression.rs":"5a9b2654f88b1b07401c5b1fe925f62421bff67be7d80cae7a985eb66ed9886b","tests/suite.rs":"8148247667b34b370855c247ffcc9c6339f8f72d6fe481b79936afbb165dd6bd","tests/tests.rs":"f1b407d3d288a9c2b1500151205f9d0bcc0668b2ab38c5094ee459d6d4893e18","tests/unescape.rs":"67a7c466ba5c873a3c29f7e00649535ddc2921fcc14ac92cb207f43b4b6e461d"},"package":"6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"}
\ No newline at end of file diff --git a/vendor/regex-automata/COPYING b/vendor/regex-automata/COPYING new file mode 100644 index 000000000..bb9c20a09 --- /dev/null +++ b/vendor/regex-automata/COPYING @@ -0,0 +1,3 @@ +This project is dual-licensed under the Unlicense and MIT licenses. + +You may use this code under the terms of either license. diff --git a/vendor/regex-automata/Cargo.toml b/vendor/regex-automata/Cargo.toml new file mode 100644 index 000000000..b4fcd7a0d --- /dev/null +++ b/vendor/regex-automata/Cargo.toml @@ -0,0 +1,86 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +name = "regex-automata" +version = "0.1.10" +authors = ["Andrew Gallant <jamslam@gmail.com>"] +exclude = ["/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*", "/regex-automata-debug"] +autoexamples = false +autotests = false +description = "Automata construction and matching using regular expressions." 
+homepage = "https://github.com/BurntSushi/regex-automata" +documentation = "https://docs.rs/regex-automata" +readme = "README.md" +keywords = ["regex", "dfa", "automata", "automaton", "nfa"] +categories = ["text-processing"] +license = "Unlicense/MIT" +repository = "https://github.com/BurntSushi/regex-automata" +[profile.bench] +debug = true + +[profile.dev] +opt-level = 3 +debug = true + +[profile.release] +debug = true + +[profile.test] +opt-level = 3 +debug = true + +[lib] +bench = false + +[[test]] +name = "default" +path = "tests/tests.rs" +[dependencies.fst] +version = "0.4.0" +optional = true + +[dependencies.regex-syntax] +version = "0.6.16" +optional = true +[dev-dependencies.bstr] +version = "0.2" +features = ["std"] +default-features = false + +[dev-dependencies.lazy_static] +version = "1.2.0" + +[dev-dependencies.regex] +version = "1.1" + +[dev-dependencies.serde] +version = "1.0.82" + +[dev-dependencies.serde_bytes] +version = "0.11" + +[dev-dependencies.serde_derive] +version = "1.0.82" + +[dev-dependencies.toml] +version = "0.4.10" + +[features] +default = ["std"] +std = ["regex-syntax"] +transducer = ["std", "fst"] +[badges.appveyor] +repository = "BurntSushi/regex-automata" + +[badges.travis-ci] +repository = "BurntSushi/regex-automata" diff --git a/vendor/regex-automata/LICENSE-MIT b/vendor/regex-automata/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/vendor/regex-automata/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above 
copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/regex-automata/README.md b/vendor/regex-automata/README.md new file mode 100644 index 000000000..8eaf03f04 --- /dev/null +++ b/vendor/regex-automata/README.md @@ -0,0 +1,223 @@ +regex-automata +============== +A low level regular expression library that uses deterministic finite automata. +It supports a rich syntax with Unicode support, has extensive options for +configuring the best space vs time trade off for your use case and provides +support for cheap deserialization of automata for use in `no_std` environments. + +[![Build status](https://github.com/BurntSushi/regex-automata/workflows/ci/badge.svg)](https://github.com/BurntSushi/regex-automata/actions) +[![on crates.io](https://meritbadge.herokuapp.com/regex-automata)](https://crates.io/crates/regex-automata) +![Minimum Supported Rust Version 1.41](https://img.shields.io/badge/rustc-1.41-green) + +Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). 
+ + +### Documentation + +https://docs.rs/regex-automata + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +regex-automata = "0.1" +``` + +and this to your crate root (if you're using Rust 2015): + +```rust +extern crate regex_automata; +``` + + +### Example: basic regex searching + +This example shows how to compile a regex using the default configuration +and then use it to find matches in a byte string: + +```rust +use regex_automata::Regex; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); +assert_eq!(matches, vec![(0, 10), (11, 21)]); +``` + +For more examples and information about the various knobs that can be turned, +please see the [docs](https://docs.rs/regex-automata). + + +### Support for `no_std` + +This crate comes with a `std` feature that is enabled by default. When the +`std` feature is enabled, the API of this crate will include the facilities +necessary for compiling, serializing, deserializing and searching with regular +expressions. When the `std` feature is disabled, the API of this crate will +shrink such that it only includes the facilities necessary for deserializing +and searching with regular expressions. + +The intended workflow for `no_std` environments is thus as follows: + +* Write a program with the `std` feature that compiles and serializes a + regular expression. Serialization should only happen after first converting + the DFAs to use a fixed size state identifier instead of the default `usize`. + You may also need to serialize both little and big endian versions of each + DFA. (So that's 4 DFAs in total for each regex.) +* In your `no_std` environment, follow the examples above for deserializing + your previously serialized DFAs into regexes. You can then search with them + as you would any regex. + +Deserialization can happen anywhere. 
For example, with bytes embedded into a +binary or with a file memory mapped at runtime. + +Note that the +[`ucd-generate`](https://github.com/BurntSushi/ucd-generate) +tool will do the first step for you with its `dfa` or `regex` sub-commands. + + +### Cargo features + +* `std` - **Enabled** by default. This enables the ability to compile finite + automata. This requires the `regex-syntax` dependency. Without this feature + enabled, finite automata can only be used for searching (using the approach + described above). +* `transducer` - **Disabled** by default. This provides implementations of the + `Automaton` trait found in the `fst` crate. This permits using finite + automata generated by this crate to search finite state transducers. This + requires the `fst` dependency. + + +### Differences with the regex crate + +The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a +general purpose regular expression engine. It aims to automatically balance low +compile times, fast search times and low memory usage, while also providing +a convenient API for users. In contrast, this crate provides a lower level +regular expression interface that is a bit less convenient while providing more +explicit control over memory usage and search times. + +Here are some specific negative differences: + +* **Compilation can take an exponential amount of time and space** in the size + of the regex pattern. While most patterns do not exhibit worst case + exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will + build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should + not be compiled with this library. (In the future, the API may expose an + option to return an error if the DFA gets too big.) +* This crate does not support sub-match extraction, which can be achieved with + the regex crate's "captures" API. This may be added in the future, but is + unlikely. 
+* While the regex crate doesn't necessarily sport fast compilation times, the + regexes in this crate are almost universally slow to compile, especially when + they contain large Unicode character classes. For example, on my system, + compiling `\w{3}` with byte classes enabled takes just over 1 second and + almost 5MB of memory! (Compiling a sparse regex takes about the same time + but only uses about 500KB of memory.) Conversely, compiling the same regex + without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and + less than 5KB of memory. For this reason, you should only use Unicode + character classes if you absolutely need them! +* This crate does not support regex sets. +* This crate does not support zero-width assertions such as `^`, `$`, `\b` or + `\B`. +* As a lower level crate, this library does not do literal optimizations. In + exchange, you get predictable performance regardless of input. The + philosophy here is that literal optimizations should be applied at a higher + level, although there is no easy support for this in the ecosystem yet. +* There is no `&str` API like in the regex crate. In this crate, all APIs + operate on `&[u8]`. By default, match indices are guaranteed to fall on + UTF-8 boundaries, unless `RegexBuilder::allow_invalid_utf8` is enabled. + +With some of the downsides out of the way, here are some positive differences: + +* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply + deserialized. Deserialization always takes constant time since searching can + be performed directly on the raw serialized bytes of a DFA. +* This crate was specifically designed so that the searching phase of a DFA has + minimal runtime requirements, and can therefore be used in `no_std` + environments. While `no_std` environments cannot compile regexes, they can + deserialize pre-compiled regexes. +* Since this crate builds DFAs ahead of time, it will generally out-perform + the `regex` crate on equivalent tasks.
The performance difference is likely + not large. However, because of a complex set of optimizations in the regex + crate (like literal optimizations), an accurate performance comparison may be + difficult to do. +* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search + performance a small amount, but uses much less storage space. Potentially + even less than what the regex crate uses. +* This crate exposes DFAs directly, such as `DenseDFA` and `SparseDFA`, + which enables one to do less work in some cases. For example, if you only + need the end of a match and not the start of a match, then you can use a DFA + directly without building a `Regex`, which always requires a second DFA to + find the start of a match. +* Aside from choosing between dense and sparse DFAs, there are several options + for configuring the space usage vs search time trade off. These include + things like choosing a smaller state identifier representation, to + premultiplying state identifiers and splitting a DFA's alphabet into + equivalence classes. Finally, DFA minimization is also provided, but can + increase compilation times dramatically. + + +### Future work + +* Look into being smarter about generating NFA states for large Unicode + character classes. These can create a lot of additional work for both the + determinizer and the minimizer, and I suspect this is the key thing we'll + want to improve if we want to make DFA compile times faster. I *believe* + it's possible to potentially build minimal or nearly minimal NFAs for the + special case of Unicode character classes by leveraging Daciuk's algorithms + for building minimal automata in linear time for sets of strings. See + https://blog.burntsushi.net/transducers/#construction for more details. The + key adaptation I think we need to make is to modify the algorithm to operate + on byte ranges instead of enumerating every codepoint in the set. Otherwise, + it might not be worth doing. 
+* Add support for regex sets. It should be possible to do this by "simply" + introducing more match states. I think we can also report the positions at + each match, similar to how Aho-Corasick works. I think the long pole in the + tent here is probably the API design work and arranging it so that we don't + introduce extra overhead into the non-regex-set case without duplicating a + lot of code. It seems doable. +* Stretch goal: support capturing groups by implementing "tagged" DFA + (transducers). Laurikari's paper is the usual reference here, but Trofimovich + has a much more thorough treatment here: + https://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf + I've only read the paper once. I suspect it will require at least a few more + read throughs before I understand it. + See also: https://re2c.org +* Possibly less ambitious goal: can we select a portion of Trofimovich's work + to make small fixed length look-around work? It would be really nice to + support ^, $ and \b, especially the Unicode variant of \b and CRLF aware $. +* Experiment with code generating Rust code. There is an early experiment in + src/codegen.rs that is thoroughly bit-rotted. At the time, I was + experimenting with whether or not codegen would significantly decrease the size + of a DFA, since if you squint hard enough, it's kind of like a sparse + representation. However, it didn't shrink as much as I thought it would, so + I gave up. The other problem is that Rust doesn't support gotos, so I don't + even know whether the "match on each state" in a loop thing will be fast + enough. Either way, it's probably a good option to have. For one thing, it + would be endian independent whereas the serialization format of the DFAs in + this crate is endian dependent (so you need two versions of every DFA, but + you only need to compile one of them for any given arch). +* Experiment with unrolling the match loops and fill out the benchmarks.
+* Add some kind of streaming API. I believe users of the library can already + implement something for this outside of the crate, but it would be good to + provide an official API. The key thing here is figuring out the API. I + suspect we might want to support several variants. +* Make a decision on whether or not there is room for literal optimizations + in this crate. My original intent was to not let this crate sink down into + that very very very deep rabbit hole. But instead, we might want to provide + some way for literal optimizations to hook into the match routines. The right + path forward here is to probably build something outside of the crate and + then see about integrating it. After all, users can implement their own + match routines just as efficiently as what the crate provides. +* A key downside of DFAs is that they can take up a lot of memory and can be + quite costly to build. Their worst case compilation time is O(2^n), where + n is the number of NFA states. A paper by Yang and Prasanna (2011) actually + seems to provide a way to characterize state blow up such that it is detectable. + If we could know whether a regex will exhibit state explosion or not, then + we could make an intelligent decision about whether to ahead-of-time compile + a DFA. + See: https://www.researchgate.net/profile/Xu-Shutu/publication/229032602_Characterization_of_a_global_germplasm_collection_and_its_potential_utilization_for_analysis_of_complex_quantitative_traits_in_maize/links/02bfe50f914d04c837000000/Characterization-of-a-global-germplasm-collection-and-its-potential-utilization-for-analysis-of-complex-quantitative-traits-in-maize.pdf diff --git a/vendor/regex-automata/TODO b/vendor/regex-automata/TODO new file mode 100644 index 000000000..bc3b7aab9 --- /dev/null +++ b/vendor/regex-automata/TODO @@ -0,0 +1,10 @@ +* Remove the `empty` constructors for DFAs and replace them with + `never_match` and `always_match` constructors.
+* Consider refactoring the NFA representation such that it can be instantly + loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this + could negatively impact using the NFA with deserialization costs. Before + doing this, we should write PikeVM and backtracking implementations so that + they can be benchmarked. +* Add captures and anchors to NFA. +* Once we're happy, re-organize the public API such that NFAs are exported + and usable on their own. diff --git a/vendor/regex-automata/UNLICENSE b/vendor/regex-automata/UNLICENSE new file mode 100644 index 000000000..68a49daad --- /dev/null +++ b/vendor/regex-automata/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+ +For more information, please refer to <http://unlicense.org/> diff --git a/vendor/regex-automata/data/fowler-tests/LICENSE b/vendor/regex-automata/data/fowler-tests/LICENSE new file mode 100644 index 000000000..f47dbf4c4 --- /dev/null +++ b/vendor/regex-automata/data/fowler-tests/LICENSE @@ -0,0 +1,19 @@ +The following license covers testregex.c and all associated test data. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do +so, subject to the following disclaimer: + +THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/regex-automata/data/fowler-tests/README b/vendor/regex-automata/data/fowler-tests/README new file mode 100644 index 000000000..6efc2dad3 --- /dev/null +++ b/vendor/regex-automata/data/fowler-tests/README @@ -0,0 +1,17 @@ +Test data was taken from the Go distribution, which was in turn taken from the +testregex test suite: + + http://www2.research.att.com/~astopen/testregex/testregex.html + +The LICENSE in this directory corresponds to the LICENSE that the data was +released under. 
+ +The tests themselves were modified for RE2/Go. A couple were modified further +by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. +(Yes, it seems like RE2/Go includes failing test cases.) This may or may not +have been a bad idea, but I think being consistent with an established Regex +library is worth something. + +Note that these files are read by 'scripts/regex-match-tests.py' and turned +into Rust tests found in 'regex_macros/tests/matches.rs'. + diff --git a/vendor/regex-automata/data/fowler-tests/basic.dat b/vendor/regex-automata/data/fowler-tests/basic.dat new file mode 100644 index 000000000..e55efaeec --- /dev/null +++ b/vendor/regex-automata/data/fowler-tests/basic.dat @@ -0,0 +1,221 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa (0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) 
+E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[<element>]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\x7f (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) 
+BE abc$ aabc (1,4) +BE ^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +#E (a*)* - (0,0)(0,0) +E (a*)* - (0,0)(?,?) RE2/Go +E (a*)+ - (0,0)(0,0) +#E (a*|b)* - (0,0)(0,0) +E (a*|b)* - (0,0)(?,?) RE2/Go +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +#E (^)* - (0,0)(0,0) +E (^)* - (0,0)(?,?) RE2/Go +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +#E ((a*|b))* - (0,0)(0,0)(0,0) +E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- 
])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas 
(0,7)(0,3)(?,?)(0,3) +E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/vendor/regex-automata/data/fowler-tests/nullsubexpr.dat b/vendor/regex-automata/data/fowler-tests/nullsubexpr.dat new file mode 100644 index 000000000..2e18fbb91 --- /dev/null +++ b/vendor/regex-automata/data/fowler-tests/nullsubexpr.dat @@ -0,0 +1,79 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) 
RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +#E SAME b (0,0)(0,0) +E SAME b (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +#E SAME aaaaaa (0,0)(0,0) +E SAME aaaaaa (0,0)(?,?) RE2/Go +E ([^ab]*)* ccccxx (0,6)(0,6) +#E SAME ababab (0,0)(0,0) +E SAME ababab (0,0)(?,?) RE2/Go + +E ((z)+|a)* zabcde (0,2)(1,2) + +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops +#E (a) aaa (0,1)(0,1) +#E (a*?) aaa (0,0)(0,0) +#E (a)*? aaa (0,0) +#E (a*?)*? 
aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +#E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/vendor/regex-automata/data/fowler-tests/repetition.dat b/vendor/regex-automata/data/fowler-tests/repetition.dat new file mode 100644 index 000000000..3bb212118 --- /dev/null +++ b/vendor/regex-automata/data/fowler-tests/repetition.dat @@ -0,0 +1,163 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 
+E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) + +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 
+ +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +# These test a bug in OS X / FreeBSD / NetBSD, and libtree. +# Linux/GLIBC gets the {8,} and {8,8} wrong. + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). 
+ +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) + +# The above worked on Linux/GLIBC but the following often fail. +# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E 
(ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/vendor/regex-automata/data/tests/crazy.toml b/vendor/regex-automata/data/tests/crazy.toml new file mode 100644 index 000000000..30c4b314d --- /dev/null +++ b/vendor/regex-automata/data/tests/crazy.toml @@ -0,0 +1,177 @@ +[[tests]] +name = "crazy-misc1" +pattern = '[-+]?[0-9]*\.?[0-9]+' +input = "0.1" +matches = [[0, 3]] + +[[tests]] +name = "crazy-misc2" +pattern = '[-+]?[0-9]*\.?[0-9]+' +input = "0.1.2" +matches = [[0, 3]] + +[[tests]] +name = "crazy-misc3" +pattern = '[-+]?[0-9]*\.?[0-9]+' +input = "a1.2" +matches = [[1, 4]] + +[[tests]] +options = ["case-insensitive"] +name = "crazy-misc4" +pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' +input = "mine is jam.slam@gmail.com " +matches = [[8, 26]] + +[[tests]] +options = ["case-insensitive"] +name = "crazy-misc5" +pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' +input = "mine is jam.slam@gmail " +matches = [] + +[[tests]] +name = "crazy-misc6" +pattern = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''' +input = "mine is jam.slam@gmail.com " +matches = [[8, 26]] + +[[tests]] +name = "crazy-misc7" +pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])' +input = "1900-01-01" +matches = [[0, 10]] + +[[tests]] +name = "crazy-misc8" +pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])' +input = "1900-00-01" +matches = [] + +[[tests]] +name = "crazy-misc9" +pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])' +input = "1900-13-01" +matches = [] + + +[[tests]] +name = "crazy-negclass1" +pattern = "[^ac]" +input = "acx" +matches = [[2, 3]] + +[[tests]] +name = "crazy-negclass2" +pattern = "[^a,]" +input = "a,x" +matches = [[2, 3]] + +[[tests]] +name = "crazy-negclass3" +pattern = '[^a\s]' +input = "a x" +matches = [[2, 3]] + +[[tests]] +name = "crazy-negclass4" +pattern = "[^,]" +input = ",,x" +matches = 
[[2, 3]] + +[[tests]] +name = "crazy-negclass5" +pattern = '[^\s]' +input = " a" +matches = [[1, 2]] + +[[tests]] +name = "crazy-negclass6" +pattern = '[^,\s]' +input = ", a" +matches = [[2, 3]] + +[[tests]] +name = "crazy-negclass7" +pattern = '[^\s,]' +input = " ,a" +matches = [[2, 3]] + +[[tests]] +name = "crazy-negclass8" +pattern = "[^[:alpha:]Z]" +input = "A1" +matches = [[1, 2]] + + +[[tests]] +name = "crazy-empty-repeat1" +pattern = "((.*)*?)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat2" +pattern = "((.?)*?)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat3" +pattern = "((.*)+?)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat4" +pattern = "((.?)+?)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat5" +pattern = "((.*){1,}?)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat6" +pattern = "((.*){1,2}?)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat7" +pattern = "((.*)*)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat8" +pattern = "((.?)*)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat9" +pattern = "((.*)+)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat10" +pattern = "((.?)+)=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat11" +pattern = "((.*){1,})=" +input = "a=b" +matches = [[0, 2]] + +[[tests]] +name = "crazy-empty-repeat12" +pattern = "((.*){1,2})=" +input = "a=b" +matches = [[0, 2]] diff --git a/vendor/regex-automata/data/tests/flags.toml b/vendor/regex-automata/data/tests/flags.toml new file mode 100644 index 000000000..98024d9f7 --- /dev/null +++ b/vendor/regex-automata/data/tests/flags.toml @@ -0,0 +1,59 @@ +[[tests]] +name = "flags1" +pattern = "(?i)abc" +input = "ABC" +matches = [[0, 3]] + +[[tests]] +name = "flags2" +pattern = "(?i)a(?-i)bc" +input = 
"Abc" +matches = [[0, 3]] + +[[tests]] +name = "flags3" +pattern = "(?i)a(?-i)bc" +input = "ABC" +matches = [] + +[[tests]] +name = "flags4" +pattern = "(?is)a." +input = "A\n" +matches = [[0, 2]] + +[[tests]] +name = "flags5" +pattern = "(?is)a.(?-is)a." +input = "A\nab" +matches = [[0, 4]] + +[[tests]] +name = "flags6" +pattern = "(?is)a.(?-is)a." +input = "A\na\n" +matches = [] + +[[tests]] +name = "flags7" +pattern = "(?is)a.(?-is:a.)?" +input = "A\na\n" +matches = [[0, 2]] + +[[tests]] +name = "flags8" +pattern = "(?U)a+" +input = "aa" +matches = [[0, 1]] + +[[tests]] +name = "flags9" +pattern = "(?U)a+?" +input = "aa" +matches = [[0, 2]] + +[[tests]] +name = "flags10" +pattern = "(?U)(?-U)a+" +input = "aa" +matches = [[0, 2]] diff --git a/vendor/regex-automata/data/tests/fowler/LICENSE b/vendor/regex-automata/data/tests/fowler/LICENSE new file mode 100644 index 000000000..f47dbf4c4 --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/LICENSE @@ -0,0 +1,19 @@ +The following license covers testregex.c and all associated test data. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do +so, subject to the following disclaimer: + +THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/regex-automata/data/tests/fowler/README b/vendor/regex-automata/data/tests/fowler/README new file mode 100644 index 000000000..55507f03f --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/README @@ -0,0 +1,23 @@ +Test data was taken from the Go distribution, which was in turn taken from the +testregex test suite: + + http://www2.research.att.com/~astopen/testregex/testregex.html + +Unfortunately, the above link is now dead, but the test data lives on. + +The LICENSE in this directory corresponds to the LICENSE that the data was +originally released under. + +The tests themselves were modified for RE2/Go. A couple were modified further +by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. +(Yes, it seems like RE2/Go includes failing test cases.) This may or may not +have been a bad idea, but I think being consistent with an established Regex +library is worth something. + +After some number of years, these tests were transformed into a JSON format +using the fowler-to-json script in this directory, e.g., + + ./fowler-to-json basic.dat > basic.json + +which brings them into a sensible structured format in which other tests can +be written. 
diff --git a/vendor/regex-automata/data/tests/fowler/basic.dat b/vendor/regex-automata/data/tests/fowler/basic.dat new file mode 100644 index 000000000..e55efaeec --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/basic.dat @@ -0,0 +1,221 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa (0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) +E (a.|.a.)*|(a|.a...) 
aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[<element>]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\x7f (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) +BE 
^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +#E (a*)* - (0,0)(0,0) +E (a*)* - (0,0)(?,?) RE2/Go +E (a*)+ - (0,0)(0,0) +#E (a*|b)* - (0,0)(0,0) +E (a*|b)* - (0,0)(?,?) RE2/Go +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +#E (^)* - (0,0)(0,0) +E (^)* - (0,0)(?,?) RE2/Go +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +#E ((a*|b))* - (0,0)(0,0)(0,0) +E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] 
Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E 
((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/vendor/regex-automata/data/tests/fowler/basic.toml b/vendor/regex-automata/data/tests/fowler/basic.toml new file mode 100644 index 000000000..3eeebd799 --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/basic.toml @@ -0,0 +1,1428 @@ +[[tests]] +name = "basic3" +options = ['escaped'] +pattern = '''abracadabra$''' +input = '''abracadabracadabra''' +matches = [[7, 18]] + +[[tests]] +name = "basic4" +options = ['escaped'] +pattern = '''a...b''' +input = '''abababbb''' +matches = [[2, 7]] + +[[tests]] +name = "basic5" +options = ['escaped'] +pattern = '''XXXXXX''' +input = '''..XXXXXX''' +matches = [[2, 8]] + +[[tests]] +name = "basic6" +options = ['escaped'] +pattern = '''\)''' 
+input = '''()''' +matches = [[1, 2]] + +[[tests]] +name = "basic7" +options = ['escaped'] +pattern = '''a]''' +input = '''a]a''' +matches = [[0, 2]] + +[[tests]] +name = "basic9" +options = ['escaped'] +pattern = '''\}''' +input = '''}''' +matches = [[0, 1]] + +[[tests]] +name = "basic10" +options = ['escaped'] +pattern = '''\]''' +input = ''']''' +matches = [[0, 1]] + +[[tests]] +name = "basic12" +options = ['escaped'] +pattern = ''']''' +input = ''']''' +matches = [[0, 1]] + +[[tests]] +name = "basic15" +options = ['escaped'] +pattern = '''^a''' +input = '''ax''' +matches = [[0, 1]] + +[[tests]] +name = "basic16" +options = ['escaped'] +pattern = '''\^a''' +input = '''a^a''' +matches = [[1, 3]] + +[[tests]] +name = "basic17" +options = ['escaped'] +pattern = '''a\^''' +input = '''a^''' +matches = [[0, 2]] + +[[tests]] +name = "basic18" +options = ['escaped'] +pattern = '''a$''' +input = '''aa''' +matches = [[1, 2]] + +[[tests]] +name = "basic19" +options = ['escaped'] +pattern = '''a\$''' +input = '''a$''' +matches = [[0, 2]] + +[[tests]] +name = "basic20" +options = ['escaped'] +pattern = '''^$''' +input = '''''' +matches = [[0, 0]] + +[[tests]] +name = "basic21" +options = ['escaped'] +pattern = '''$^''' +input = '''''' +matches = [[0, 0]] + +[[tests]] +name = "basic22" +options = ['escaped'] +pattern = '''a($)''' +input = '''aa''' +matches = [[1, 2]] + +[[tests]] +name = "basic23" +options = ['escaped'] +pattern = '''a*(^a)''' +input = '''aa''' +matches = [[0, 1]] + +[[tests]] +name = "basic24" +options = ['escaped'] +pattern = '''(..)*(...)*''' +input = '''a''' +matches = [[0, 0]] + +[[tests]] +name = "basic25" +options = ['escaped'] +pattern = '''(..)*(...)*''' +input = '''abcd''' +matches = [[0, 4]] + +[[tests]] +name = "basic26" +options = ['escaped'] +pattern = '''(ab|a)(bc|c)''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic27" +options = ['escaped'] +pattern = '''(ab)c|abc''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name 
= "basic28" +options = ['escaped'] +pattern = '''a{0}b''' +input = '''ab''' +matches = [[1, 2]] + +[[tests]] +name = "basic29" +options = ['escaped'] +pattern = '''(a*)(b?)(b+)b{3}''' +input = '''aaabbbbbbb''' +matches = [[0, 10]] + +[[tests]] +name = "basic30" +options = ['escaped'] +pattern = '''(a*)(b{0,1})(b{1,})b{3}''' +input = '''aaabbbbbbb''' +matches = [[0, 10]] + +[[tests]] +name = "basic32" +options = ['escaped'] +pattern = '''((a|a)|a)''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "basic33" +options = ['escaped'] +pattern = '''(a*)(a|aa)''' +input = '''aaaa''' +matches = [[0, 4]] + +[[tests]] +name = "basic34" +options = ['escaped'] +pattern = '''a*(a.|aa)''' +input = '''aaaa''' +matches = [[0, 4]] + +[[tests]] +name = "basic35" +options = ['escaped'] +pattern = '''a(b)|c(d)|a(e)f''' +input = '''aef''' +matches = [[0, 3]] + +[[tests]] +name = "basic36" +options = ['escaped'] +pattern = '''(a|b)?.*''' +input = '''b''' +matches = [[0, 1]] + +[[tests]] +name = "basic37" +options = ['escaped'] +pattern = '''(a|b)c|a(b|c)''' +input = '''ac''' +matches = [[0, 2]] + +[[tests]] +name = "basic38" +options = ['escaped'] +pattern = '''(a|b)c|a(b|c)''' +input = '''ab''' +matches = [[0, 2]] + +[[tests]] +name = "basic39" +options = ['escaped'] +pattern = '''(a|b)*c|(a|ab)*c''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic40" +options = ['escaped'] +pattern = '''(a|b)*c|(a|ab)*c''' +input = '''xc''' +matches = [[1, 2]] + +[[tests]] +name = "basic41" +options = ['escaped'] +pattern = '''(.a|.b).*|.*(.a|.b)''' +input = '''xa''' +matches = [[0, 2]] + +[[tests]] +name = "basic42" +options = ['escaped'] +pattern = '''a?(ab|ba)ab''' +input = '''abab''' +matches = [[0, 4]] + +[[tests]] +name = "basic43" +options = ['escaped'] +pattern = '''a?(ac{0}b|ba)ab''' +input = '''abab''' +matches = [[0, 4]] + +[[tests]] +name = "basic44" +options = ['escaped'] +pattern = '''ab|abab''' +input = '''abbabab''' +matches = [[0, 2]] + +[[tests]] +name = 
"basic45" +options = ['escaped'] +pattern = '''aba|bab|bba''' +input = '''baaabbbaba''' +matches = [[5, 8]] + +[[tests]] +name = "basic46" +options = ['escaped'] +pattern = '''aba|bab''' +input = '''baaabbbaba''' +matches = [[6, 9]] + +[[tests]] +name = "basic47" +options = ['escaped'] +pattern = '''(aa|aaa)*|(a|aaaaa)''' +input = '''aa''' +matches = [[0, 2]] + +[[tests]] +name = "basic48" +options = ['escaped'] +pattern = '''(a.|.a.)*|(a|.a...)''' +input = '''aa''' +matches = [[0, 2]] + +[[tests]] +name = "basic49" +options = ['escaped'] +pattern = '''ab|a''' +input = '''xabc''' +matches = [[1, 3]] + +[[tests]] +name = "basic50" +options = ['escaped'] +pattern = '''ab|a''' +input = '''xxabc''' +matches = [[2, 4]] + +[[tests]] +name = "basic51" +options = ['escaped', 'case-insensitive'] +pattern = '''(Ab|cD)*''' +input = '''aBcD''' +matches = [[0, 4]] + +[[tests]] +name = "basic52" +options = ['escaped'] +pattern = '''[^-]''' +input = '''--a''' +matches = [[2, 3]] + +[[tests]] +name = "basic53" +options = ['escaped'] +pattern = '''[a-]*''' +input = '''--a''' +matches = [[0, 3]] + +[[tests]] +name = "basic54" +options = ['escaped'] +pattern = '''[a-m-]*''' +input = '''--amoma--''' +matches = [[0, 4]] + +[[tests]] +name = "basic55" +options = ['escaped'] +pattern = ''':::1:::0:|:::1:1:0:''' +input = ''':::0:::1:::1:::0:''' +matches = [[8, 17]] + +[[tests]] +name = "basic56" +options = ['escaped'] +pattern = ''':::1:::0:|:::1:1:1:''' +input = ''':::0:::1:::1:::0:''' +matches = [[8, 17]] + +[[tests]] +name = "basic57" +options = ['escaped'] +pattern = '''[[:upper:]]''' +input = '''A''' +matches = [[0, 1]] + +[[tests]] +name = "basic58" +options = ['escaped'] +pattern = '''[[:lower:]]+''' +input = '''`az{''' +matches = [[1, 3]] + +[[tests]] +name = "basic59" +options = ['escaped'] +pattern = '''[[:upper:]]+''' +input = '''@AZ[''' +matches = [[1, 3]] + +[[tests]] +name = "basic65" +options = ['escaped'] +pattern = '''\n''' +input = '''\n''' +matches = [[0, 1]] + 
+[[tests]] +name = "basic66" +options = ['escaped'] +pattern = '''\n''' +input = '''\n''' +matches = [[0, 1]] + +[[tests]] +name = "basic67" +options = ['escaped'] +pattern = '''[^a]''' +input = '''\n''' +matches = [[0, 1]] + +[[tests]] +name = "basic68" +options = ['escaped'] +pattern = '''\na''' +input = '''\na''' +matches = [[0, 2]] + +[[tests]] +name = "basic69" +options = ['escaped'] +pattern = '''(a)(b)(c)''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic70" +options = ['escaped'] +pattern = '''xxx''' +input = '''xxx''' +matches = [[0, 3]] + +[[tests]] +name = "basic71" +options = ['escaped'] +pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' +input = '''feb 6,''' +matches = [[0, 6]] + +[[tests]] +name = "basic72" +options = ['escaped'] +pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' +input = '''2/7''' +matches = [[0, 3]] + +[[tests]] +name = "basic73" +options = ['escaped'] +pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' +input = '''feb 1,Feb 6''' +matches = [[5, 11]] + +[[tests]] +name = "basic74" +options = ['escaped'] +pattern = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))''' +input = '''x''' +matches = [[0, 1]] + +[[tests]] +name = "basic75" +options = ['escaped'] +pattern = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*''' +input = '''xx''' +matches = [[0, 2]] + +[[tests]] +name = "basic76" +options = ['escaped'] +pattern = '''a?(ab|ba)*''' +input = '''ababababababababababababababababababababababababababababababababababababababababa''' +matches = [[0, 81]] + +[[tests]] +name = "basic77" +options = ['escaped'] +pattern = '''abaa|abbaa|abbbaa|abbbbaa''' +input = '''ababbabbbabbbabbbbabbbbaa''' +matches = [[18, 25]] + +[[tests]] +name = "basic78" +options = ['escaped'] +pattern = '''abaa|abbaa|abbbaa|abbbbaa''' +input = '''ababbabbbabbbabbbbabaa''' +matches = [[18, 22]] + +[[tests]] +name = "basic79" +options = 
['escaped'] +pattern = '''aaac|aabc|abac|abbc|baac|babc|bbac|bbbc''' +input = '''baaabbbabac''' +matches = [[7, 11]] + +[[tests]] +name = "basic80" +options = ['escaped'] +pattern = '''.*''' +input = '''\x01\x7f''' +matches = [[0, 2]] + +[[tests]] +name = "basic81" +options = ['escaped'] +pattern = '''aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll''' +input = '''XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa''' +matches = [[53, 57]] + +[[tests]] +name = "basic83" +options = ['escaped'] +pattern = '''a*a*a*a*a*b''' +input = '''aaaaaaaaab''' +matches = [[0, 10]] + +[[tests]] +name = "basic84" +options = ['escaped'] +pattern = '''^''' +input = '''''' +matches = [[0, 0]] + +[[tests]] +name = "basic85" +options = ['escaped'] +pattern = '''$''' +input = '''''' +matches = [[0, 0]] + +[[tests]] +name = "basic86" +options = ['escaped'] +pattern = '''^$''' +input = '''''' +matches = [[0, 0]] + +[[tests]] +name = "basic87" +options = ['escaped'] +pattern = '''^a$''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "basic88" +options = ['escaped'] +pattern = '''abc''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic89" +options = ['escaped'] +pattern = '''abc''' +input = '''xabcy''' +matches = [[1, 4]] + +[[tests]] +name = "basic90" +options = ['escaped'] +pattern = '''abc''' +input = '''ababc''' +matches = [[2, 5]] + +[[tests]] +name = "basic91" +options = ['escaped'] +pattern = '''ab*c''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic92" +options = ['escaped'] +pattern = '''ab*bc''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic93" +options = ['escaped'] +pattern = '''ab*bc''' +input = '''abbc''' +matches = [[0, 4]] + +[[tests]] +name = "basic94" +options = ['escaped'] +pattern = '''ab*bc''' +input = '''abbbbc''' +matches = [[0, 6]] + +[[tests]] +name = "basic95" +options = ['escaped'] +pattern = '''ab+bc''' +input = '''abbc''' +matches = [[0, 4]] + +[[tests]] +name = 
"basic96" +options = ['escaped'] +pattern = '''ab+bc''' +input = '''abbbbc''' +matches = [[0, 6]] + +[[tests]] +name = "basic97" +options = ['escaped'] +pattern = '''ab?bc''' +input = '''abbc''' +matches = [[0, 4]] + +[[tests]] +name = "basic98" +options = ['escaped'] +pattern = '''ab?bc''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic99" +options = ['escaped'] +pattern = '''ab?c''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic100" +options = ['escaped'] +pattern = '''^abc$''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic101" +options = ['escaped'] +pattern = '''^abc''' +input = '''abcc''' +matches = [[0, 3]] + +[[tests]] +name = "basic102" +options = ['escaped'] +pattern = '''abc$''' +input = '''aabc''' +matches = [[1, 4]] + +[[tests]] +name = "basic103" +options = ['escaped'] +pattern = '''^''' +input = '''abc''' +matches = [[0, 0]] + +[[tests]] +name = "basic104" +options = ['escaped'] +pattern = '''$''' +input = '''abc''' +matches = [[3, 3]] + +[[tests]] +name = "basic105" +options = ['escaped'] +pattern = '''a.c''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic106" +options = ['escaped'] +pattern = '''a.c''' +input = '''axc''' +matches = [[0, 3]] + +[[tests]] +name = "basic107" +options = ['escaped'] +pattern = '''a.*c''' +input = '''axyzc''' +matches = [[0, 5]] + +[[tests]] +name = "basic108" +options = ['escaped'] +pattern = '''a[bc]d''' +input = '''abd''' +matches = [[0, 3]] + +[[tests]] +name = "basic109" +options = ['escaped'] +pattern = '''a[b-d]e''' +input = '''ace''' +matches = [[0, 3]] + +[[tests]] +name = "basic110" +options = ['escaped'] +pattern = '''a[b-d]''' +input = '''aac''' +matches = [[1, 3]] + +[[tests]] +name = "basic111" +options = ['escaped'] +pattern = '''a[-b]''' +input = '''a-''' +matches = [[0, 2]] + +[[tests]] +name = "basic112" +options = ['escaped'] +pattern = '''a[b-]''' +input = '''a-''' +matches = [[0, 2]] + +[[tests]] +name = "basic113" +options 
= ['escaped'] +pattern = '''a]''' +input = '''a]''' +matches = [[0, 2]] + +[[tests]] +name = "basic114" +options = ['escaped'] +pattern = '''a[]]b''' +input = '''a]b''' +matches = [[0, 3]] + +[[tests]] +name = "basic115" +options = ['escaped'] +pattern = '''a[^bc]d''' +input = '''aed''' +matches = [[0, 3]] + +[[tests]] +name = "basic116" +options = ['escaped'] +pattern = '''a[^-b]c''' +input = '''adc''' +matches = [[0, 3]] + +[[tests]] +name = "basic117" +options = ['escaped'] +pattern = '''a[^]b]c''' +input = '''adc''' +matches = [[0, 3]] + +[[tests]] +name = "basic118" +options = ['escaped'] +pattern = '''ab|cd''' +input = '''abc''' +matches = [[0, 2]] + +[[tests]] +name = "basic119" +options = ['escaped'] +pattern = '''ab|cd''' +input = '''abcd''' +matches = [[0, 2]] + +[[tests]] +name = "basic120" +options = ['escaped'] +pattern = '''a\(b''' +input = '''a(b''' +matches = [[0, 3]] + +[[tests]] +name = "basic121" +options = ['escaped'] +pattern = '''a\(*b''' +input = '''ab''' +matches = [[0, 2]] + +[[tests]] +name = "basic122" +options = ['escaped'] +pattern = '''a\(*b''' +input = '''a((b''' +matches = [[0, 4]] + +[[tests]] +name = "basic123" +options = ['escaped'] +pattern = '''((a))''' +input = '''abc''' +matches = [[0, 1]] + +[[tests]] +name = "basic124" +options = ['escaped'] +pattern = '''(a)b(c)''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic125" +options = ['escaped'] +pattern = '''a+b+c''' +input = '''aabbabc''' +matches = [[4, 7]] + +[[tests]] +name = "basic126" +options = ['escaped'] +pattern = '''a*''' +input = '''aaa''' +matches = [[0, 3]] + +[[tests]] +name = "basic128" +options = ['escaped'] +pattern = '''(a*)*''' +input = '''-''' +matches = [[0, 0]] + +[[tests]] +name = "basic129" +options = ['escaped'] +pattern = '''(a*)+''' +input = '''-''' +matches = [[0, 0]] + +[[tests]] +name = "basic131" +options = ['escaped'] +pattern = '''(a*|b)*''' +input = '''-''' +matches = [[0, 0]] + +[[tests]] +name = "basic132" +options = 
['escaped'] +pattern = '''(a+|b)*''' +input = '''ab''' +matches = [[0, 2]] + +[[tests]] +name = "basic133" +options = ['escaped'] +pattern = '''(a+|b)+''' +input = '''ab''' +matches = [[0, 2]] + +[[tests]] +name = "basic134" +options = ['escaped'] +pattern = '''(a+|b)?''' +input = '''ab''' +matches = [[0, 1]] + +[[tests]] +name = "basic135" +options = ['escaped'] +pattern = '''[^ab]*''' +input = '''cde''' +matches = [[0, 3]] + +[[tests]] +name = "basic137" +options = ['escaped'] +pattern = '''(^)*''' +input = '''-''' +matches = [[0, 0]] + +[[tests]] +name = "basic138" +options = ['escaped'] +pattern = '''a*''' +input = '''''' +matches = [[0, 0]] + +[[tests]] +name = "basic139" +options = ['escaped'] +pattern = '''([abc])*d''' +input = '''abbbcd''' +matches = [[0, 6]] + +[[tests]] +name = "basic140" +options = ['escaped'] +pattern = '''([abc])*bcd''' +input = '''abcd''' +matches = [[0, 4]] + +[[tests]] +name = "basic141" +options = ['escaped'] +pattern = '''a|b|c|d|e''' +input = '''e''' +matches = [[0, 1]] + +[[tests]] +name = "basic142" +options = ['escaped'] +pattern = '''(a|b|c|d|e)f''' +input = '''ef''' +matches = [[0, 2]] + +[[tests]] +name = "basic144" +options = ['escaped'] +pattern = '''((a*|b))*''' +input = '''-''' +matches = [[0, 0]] + +[[tests]] +name = "basic145" +options = ['escaped'] +pattern = '''abcd*efg''' +input = '''abcdefg''' +matches = [[0, 7]] + +[[tests]] +name = "basic146" +options = ['escaped'] +pattern = '''ab*''' +input = '''xabyabbbz''' +matches = [[1, 3]] + +[[tests]] +name = "basic147" +options = ['escaped'] +pattern = '''ab*''' +input = '''xayabbbz''' +matches = [[1, 2]] + +[[tests]] +name = "basic148" +options = ['escaped'] +pattern = '''(ab|cd)e''' +input = '''abcde''' +matches = [[2, 5]] + +[[tests]] +name = "basic149" +options = ['escaped'] +pattern = '''[abhgefdc]ij''' +input = '''hij''' +matches = [[0, 3]] + +[[tests]] +name = "basic150" +options = ['escaped'] +pattern = '''(a|b)c*d''' +input = '''abcd''' +matches = [[1, 4]] + 
+[[tests]] +name = "basic151" +options = ['escaped'] +pattern = '''(ab|ab*)bc''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic152" +options = ['escaped'] +pattern = '''a([bc]*)c*''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic153" +options = ['escaped'] +pattern = '''a([bc]*)(c*d)''' +input = '''abcd''' +matches = [[0, 4]] + +[[tests]] +name = "basic154" +options = ['escaped'] +pattern = '''a([bc]+)(c*d)''' +input = '''abcd''' +matches = [[0, 4]] + +[[tests]] +name = "basic155" +options = ['escaped'] +pattern = '''a([bc]*)(c+d)''' +input = '''abcd''' +matches = [[0, 4]] + +[[tests]] +name = "basic156" +options = ['escaped'] +pattern = '''a[bcd]*dcdcde''' +input = '''adcdcde''' +matches = [[0, 7]] + +[[tests]] +name = "basic157" +options = ['escaped'] +pattern = '''(ab|a)b*c''' +input = '''abc''' +matches = [[0, 3]] + +[[tests]] +name = "basic158" +options = ['escaped'] +pattern = '''((a)(b)c)(d)''' +input = '''abcd''' +matches = [[0, 4]] + +[[tests]] +name = "basic159" +options = ['escaped'] +pattern = '''[A-Za-z_][A-Za-z0-9_]*''' +input = '''alpha''' +matches = [[0, 5]] + +[[tests]] +name = "basic160" +options = ['escaped'] +pattern = '''^a(bc+|b[eh])g|.h$''' +input = '''abh''' +matches = [[1, 3]] + +[[tests]] +name = "basic161" +options = ['escaped'] +pattern = '''(bc+d$|ef*g.|h?i(j|k))''' +input = '''effgz''' +matches = [[0, 5]] + +[[tests]] +name = "basic162" +options = ['escaped'] +pattern = '''(bc+d$|ef*g.|h?i(j|k))''' +input = '''ij''' +matches = [[0, 2]] + +[[tests]] +name = "basic163" +options = ['escaped'] +pattern = '''(bc+d$|ef*g.|h?i(j|k))''' +input = '''reffgz''' +matches = [[1, 6]] + +[[tests]] +name = "basic164" +options = ['escaped'] +pattern = '''(((((((((a)))))))))''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "basic165" +options = ['escaped'] +pattern = '''multiple words''' +input = '''multiple words yeah''' +matches = [[0, 14]] + +[[tests]] +name = "basic166" +options = ['escaped'] 
+pattern = '''(.*)c(.*)''' +input = '''abcde''' +matches = [[0, 5]] + +[[tests]] +name = "basic167" +options = ['escaped'] +pattern = '''abcd''' +input = '''abcd''' +matches = [[0, 4]] + +[[tests]] +name = "basic168" +options = ['escaped'] +pattern = '''a(bc)d''' +input = '''abcd''' +matches = [[0, 4]] + +[[tests]] +name = "basic169" +options = ['escaped'] +pattern = '''a[\x01-\x03]?c''' +input = '''a\x02c''' +matches = [[0, 3]] + +[[tests]] +name = "basic170" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Qaddafi''' +matches = [[0, 15]] + +[[tests]] +name = "basic171" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Mo'ammar Gadhafi''' +matches = [[0, 16]] + +[[tests]] +name = "basic172" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Kaddafi''' +matches = [[0, 15]] + +[[tests]] +name = "basic173" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Qadhafi''' +matches = [[0, 15]] + +[[tests]] +name = "basic174" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Gadafi''' +matches = [[0, 14]] + +[[tests]] +name = "basic175" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Mu'ammar Qadafi''' +matches = [[0, 15]] + +[[tests]] +name = "basic176" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Moamar Gaddafi''' +matches = [[0, 14]] + +[[tests]] +name = "basic177" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Mu'ammar Qadhdhafi''' +matches = [[0, 18]] + +[[tests]] +name = "basic178" +options = 
['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Khaddafi''' +matches = [[0, 16]] + +[[tests]] +name = "basic179" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Ghaddafy''' +matches = [[0, 16]] + +[[tests]] +name = "basic180" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Ghadafi''' +matches = [[0, 15]] + +[[tests]] +name = "basic181" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Ghaddafi''' +matches = [[0, 16]] + +[[tests]] +name = "basic182" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muamar Kaddafi''' +matches = [[0, 14]] + +[[tests]] +name = "basic183" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Quathafi''' +matches = [[0, 16]] + +[[tests]] +name = "basic184" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Muammar Gheddafi''' +matches = [[0, 16]] + +[[tests]] +name = "basic185" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Moammar Khadafy''' +matches = [[0, 15]] + +[[tests]] +name = "basic186" +options = ['escaped'] +pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' +input = '''Moammar Qudhafi''' +matches = [[0, 15]] + +[[tests]] +name = "basic187" +options = ['escaped'] +pattern = '''a+(b|c)*d+''' +input = '''aabcdd''' +matches = [[0, 6]] + +[[tests]] +name = "basic188" +options = ['escaped'] +pattern = '''^.+$''' +input = '''vivi''' +matches = [[0, 4]] + +[[tests]] +name = "basic189" +options = ['escaped'] +pattern = '''^(.+)$''' +input 
= '''vivi''' +matches = [[0, 4]] + +[[tests]] +name = "basic190" +options = ['escaped'] +pattern = '''^([^!.]+).att.com!(.+)$''' +input = '''gryphon.att.com!eby''' +matches = [[0, 19]] + +[[tests]] +name = "basic191" +options = ['escaped'] +pattern = '''^([^!]+!)?([^!]+)$''' +input = '''bas''' +matches = [[0, 3]] + +[[tests]] +name = "basic192" +options = ['escaped'] +pattern = '''^([^!]+!)?([^!]+)$''' +input = '''bar!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic193" +options = ['escaped'] +pattern = '''^([^!]+!)?([^!]+)$''' +input = '''foo!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic194" +options = ['escaped'] +pattern = '''^.+!([^!]+!)([^!]+)$''' +input = '''foo!bar!bas''' +matches = [[0, 11]] + +[[tests]] +name = "basic195" +options = ['escaped'] +pattern = '''((foo)|(bar))!bas''' +input = '''bar!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic196" +options = ['escaped'] +pattern = '''((foo)|(bar))!bas''' +input = '''foo!bar!bas''' +matches = [[4, 11]] + +[[tests]] +name = "basic197" +options = ['escaped'] +pattern = '''((foo)|(bar))!bas''' +input = '''foo!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic198" +options = ['escaped'] +pattern = '''((foo)|bar)!bas''' +input = '''bar!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic199" +options = ['escaped'] +pattern = '''((foo)|bar)!bas''' +input = '''foo!bar!bas''' +matches = [[4, 11]] + +[[tests]] +name = "basic200" +options = ['escaped'] +pattern = '''((foo)|bar)!bas''' +input = '''foo!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic201" +options = ['escaped'] +pattern = '''(foo|(bar))!bas''' +input = '''bar!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic202" +options = ['escaped'] +pattern = '''(foo|(bar))!bas''' +input = '''foo!bar!bas''' +matches = [[4, 11]] + +[[tests]] +name = "basic203" +options = ['escaped'] +pattern = '''(foo|(bar))!bas''' +input = '''foo!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic204" +options = ['escaped'] +pattern = 
'''(foo|bar)!bas''' +input = '''bar!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic205" +options = ['escaped'] +pattern = '''(foo|bar)!bas''' +input = '''foo!bar!bas''' +matches = [[4, 11]] + +[[tests]] +name = "basic206" +options = ['escaped'] +pattern = '''(foo|bar)!bas''' +input = '''foo!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic207" +options = ['escaped'] +pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''foo!bar!bas''' +matches = [[0, 11]] + +[[tests]] +name = "basic208" +options = ['escaped'] +pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' +input = '''bas''' +matches = [[0, 3]] + +[[tests]] +name = "basic209" +options = ['escaped'] +pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' +input = '''bar!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic210" +options = ['escaped'] +pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' +input = '''foo!bar!bas''' +matches = [[0, 11]] + +[[tests]] +name = "basic211" +options = ['escaped'] +pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' +input = '''foo!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic212" +options = ['escaped'] +pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''bas''' +matches = [[0, 3]] + +[[tests]] +name = "basic213" +options = ['escaped'] +pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''bar!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic214" +options = ['escaped'] +pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''foo!bar!bas''' +matches = [[0, 11]] + +[[tests]] +name = "basic215" +options = ['escaped'] +pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' +input = '''foo!bas''' +matches = [[0, 7]] + +[[tests]] +name = "basic216" +options = ['escaped'] +pattern = '''.*(/XXX).*''' +input = '''/XXX''' +matches = [[0, 4]] + +[[tests]] +name = "basic217" +options = ['escaped'] +pattern = '''.*(\\XXX).*''' +input = '''\\XXX''' +matches = [[0, 4]] + +[[tests]] +name = 
"basic218" +options = ['escaped'] +pattern = '''\\XXX''' +input = '''\\XXX''' +matches = [[0, 4]] + +[[tests]] +name = "basic219" +options = ['escaped'] +pattern = '''.*(/000).*''' +input = '''/000''' +matches = [[0, 4]] + +[[tests]] +name = "basic220" +options = ['escaped'] +pattern = '''.*(\\000).*''' +input = '''\\000''' +matches = [[0, 4]] + +[[tests]] +name = "basic221" +options = ['escaped'] +pattern = '''\\000''' +input = '''\\000''' +matches = [[0, 4]] + diff --git a/vendor/regex-automata/data/tests/fowler/fowler-to-toml b/vendor/regex-automata/data/tests/fowler/fowler-to-toml new file mode 100755 index 000000000..5f1d91fcb --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/fowler-to-toml @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +from __future__ import absolute_import, division, print_function +import argparse +import os.path as path + + +def read_tests(f): + basename, _ = path.splitext(path.basename(f)) + tests = [] + prev_pattern = None + + for lineno, line in enumerate(open(f), 1): + fields = list(filter(None, map(str.strip, line.split('\t')))) + if not (4 <= len(fields) <= 5) \ + or 'E' not in fields[0] or fields[0][0] == '#': + continue + + terse_opts, pat, text, sgroups = fields[0:4] + groups = [] # groups as integer ranges + if sgroups == 'NOMATCH': + groups = [] + elif ',' in sgroups: + noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) + for g in noparen: + s, e = map(str.strip, g.split(',')) + groups.append([int(s), int(e)]) + break + else: + # This skips tests that should result in an error. + # There aren't many, so I think we can just capture those + # manually. Possibly fix this in future. 
+ continue + + opts = [] + if text == "NULL": + text = "" + if pat == 'SAME': + pat = prev_pattern + if '$' in terse_opts: + pat = pat.encode('utf-8').decode('unicode_escape') + text = text.encode('utf-8').decode('unicode_escape') + text = text.encode('unicode_escape').decode('utf-8') + opts.append('escaped') + else: + opts.append('escaped') + text = text.encode('unicode_escape').decode('utf-8') + if 'i' in terse_opts: + opts.append('case-insensitive') + + pat = pat.encode('unicode_escape').decode('utf-8') + pat = pat.replace('\\\\', '\\') + tests.append({ + 'name': '"%s%d"' % (basename, lineno), + 'options': repr(opts), + 'pattern': "'''%s'''" % pat, + 'input': "'''%s'''" % text, + 'matches': str(groups), + }) + prev_pattern = pat + return tests + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Generate match tests from an AT&T POSIX test file.') + aa = parser.add_argument + aa('datfile', help='A dat AT&T POSIX test file.') + args = parser.parse_args() + + tests = read_tests(args.datfile) + for t in tests: + print('[[tests]]') + for k, v in t.items(): + print('%s = %s' % (k, v)) + print('') diff --git a/vendor/regex-automata/data/tests/fowler/nullsubexpr.dat b/vendor/regex-automata/data/tests/fowler/nullsubexpr.dat new file mode 100644 index 000000000..2e18fbb91 --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/nullsubexpr.dat @@ -0,0 +1,79 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) 
RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +#E SAME b (0,0)(0,0) +E SAME b (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +#E SAME aaaaaa (0,0)(0,0) +E SAME aaaaaa (0,0)(?,?) RE2/Go +E ([^ab]*)* ccccxx (0,6)(0,6) +#E SAME ababab (0,0)(0,0) +E SAME ababab (0,0)(?,?) RE2/Go + +E ((z)+|a)* zabcde (0,2)(1,2) + +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops +#E (a) aaa (0,1)(0,1) +#E (a*?) aaa (0,0)(0,0) +#E (a)*? aaa (0,0) +#E (a*?)*? aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +#E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/vendor/regex-automata/data/tests/fowler/nullsubexpr.toml b/vendor/regex-automata/data/tests/fowler/nullsubexpr.toml new file mode 100644 index 000000000..331067c60 --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/nullsubexpr.toml @@ -0,0 +1,350 @@ +[[tests]] +name = "nullsubexpr3" +options = ['escaped'] +pattern = '''(a*)*''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr5" +options = ['escaped'] +pattern = 
'''(a*)*''' +input = '''x''' +matches = [[0, 0]] + +[[tests]] +name = "nullsubexpr6" +options = ['escaped'] +pattern = '''(a*)*''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr7" +options = ['escaped'] +pattern = '''(a*)*''' +input = '''aaaaaax''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr8" +options = ['escaped'] +pattern = '''(a*)+''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr9" +options = ['escaped'] +pattern = '''(a*)+''' +input = '''x''' +matches = [[0, 0]] + +[[tests]] +name = "nullsubexpr10" +options = ['escaped'] +pattern = '''(a*)+''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr11" +options = ['escaped'] +pattern = '''(a*)+''' +input = '''aaaaaax''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr12" +options = ['escaped'] +pattern = '''(a+)*''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr13" +options = ['escaped'] +pattern = '''(a+)*''' +input = '''x''' +matches = [[0, 0]] + +[[tests]] +name = "nullsubexpr14" +options = ['escaped'] +pattern = '''(a+)*''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr15" +options = ['escaped'] +pattern = '''(a+)*''' +input = '''aaaaaax''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr16" +options = ['escaped'] +pattern = '''(a+)+''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr17" +options = ['escaped'] +pattern = '''(a+)+''' +input = '''x''' +matches = [] + +[[tests]] +name = "nullsubexpr18" +options = ['escaped'] +pattern = '''(a+)+''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr19" +options = ['escaped'] +pattern = '''(a+)+''' +input = '''aaaaaax''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr21" +options = ['escaped'] +pattern = '''([a]*)*''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr23" +options = ['escaped'] +pattern = '''([a]*)*''' +input = '''x''' +matches = 
[[0, 0]] + +[[tests]] +name = "nullsubexpr24" +options = ['escaped'] +pattern = '''([a]*)*''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr25" +options = ['escaped'] +pattern = '''([a]*)*''' +input = '''aaaaaax''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr26" +options = ['escaped'] +pattern = '''([a]*)+''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr27" +options = ['escaped'] +pattern = '''([a]*)+''' +input = '''x''' +matches = [[0, 0]] + +[[tests]] +name = "nullsubexpr28" +options = ['escaped'] +pattern = '''([a]*)+''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr29" +options = ['escaped'] +pattern = '''([a]*)+''' +input = '''aaaaaax''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr30" +options = ['escaped'] +pattern = '''([^b]*)*''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr32" +options = ['escaped'] +pattern = '''([^b]*)*''' +input = '''b''' +matches = [[0, 0]] + +[[tests]] +name = "nullsubexpr33" +options = ['escaped'] +pattern = '''([^b]*)*''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr34" +options = ['escaped'] +pattern = '''([^b]*)*''' +input = '''aaaaaab''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr35" +options = ['escaped'] +pattern = '''([ab]*)*''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr36" +options = ['escaped'] +pattern = '''([ab]*)*''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr37" +options = ['escaped'] +pattern = '''([ab]*)*''' +input = '''ababab''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr38" +options = ['escaped'] +pattern = '''([ab]*)*''' +input = '''bababa''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr39" +options = ['escaped'] +pattern = '''([ab]*)*''' +input = '''b''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr40" +options = ['escaped'] +pattern = '''([ab]*)*''' +input = 
'''bbbbbb''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr41" +options = ['escaped'] +pattern = '''([ab]*)*''' +input = '''aaaabcde''' +matches = [[0, 5]] + +[[tests]] +name = "nullsubexpr42" +options = ['escaped'] +pattern = '''([^a]*)*''' +input = '''b''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr43" +options = ['escaped'] +pattern = '''([^a]*)*''' +input = '''bbbbbb''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr45" +options = ['escaped'] +pattern = '''([^a]*)*''' +input = '''aaaaaa''' +matches = [[0, 0]] + +[[tests]] +name = "nullsubexpr46" +options = ['escaped'] +pattern = '''([^ab]*)*''' +input = '''ccccxx''' +matches = [[0, 6]] + +[[tests]] +name = "nullsubexpr48" +options = ['escaped'] +pattern = '''([^ab]*)*''' +input = '''ababab''' +matches = [[0, 0]] + +[[tests]] +name = "nullsubexpr50" +options = ['escaped'] +pattern = '''((z)+|a)*''' +input = '''zabcde''' +matches = [[0, 2]] + +[[tests]] +name = "nullsubexpr69" +options = ['escaped'] +pattern = '''(a*)*(x)''' +input = '''x''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr70" +options = ['escaped'] +pattern = '''(a*)*(x)''' +input = '''ax''' +matches = [[0, 2]] + +[[tests]] +name = "nullsubexpr71" +options = ['escaped'] +pattern = '''(a*)*(x)''' +input = '''axa''' +matches = [[0, 2]] + +[[tests]] +name = "nullsubexpr73" +options = ['escaped'] +pattern = '''(a*)+(x)''' +input = '''x''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr74" +options = ['escaped'] +pattern = '''(a*)+(x)''' +input = '''ax''' +matches = [[0, 2]] + +[[tests]] +name = "nullsubexpr75" +options = ['escaped'] +pattern = '''(a*)+(x)''' +input = '''axa''' +matches = [[0, 2]] + +[[tests]] +name = "nullsubexpr77" +options = ['escaped'] +pattern = '''(a*){2}(x)''' +input = '''x''' +matches = [[0, 1]] + +[[tests]] +name = "nullsubexpr78" +options = ['escaped'] +pattern = '''(a*){2}(x)''' +input = '''ax''' +matches = [[0, 2]] + +[[tests]] +name = "nullsubexpr79" +options = ['escaped'] +pattern = 
'''(a*){2}(x)''' +input = '''axa''' +matches = [[0, 2]] + diff --git a/vendor/regex-automata/data/tests/fowler/repetition-long.dat b/vendor/regex-automata/data/tests/fowler/repetition-long.dat new file mode 100644 index 000000000..c91580236 --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/repetition-long.dat @@ -0,0 +1,85 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my 
regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). + +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) + +# The above worked on Linux/GLIBC but the following often fail. 
+# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/vendor/regex-automata/data/tests/fowler/repetition-long.toml b/vendor/regex-automata/data/tests/fowler/repetition-long.toml new file mode 100644 index 000000000..e0b2ea76b --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/repetition-long.toml @@ -0,0 +1,294 @@ +[[tests]] +name = "repetition-long12" +options = ['escaped'] +pattern = '''X(.?){0,}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long13" +options = ['escaped'] +pattern = '''X(.?){1,}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long14" +options = ['escaped'] +pattern = '''X(.?){2,}Y''' +input = 
'''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long15" +options = ['escaped'] +pattern = '''X(.?){3,}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long16" +options = ['escaped'] +pattern = '''X(.?){4,}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long17" +options = ['escaped'] +pattern = '''X(.?){5,}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long18" +options = ['escaped'] +pattern = '''X(.?){6,}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long19" +options = ['escaped'] +pattern = '''X(.?){7,}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long20" +options = ['escaped'] +pattern = '''X(.?){8,}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long22" +options = ['escaped'] +pattern = '''X(.?){0,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long24" +options = ['escaped'] +pattern = '''X(.?){1,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long26" +options = ['escaped'] +pattern = '''X(.?){2,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long28" +options = ['escaped'] +pattern = '''X(.?){3,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long30" +options = ['escaped'] +pattern = '''X(.?){4,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long32" +options = ['escaped'] +pattern = '''X(.?){5,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long34" +options = ['escaped'] +pattern = '''X(.?){6,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long36" +options = ['escaped'] +pattern = '''X(.?){7,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = 
"repetition-long37" +options = ['escaped'] +pattern = '''X(.?){8,8}Y''' +input = '''X1234567Y''' +matches = [[0, 9]] + +[[tests]] +name = "repetition-long48" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){0,}(d*)''' +input = '''ababcd''' +matches = [[0, 1]] + +[[tests]] +name = "repetition-long49" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){1,}(d*)''' +input = '''ababcd''' +matches = [[0, 1]] + +[[tests]] +name = "repetition-long50" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){2,}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long51" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){3,}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long52" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){4,}(d*)''' +input = '''ababcd''' +matches = [] + +[[tests]] +name = "repetition-long53" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){0,10}(d*)''' +input = '''ababcd''' +matches = [[0, 1]] + +[[tests]] +name = "repetition-long54" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){1,10}(d*)''' +input = '''ababcd''' +matches = [[0, 1]] + +[[tests]] +name = "repetition-long55" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){2,10}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long56" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){3,10}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long57" +options = ['escaped'] +pattern = '''(a|ab|c|bcd){4,10}(d*)''' +input = '''ababcd''' +matches = [] + +[[tests]] +name = "repetition-long58" +options = ['escaped'] +pattern = '''(a|ab|c|bcd)*(d*)''' +input = '''ababcd''' +matches = [[0, 1]] + +[[tests]] +name = "repetition-long59" +options = ['escaped'] +pattern = '''(a|ab|c|bcd)+(d*)''' +input = '''ababcd''' +matches = [[0, 1]] + +[[tests]] +name = "repetition-long65" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){0,}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name 
= "repetition-long67" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){1,}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long69" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){2,}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long71" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){3,}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long72" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){4,}(d*)''' +input = '''ababcd''' +matches = [] + +[[tests]] +name = "repetition-long74" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){0,10}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long76" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){1,10}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long78" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){2,10}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long80" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){3,10}(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long81" +options = ['escaped'] +pattern = '''(ab|a|c|bcd){4,10}(d*)''' +input = '''ababcd''' +matches = [] + +[[tests]] +name = "repetition-long83" +options = ['escaped'] +pattern = '''(ab|a|c|bcd)*(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + +[[tests]] +name = "repetition-long85" +options = ['escaped'] +pattern = '''(ab|a|c|bcd)+(d*)''' +input = '''ababcd''' +matches = [[0, 6]] + diff --git a/vendor/regex-automata/data/tests/fowler/repetition.dat b/vendor/regex-automata/data/tests/fowler/repetition.dat new file mode 100644 index 000000000..2dac0823f --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/repetition.dat @@ -0,0 +1,83 @@ +NOTE implicit vs. 
explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 
+ +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) diff --git a/vendor/regex-automata/data/tests/fowler/repetition.toml b/vendor/regex-automata/data/tests/fowler/repetition.toml new file mode 100644 index 000000000..43280a409 --- /dev/null +++ b/vendor/regex-automata/data/tests/fowler/repetition.toml @@ -0,0 +1,343 @@ +[[tests]] +name = "repetition10" +options = ['escaped'] +pattern = '''((..)|(.))''' +input = '''''' +matches = [] + +[[tests]] +name = "repetition11" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))''' +input = '''''' +matches = [] + +[[tests]] +name = "repetition12" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''''' +matches = [] + +[[tests]] +name = "repetition14" +options = ['escaped'] +pattern = '''((..)|(.)){1}''' +input = '''''' +matches = [] + +[[tests]] +name = "repetition15" +options = ['escaped'] +pattern = '''((..)|(.)){2}''' +input = '''''' +matches = [] + +[[tests]] +name = "repetition16" +options = ['escaped'] +pattern = '''((..)|(.)){3}''' +input = '''''' +matches = [] + +[[tests]] +name = "repetition18" +options = ['escaped'] +pattern = '''((..)|(.))*''' +input = '''''' +matches = [[0, 0]] + 
+[[tests]] +name = "repetition20" +options = ['escaped'] +pattern = '''((..)|(.))''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "repetition21" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))''' +input = '''a''' +matches = [] + +[[tests]] +name = "repetition22" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''a''' +matches = [] + +[[tests]] +name = "repetition24" +options = ['escaped'] +pattern = '''((..)|(.)){1}''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "repetition25" +options = ['escaped'] +pattern = '''((..)|(.)){2}''' +input = '''a''' +matches = [] + +[[tests]] +name = "repetition26" +options = ['escaped'] +pattern = '''((..)|(.)){3}''' +input = '''a''' +matches = [] + +[[tests]] +name = "repetition28" +options = ['escaped'] +pattern = '''((..)|(.))*''' +input = '''a''' +matches = [[0, 1]] + +[[tests]] +name = "repetition30" +options = ['escaped'] +pattern = '''((..)|(.))''' +input = '''aa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition31" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))''' +input = '''aa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition32" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aa''' +matches = [] + +[[tests]] +name = "repetition34" +options = ['escaped'] +pattern = '''((..)|(.)){1}''' +input = '''aa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition35" +options = ['escaped'] +pattern = '''((..)|(.)){2}''' +input = '''aa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition36" +options = ['escaped'] +pattern = '''((..)|(.)){3}''' +input = '''aa''' +matches = [] + +[[tests]] +name = "repetition38" +options = ['escaped'] +pattern = '''((..)|(.))*''' +input = '''aa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition40" +options = ['escaped'] +pattern = '''((..)|(.))''' +input = '''aaa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition41" +options = ['escaped'] +pattern = 
'''((..)|(.))((..)|(.))''' +input = '''aaa''' +matches = [[0, 3]] + +[[tests]] +name = "repetition42" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aaa''' +matches = [[0, 3]] + +[[tests]] +name = "repetition44" +options = ['escaped'] +pattern = '''((..)|(.)){1}''' +input = '''aaa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition46" +options = ['escaped'] +pattern = '''((..)|(.)){2}''' +input = '''aaa''' +matches = [[0, 3]] + +[[tests]] +name = "repetition47" +options = ['escaped'] +pattern = '''((..)|(.)){3}''' +input = '''aaa''' +matches = [[0, 3]] + +[[tests]] +name = "repetition50" +options = ['escaped'] +pattern = '''((..)|(.))*''' +input = '''aaa''' +matches = [[0, 3]] + +[[tests]] +name = "repetition52" +options = ['escaped'] +pattern = '''((..)|(.))''' +input = '''aaaa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition53" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))''' +input = '''aaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition54" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition56" +options = ['escaped'] +pattern = '''((..)|(.)){1}''' +input = '''aaaa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition57" +options = ['escaped'] +pattern = '''((..)|(.)){2}''' +input = '''aaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition59" +options = ['escaped'] +pattern = '''((..)|(.)){3}''' +input = '''aaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition61" +options = ['escaped'] +pattern = '''((..)|(.))*''' +input = '''aaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition63" +options = ['escaped'] +pattern = '''((..)|(.))''' +input = '''aaaaa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition64" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))''' +input = '''aaaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition65" +options = ['escaped'] +pattern = 
'''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aaaaa''' +matches = [[0, 5]] + +[[tests]] +name = "repetition67" +options = ['escaped'] +pattern = '''((..)|(.)){1}''' +input = '''aaaaa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition68" +options = ['escaped'] +pattern = '''((..)|(.)){2}''' +input = '''aaaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition70" +options = ['escaped'] +pattern = '''((..)|(.)){3}''' +input = '''aaaaa''' +matches = [[0, 5]] + +[[tests]] +name = "repetition73" +options = ['escaped'] +pattern = '''((..)|(.))*''' +input = '''aaaaa''' +matches = [[0, 5]] + +[[tests]] +name = "repetition75" +options = ['escaped'] +pattern = '''((..)|(.))''' +input = '''aaaaaa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition76" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))''' +input = '''aaaaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition77" +options = ['escaped'] +pattern = '''((..)|(.))((..)|(.))((..)|(.))''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "repetition79" +options = ['escaped'] +pattern = '''((..)|(.)){1}''' +input = '''aaaaaa''' +matches = [[0, 2]] + +[[tests]] +name = "repetition80" +options = ['escaped'] +pattern = '''((..)|(.)){2}''' +input = '''aaaaaa''' +matches = [[0, 4]] + +[[tests]] +name = "repetition81" +options = ['escaped'] +pattern = '''((..)|(.)){3}''' +input = '''aaaaaa''' +matches = [[0, 6]] + +[[tests]] +name = "repetition83" +options = ['escaped'] +pattern = '''((..)|(.))*''' +input = '''aaaaaa''' +matches = [[0, 6]] + diff --git a/vendor/regex-automata/data/tests/iter.toml b/vendor/regex-automata/data/tests/iter.toml new file mode 100644 index 000000000..30abae86e --- /dev/null +++ b/vendor/regex-automata/data/tests/iter.toml @@ -0,0 +1,92 @@ +[[tests]] +name = "iter1" +pattern = "a" +input = "aaa" +matches = [[0, 1], [1, 2], [2, 3]] + +[[tests]] +name = "iter2" +pattern = "a" +input = "aba" +matches = [[0, 1], [2, 3]] + +[[tests]] +name = "iter-empty1" +pattern = 
'' +input = '' +matches = [[0, 0]] + +[[tests]] +name = "iter-empty2" +pattern = '' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty3" +pattern = '()' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty4" +pattern = '()*' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty5" +pattern = '()+' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty6" +pattern = '()?' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty7" +pattern = '()()' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty8" +pattern = '()+|z' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty9" +pattern = 'z|()+' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty10" +pattern = '()+|b' +input = 'abc' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] +name = "iter-empty11" +pattern = 'b|()+' +input = 'abc' +matches = [[0, 0], [1, 2], [3, 3]] + + +[[tests]] +options = ["anchored"] +name = "iter-anchored1" +pattern = "a" +input = "a" +matches = [[0, 1]] + +[[tests]] +options = ["anchored"] +name = "iter-anchored2" +pattern = "a" +input = "aa" +matches = [[0, 1]] diff --git a/vendor/regex-automata/data/tests/no-unicode.toml b/vendor/regex-automata/data/tests/no-unicode.toml new file mode 100644 index 000000000..16e02b426 --- /dev/null +++ b/vendor/regex-automata/data/tests/no-unicode.toml @@ -0,0 +1,138 @@ +[[tests]] +name = "invalid-utf8-literal1" +options = ["escaped", "invalid-utf8", "no-unicode"] +pattern = '\xFF' +input = '\xFF' +matches = [[0, 1]] + + +[[tests]] +name = "no-unicode-mixed" +options = ["escaped", "invalid-utf8"] +pattern = '(.+)(?-u)(.+)' +input = '\xCE\x93\xCE\x94\xFF' +matches = [[0, 5]] + + +[[tests]] +name = "no-unicode-case1" +options = ["case-insensitive", 
"no-unicode"] +pattern = "a" +input = "A" +matches = [[0, 1]] + +[[tests]] +name = "no-unicode-case2" +options = ["case-insensitive", "no-unicode"] +pattern = "[a-z]+" +input = "AaAaA" +matches = [[0, 5]] + +[[tests]] +name = "no-unicode-case3" +options = ["case-insensitive"] +pattern = "[a-z]+" +input = "aA\u212AaA" +matches = [[0, 7]] + +[[tests]] +name = "no-unicode-case4" +options = ["case-insensitive", "no-unicode"] +pattern = "[a-z]+" +input = "aA\u212AaA" +matches = [[0, 2]] + + +[[tests]] +name = "no-unicode-negate1" +options = [] +pattern = "[^a]" +input = "δ" +matches = [[0, 2]] + +[[tests]] +name = "no-unicode-negate2" +options = ["no-unicode", "invalid-utf8"] +pattern = "[^a]" +input = "δ" +matches = [[0, 1]] + + +[[tests]] +name = "no-unicode-dotstar-prefix1" +options = ["escaped", "no-unicode", "invalid-utf8"] +pattern = "a" +input = '\xFFa' +matches = [[1, 2]] + +[[tests]] +name = "no-unicode-dotstar-prefix2" +options = ["escaped", "invalid-utf8"] +pattern = "a" +input = '\xFFa' +matches = [[1, 2]] + + +[[tests]] +name = "no-unicode-null-bytes1" +options = ["escaped", "no-unicode", "invalid-utf8"] +pattern = '[^\x00]+\x00' +input = 'foo\x00' +matches = [[0, 4]] + + +[[tests]] +name = "no-unicode1" +options = ["no-unicode"] +pattern = '\w+' +input = "aδ" +matches = [[0, 1]] + +[[tests]] +name = "no-unicode2" +options = [] +pattern = '\w+' +input = "aδ" +matches = [[0, 3]] + +[[tests]] +name = "no-unicode3" +options = ["no-unicode"] +pattern = '\d+' +input = "1२३9" +matches = [[0, 1]] + +[[tests]] +name = "no-unicode4" +pattern = '\d+' +input = "1२३9" +matches = [[0, 8]] + +[[tests]] +name = "no-unicode5" +options = ["no-unicode"] +pattern = '\s+' +input = " \u1680" +matches = [[0, 1]] + +[[tests]] +name = "no-unicode6" +pattern = '\s+' +input = " \u1680" +matches = [[0, 4]] + + +[[tests]] +# See: https://github.com/rust-lang/regex/issues/484 +name = "no-unicode-iter1" +pattern = '' +input = "☃" +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] + +[[tests]] 
+# See: https://github.com/rust-lang/regex/issues/484 +options = ['escaped'] +name = "no-unicode-iter2" +pattern = '' +input = 'b\xFFr' +matches = [[0, 0], [1, 1], [2, 2], [3, 3]] diff --git a/vendor/regex-automata/data/tests/unicode.toml b/vendor/regex-automata/data/tests/unicode.toml new file mode 100644 index 000000000..845393f28 --- /dev/null +++ b/vendor/regex-automata/data/tests/unicode.toml @@ -0,0 +1,489 @@ +[[tests]] +name = "unicode-literal1" +pattern = '☃' +input = "☃" +matches = [[0, 3]] + +[[tests]] +name = "unicode-literal2" +pattern = '☃+' +input = "☃" +matches = [[0, 3]] + +[[tests]] +name = "unicode-literal3" +options = ["case-insensitive"] +pattern = '☃+' +input = "☃" +matches = [[0, 3]] + +[[tests]] +name = "unicode-literal4" +options = ["case-insensitive"] +pattern = 'Δ' +input = "δ" +matches = [[0, 2]] + + +[[tests]] +name = "unicode-class1" +pattern = '[☃Ⅰ]+' +input = "☃" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class2" +pattern = '\pN' +input = "Ⅰ" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class3" +pattern = '\pN+' +input = "Ⅰ1Ⅱ2" +matches = [[0, 8]] + +[[tests]] +name = "unicode-class4" +pattern = '\PN+' +input = "abⅠ" +matches = [[0, 2]] + +[[tests]] +name = "unicode-class5" +pattern = '[\PN]+' +input = "abⅠ" +matches = [[0, 2]] + +[[tests]] +name = "unicode-class6" +pattern = '[^\PN]+' +input = "abⅠ" +matches = [[2, 5]] + +[[tests]] +name = "unicode-class7" +pattern = '\p{Lu}+' +input = "ΛΘΓΔα" +matches = [[0, 8]] + +[[tests]] +name = "unicode-class8" +options = ["case-insensitive"] +pattern = '\p{Lu}+' +input = "ΛΘΓΔα" +matches = [[0, 10]] + +[[tests]] +name = "unicode-class9" +pattern = '\pL+' +input = "ΛΘΓΔα" +matches = [[0, 10]] + +[[tests]] +name = "unicode-class10" +pattern = '\p{Ll}+' +input = "ΛΘΓΔα" +matches = [[8, 10]] + + +[[tests]] +name = "unicode-perl1" +pattern = '\w+' +input = "dδd" +matches = [[0, 4]] + +[[tests]] +name = "unicode-perl2" +pattern = '\w+' +input = "⥡" +matches = [] + +[[tests]] +name = 
"unicode-perl3" +pattern = '\W+' +input = "⥡" +matches = [[0, 3]] + +[[tests]] +name = "unicode-perl4" +pattern = '\d+' +input = "1२३9" +matches = [[0, 8]] + +[[tests]] +name = "unicode-perl5" +pattern = '\d+' +input = "Ⅱ" +matches = [] + +[[tests]] +name = "unicode-perl6" +pattern = '\D+' +input = "Ⅱ" +matches = [[0, 3]] + +[[tests]] +name = "unicode-perl7" +pattern = '\s+' +input = " " +matches = [[0, 3]] + +[[tests]] +name = "unicode-perl8" +pattern = '\s+' +input = "☃" +matches = [] + +[[tests]] +name = "unicode-perl9" +pattern = '\S+' +input = "☃" +matches = [[0, 3]] + + +[[tests]] +name = "unicode-class-gencat1" +pattern = '\p{Cased_Letter}' +input = "A" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat2" +pattern = '\p{Close_Punctuation}' +input = "❯" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat3" +pattern = '\p{Connector_Punctuation}' +input = "⁀" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat4" +pattern = '\p{Control}' +input = "\u009F" +matches = [[0, 2]] + +[[tests]] +name = "unicode-class-gencat5" +pattern = '\p{Currency_Symbol}' +input = "£" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat6" +pattern = '\p{Dash_Punctuation}' +input = "〰" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat7" +pattern = '\p{Decimal_Number}' +input = "𑓙" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat8" +pattern = '\p{Enclosing_Mark}' +input = "\uA672" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat9" +pattern = '\p{Final_Punctuation}' +input = "⸡" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat10" +pattern = '\p{Format}' +input = "\U000E007F" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat11" +pattern = '\p{Initial_Punctuation}' +input = "⸜" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat12" +pattern = '\p{Letter}' +input = "Έ" +matches = [[0, 2]] + +[[tests]] +name = "unicode-class-gencat13" +pattern = '\p{Letter_Number}' 
+input = "ↂ" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat14" +pattern = '\p{Line_Separator}' +input = "\u2028" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat15" +pattern = '\p{Lowercase_Letter}' +input = "ϛ" +matches = [[0, 2]] + +[[tests]] +name = "unicode-class-gencat16" +pattern = '\p{Mark}' +input = "\U000E01EF" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat17" +pattern = '\p{Math}' +input = "⋿" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat18" +pattern = '\p{Modifier_Letter}' +input = "𖭃" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat19" +pattern = '\p{Modifier_Symbol}' +input = "🏿" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat20" +pattern = '\p{Nonspacing_Mark}' +input = "\U0001E94A" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat21" +pattern = '\p{Number}' +input = "⓿" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat22" +pattern = '\p{Open_Punctuation}' +input = "⦅" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat23" +pattern = '\p{Other}' +input = "\u0BC9" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat24" +pattern = '\p{Other_Letter}' +input = "ꓷ" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat25" +pattern = '\p{Other_Number}' +input = "㉏" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat26" +pattern = '\p{Other_Punctuation}' +input = "𞥞" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat27" +pattern = '\p{Other_Symbol}' +input = "⅌" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat28" +pattern = '\p{Paragraph_Separator}' +input = "\u2029" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat29" +pattern = '\p{Private_Use}' +input = "\U0010FFFD" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat30" +pattern = '\p{Punctuation}' +input = "𑁍" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat31" +pattern = 
'\p{Separator}' +input = "\u3000" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat32" +pattern = '\p{Space_Separator}' +input = "\u205F" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat33" +pattern = '\p{Spacing_Mark}' +input = "\U00016F7E" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat34" +pattern = '\p{Symbol}' +input = "⯈" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat35" +pattern = '\p{Titlecase_Letter}' +input = "ῼ" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gencat36" +pattern = '\p{Unassigned}' +input = "\U0010FFFF" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gencat37" +pattern = '\p{Uppercase_Letter}' +input = "Ꝋ" +matches = [[0, 3]] + + +[[tests]] +name = "unicode-class-emoji1" +pattern = '\p{Emoji}' +input = "\u23E9" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-emoji2" +pattern = '\p{emoji}' +input = "\U0001F21A" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-emoji3" +pattern = '\p{extendedpictographic}' +input = "\U0001FA6E" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-emoji4" +pattern = '\p{extendedpictographic}' +input = "\U0001FFFD" +matches = [[0, 4]] + + +[[tests]] +name = "unicode-class-gcb1" +pattern = '\p{grapheme_cluster_break=prepend}' +input = "\U00011D46" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gcb2" +pattern = '\p{gcb=regional_indicator}' +input = "\U0001F1E6" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gcb3" +pattern = '\p{gcb=ri}' +input = "\U0001F1E7" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gcb4" +pattern = '\p{regionalindicator}' +input = "\U0001F1FF" +matches = [[0, 4]] + +[[tests]] +name = "unicode-class-gcb5" +pattern = '\p{gcb=lvt}' +input = "\uC989" +matches = [[0, 3]] + +[[tests]] +name = "unicode-class-gcb6" +pattern = '\p{gcb=zwj}' +input = "\u200D" +matches = [[0, 3]] + + +[[tests]] +name = "unicode-class-word-break1" +pattern = '\p{word_break=Hebrew_Letter}' +input = 
use core::convert::TryInto;

/// Reads and writes unsigned integers from/to byte slices in one specific
/// byte order.
///
/// Every method panics if `buf` is shorter than the number of bytes being
/// read or written.
pub trait ByteOrder {
    /// Decodes a `u16` from the first 2 bytes of `buf`.
    fn read_u16(buf: &[u8]) -> u16;
    /// Decodes a `u32` from the first 4 bytes of `buf`.
    fn read_u32(buf: &[u8]) -> u32;
    /// Decodes a `u64` from the first 8 bytes of `buf`.
    fn read_u64(buf: &[u8]) -> u64;
    /// Decodes an unsigned integer stored in the first `nbytes` bytes of
    /// `buf` and zero-extends it to a `u64`.
    ///
    /// Panics if `nbytes > 8`.
    fn read_uint(buf: &[u8], nbytes: usize) -> u64;
    /// Encodes `n` into the first 2 bytes of `buf`.
    fn write_u16(buf: &mut [u8], n: u16);
    /// Encodes `n` into the first 4 bytes of `buf`.
    fn write_u32(buf: &mut [u8], n: u32);
    /// Encodes `n` into the first 8 bytes of `buf`.
    fn write_u64(buf: &mut [u8], n: u64);
    /// Encodes the `nbytes` least significant bytes of `n` into the first
    /// `nbytes` bytes of `buf`. Higher-order bytes of `n` are discarded.
    ///
    /// Panics if `nbytes > 8`.
    fn write_uint(buf: &mut [u8], n: u64, nbytes: usize);
}

/// Most significant byte first.
pub enum BigEndian {}
/// Least significant byte first.
pub enum LittleEndian {}
/// Whatever byte order the compilation target uses.
pub enum NativeEndian {}

/// Implements `ByteOrder` for the marker type `$t` using the integer
/// `from_*_bytes` / `to_*_bytes` pair named by `$from_endian` / `$to_endian`.
macro_rules! impl_endian {
    ($t:ty, $from_endian:ident, $to_endian:ident) => {
        impl ByteOrder for $t {
            #[inline]
            fn read_u16(buf: &[u8]) -> u16 {
                u16::$from_endian(buf[0..2].try_into().unwrap())
            }

            #[inline]
            fn read_u32(buf: &[u8]) -> u32 {
                u32::$from_endian(buf[0..4].try_into().unwrap())
            }

            #[inline]
            fn read_u64(buf: &[u8]) -> u64 {
                u64::$from_endian(buf[0..8].try_into().unwrap())
            }

            #[inline]
            fn read_uint(buf: &[u8], nbytes: usize) -> u64 {
                assert!(nbytes <= 8, "nbytes must be at most 8");
                // The `nbytes` low-order bytes of a u64 sit at the front of
                // its 8-byte encoding in little-endian order, but at the
                // back in big-endian order. Probing the encoding of `1`
                // tells us which layout `$to_endian` produces (this also
                // resolves the native order without any `cfg`).
                let start =
                    if u64::$to_endian(1)[0] == 1 { 0 } else { 8 - nbytes };
                let mut dst = [0u8; 8];
                dst[start..start + nbytes].copy_from_slice(&buf[..nbytes]);
                u64::$from_endian(dst)
            }

            #[inline]
            fn write_u16(buf: &mut [u8], n: u16) {
                buf[0..2].copy_from_slice(&n.$to_endian()[..]);
            }

            #[inline]
            fn write_u32(buf: &mut [u8], n: u32) {
                buf[0..4].copy_from_slice(&n.$to_endian()[..]);
            }

            #[inline]
            fn write_u64(buf: &mut [u8], n: u64) {
                buf[0..8].copy_from_slice(&n.$to_endian()[..]);
            }

            #[inline]
            fn write_uint(buf: &mut [u8], n: u64, nbytes: usize) {
                assert!(nbytes <= 8, "nbytes must be at most 8");
                // Same window selection as `read_uint`, so that a value
                // written with `nbytes` bytes reads back unchanged.
                let start =
                    if u64::$to_endian(1)[0] == 1 { 0 } else { 8 - nbytes };
                buf[..nbytes]
                    .copy_from_slice(&n.$to_endian()[start..start + nbytes]);
            }
        }
    };
}

impl_endian! {
    BigEndian, from_be_bytes, to_be_bytes
}

impl_endian! {
    LittleEndian, from_le_bytes, to_le_bytes
}

impl_endian! {
    NativeEndian, from_ne_bytes, to_ne_bytes
}
+ pub fn empty() -> ByteClasses { + ByteClasses([0; 256]) + } + + /// Creates a new set of equivalence classes where each byte belongs to + /// its own equivalence class. + pub fn singletons() -> ByteClasses { + let mut classes = ByteClasses::empty(); + for i in 0..256 { + classes.set(i as u8, i as u8); + } + classes + } + + /// Copies the byte classes given. The given slice must have length 0 or + /// length 256. Slices of length 0 are treated as singletons (every byte + /// is its own class). + pub fn from_slice(slice: &[u8]) -> ByteClasses { + assert!(slice.is_empty() || slice.len() == 256); + + if slice.is_empty() { + ByteClasses::singletons() + } else { + let mut classes = ByteClasses::empty(); + for (b, &class) in slice.iter().enumerate() { + classes.set(b as u8, class); + } + classes + } + } + + /// Set the equivalence class for the given byte. + #[inline] + pub fn set(&mut self, byte: u8, class: u8) { + self.0[byte as usize] = class; + } + + /// Get the equivalence class for the given byte. + #[inline] + pub fn get(&self, byte: u8) -> u8 { + self.0[byte as usize] + } + + /// Get the equivalence class for the given byte while forcefully + /// eliding bounds checks. + #[inline] + pub unsafe fn get_unchecked(&self, byte: u8) -> u8 { + *self.0.get_unchecked(byte as usize) + } + + /// Return the total number of elements in the alphabet represented by + /// these equivalence classes. Equivalently, this returns the total number + /// of equivalence classes. + #[inline] + pub fn alphabet_len(&self) -> usize { + self.0[255] as usize + 1 + } + + /// Returns true if and only if every byte in this class maps to its own + /// equivalence class. Equivalently, there are 256 equivalence classes + /// and each class contains exactly one byte. + #[inline] + pub fn is_singleton(&self) -> bool { + self.alphabet_len() == 256 + } + + /// Returns an iterator over a sequence of representative bytes from each + /// equivalence class. 
Namely, this yields exactly N items, where N is + /// equivalent to the number of equivalence classes. Each item is an + /// arbitrary byte drawn from each equivalence class. + /// + /// This is useful when one is determinizing an NFA and the NFA's alphabet + /// hasn't been converted to equivalence classes yet. Picking an arbitrary + /// byte from each equivalence class then permits a full exploration of + /// the NFA instead of using every possible byte value. + #[cfg(feature = "std")] + pub fn representatives(&self) -> ByteClassRepresentatives { + ByteClassRepresentatives { classes: self, byte: 0, last_class: None } + } + + /// Returns all of the bytes in the given equivalence class. + /// + /// The second element in the tuple indicates the number of elements in + /// the array. + fn elements(&self, equiv: u8) -> ([u8; 256], usize) { + let (mut array, mut len) = ([0; 256], 0); + for b in 0..256 { + if self.get(b as u8) == equiv { + array[len] = b as u8; + len += 1; + } + } + (array, len) + } +} + +impl fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses({{singletons}})") + } else { + write!(f, "ByteClasses(")?; + for equiv in 0..self.alphabet_len() { + let (members, len) = self.elements(equiv as u8); + write!(f, "{} => {:?}", equiv, &members[..len])?; + } + write!(f, ")") + } + } +} + +/// An iterator over representative bytes from each equivalence class. 
+#[cfg(feature = "std")] +#[derive(Debug)] +pub struct ByteClassRepresentatives<'a> { + classes: &'a ByteClasses, + byte: usize, + last_class: Option<u8>, +} + +#[cfg(feature = "std")] +impl<'a> Iterator for ByteClassRepresentatives<'a> { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + while self.byte < 256 { + let byte = self.byte as u8; + let class = self.classes.get(byte); + self.byte += 1; + + if self.last_class != Some(class) { + self.last_class = Some(class); + return Some(byte); + } + } + None + } +} + +/// A byte class set keeps track of an *approximation* of equivalence classes +/// of bytes during NFA construction. That is, every byte in an equivalence +/// class cannot discriminate between a match and a non-match. +/// +/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the +/// same equivalence class because it never matters whether an `a` or a `b` is +/// seen, and no combination of `a`s and `b`s in the text can discriminate +/// a match. +/// +/// Note though that this does not compute the minimal set of equivalence +/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the +/// same equivalence class for the same reason that `a` and `b` are in the +/// same equivalence class in the aforementioned regex. However, in this +/// implementation, `a` and `c` are put into distinct equivalence classes. +/// The reason for this is implementation complexity. In the future, we should +/// endeavor to compute the minimal equivalence classes since they can have a +/// rather large impact on the size of the DFA. +/// +/// The representation here is 256 booleans, all initially set to false. Each +/// boolean maps to its corresponding byte based on position. A `true` value +/// indicates the end of an equivalence class, where its corresponding byte +/// and all of the bytes corresponding to all previous contiguous `false` +/// values are in the same equivalence class. 
+/// +/// This particular representation only permits contiguous ranges of bytes to +/// be in the same equivalence class, which means that we can never discover +/// the true minimal set of equivalence classes. +#[cfg(feature = "std")] +#[derive(Debug)] +pub struct ByteClassSet(Vec<bool>); + +#[cfg(feature = "std")] +impl ByteClassSet { + /// Create a new set of byte classes where all bytes are part of the same + /// equivalence class. + pub fn new() -> Self { + ByteClassSet(vec![false; 256]) + } + + /// Indicate the the range of byte given (inclusive) can discriminate a + /// match between it and all other bytes outside of the range. + pub fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0[start as usize - 1] = true; + } + self.0[end as usize] = true; + } + + /// Convert this boolean set to a map that maps all byte values to their + /// corresponding equivalence class. The last mapping indicates the largest + /// equivalence class identifier (which is never bigger than 255). 
+ pub fn byte_classes(&self) -> ByteClasses { + let mut classes = ByteClasses::empty(); + let mut class = 0u8; + let mut i = 0; + loop { + classes.set(i as u8, class as u8); + if i >= 255 { + break; + } + if self.0[i] { + class = class.checked_add(1).unwrap(); + } + i += 1; + } + classes + } +} + +#[cfg(test)] +mod tests { + #[cfg(feature = "std")] + #[test] + fn byte_classes() { + use super::ByteClassSet; + + let mut set = ByteClassSet::new(); + set.set_range(b'a', b'z'); + + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassSet::new(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[cfg(feature = "std")] + #[test] + fn full_byte_classes() { + use super::ByteClassSet; + + let mut set = ByteClassSet::new(); + for i in 0..256u16 { + set.set_range(i as u8, i as u8); + } + assert_eq!(set.byte_classes().alphabet_len(), 256); + } +} diff --git a/vendor/regex-automata/src/codegen.rs b/vendor/regex-automata/src/codegen.rs new file mode 100644 index 000000000..b2aacbb46 --- /dev/null +++ b/vendor/regex-automata/src/codegen.rs @@ -0,0 +1,104 @@ +// This module is unused. It was written as an experiment to get a ballpark +// idea of what state machines look like when translated to Rust code, and +// in particular, an idea of how much code it generates. 
The implementation +// below isn't optimal with respect to size, but the result wasn't exactly +// small. At some point, we should pursue building this out beyond +// experimentation, and in particular, probably provide a command line tool +// and/or a macro. It's a fair bit of work, so I abandoned it for the initial +// release. ---AG + +use std::collections::HashMap; +use std::io::Write; + +use dense::DFA; +use state_id::StateID; + +macro_rules! wstr { + ($($tt:tt)*) => { write!($($tt)*).unwrap() } +} + +macro_rules! wstrln { + ($($tt:tt)*) => { writeln!($($tt)*).unwrap() } +} + +pub fn is_match_forward<S: StateID>(dfa: &DFA<S>) -> String { + let names = state_variant_names(dfa); + + let mut buf = vec![]; + wstrln!(buf, "pub fn is_match(input: &[u8]) -> bool {{"); + if dfa.is_match_state(dfa.start()) { + wstrln!(buf, " return true;"); + wstrln!(buf, "}}"); + return String::from_utf8(buf).unwrap(); + } + + wstrln!(buf, "{}", state_enum_def(dfa, &names)); + + wstrln!(buf, " let mut state = {};", names[&dfa.start()]); + wstrln!(buf, " for &b in input.iter() {{"); + wstrln!(buf, " state = match state {{"); + for (id, s) in dfa.iter() { + if dfa.is_match_state(id) { + continue; + } + + wstrln!(buf, " {} => {{", &names[&id]); + wstrln!(buf, " match b {{"); + for (start, end, next_id) in s.sparse_transitions() { + if dfa.is_match_state(next_id) { + wstrln!(buf, " {:?}...{:?} => return true,", start, end); + } else { + if start == end { + wstrln!(buf, " {:?} => {},", start, &names[&next_id]); + } else { + wstrln!(buf, " {:?}...{:?} => {},", start, end, &names[&next_id]); + } + } + } + wstrln!(buf, " _ => S::S0,"); + wstrln!(buf, " }}"); + wstrln!(buf, " }}"); + } + wstrln!(buf, " }};"); + wstrln!(buf, " }}"); + + wstrln!(buf, " false"); + wstrln!(buf, "}}"); + String::from_utf8(buf).unwrap() +} + +fn state_enum_def<S: StateID>( + dfa: &DFA<S>, + variant_names: &HashMap<S, String>, +) -> String { + let mut buf = vec![]; + wstrln!(buf, " #[derive(Clone, Copy)]"); + 
wstr!(buf, " enum S {{"); + + let mut i = 0; + for (id, _) in dfa.iter() { + if dfa.is_match_state(id) { + continue; + } + if i % 10 == 0 { + wstr!(buf, "\n "); + } + let name = format!("S{}", id.to_usize()); + wstr!(buf, " {},", name); + i += 1; + } + wstr!(buf, "\n"); + wstrln!(buf, " }}"); + String::from_utf8(buf).unwrap() +} + +fn state_variant_names<S: StateID>(dfa: &DFA<S>) -> HashMap<S, String> { + let mut variants = HashMap::new(); + for (id, _) in dfa.iter() { + if dfa.is_match_state(id) { + continue; + } + variants.insert(id, format!("S::S{}", id.to_usize())); + } + variants +} diff --git a/vendor/regex-automata/src/dense.rs b/vendor/regex-automata/src/dense.rs new file mode 100644 index 000000000..ed4d1b683 --- /dev/null +++ b/vendor/regex-automata/src/dense.rs @@ -0,0 +1,2332 @@ +#[cfg(feature = "std")] +use core::fmt; +#[cfg(feature = "std")] +use core::iter; +use core::mem; +use core::slice; + +#[cfg(feature = "std")] +use byteorder::{BigEndian, LittleEndian}; +use byteorder::{ByteOrder, NativeEndian}; +#[cfg(feature = "std")] +use regex_syntax::ParserBuilder; + +use classes::ByteClasses; +#[cfg(feature = "std")] +use determinize::Determinizer; +use dfa::DFA; +#[cfg(feature = "std")] +use error::{Error, Result}; +#[cfg(feature = "std")] +use minimize::Minimizer; +#[cfg(feature = "std")] +use nfa::{self, NFA}; +#[cfg(feature = "std")] +use sparse::SparseDFA; +use state_id::{dead_id, StateID}; +#[cfg(feature = "std")] +use state_id::{ + next_state_id, premultiply_overflow_error, write_state_id_bytes, +}; + +/// The size of the alphabet in a standard DFA. +/// +/// Specifically, this length controls the number of transitions present in +/// each DFA state. However, when the byte class optimization is enabled, +/// then each DFA maps the space of all possible 256 byte values to at most +/// 256 distinct equivalence classes. 
In this case, the number of distinct +/// equivalence classes corresponds to the internal alphabet of the DFA, in the +/// sense that each DFA state has a number of transitions equal to the number +/// of equivalence classes despite supporting matching on all possible byte +/// values. +const ALPHABET_LEN: usize = 256; + +/// Masks used in serialization of DFAs. +pub(crate) const MASK_PREMULTIPLIED: u16 = 0b0000_0000_0000_0001; +pub(crate) const MASK_ANCHORED: u16 = 0b0000_0000_0000_0010; + +/// A dense table-based deterministic finite automaton (DFA). +/// +/// A dense DFA represents the core matching primitive in this crate. That is, +/// logically, all DFAs have a single start state, one or more match states +/// and a transition table that maps the current state and the current byte of +/// input to the next state. A DFA can use this information to implement fast +/// searching. In particular, the use of a dense DFA generally makes the trade +/// off that match speed is the most valuable characteristic, even if building +/// the regex may take significant time *and* space. As such, the processing +/// of every byte of input is done with a small constant number of operations +/// that does not vary with the pattern, its size or the size of the alphabet. +/// If your needs don't line up with this trade off, then a dense DFA may not +/// be an adequate solution to your problem. +/// +/// In contrast, a [sparse DFA](enum.SparseDFA.html) makes the opposite +/// trade off: it uses less space but will execute a variable number of +/// instructions per byte at match time, which makes it slower for matching. +/// +/// A DFA can be built using the default configuration via the +/// [`DenseDFA::new`](enum.DenseDFA.html#method.new) constructor. Otherwise, +/// one can configure various aspects via the +/// [`dense::Builder`](dense/struct.Builder.html). +/// +/// A single DFA fundamentally supports the following operations: +/// +/// 1. Detection of a match. +/// 2. 
Location of the end of the first possible match. +/// 3. Location of the end of the leftmost-first match. +/// +/// A notable absence from the above list of capabilities is the location of +/// the *start* of a match. In order to provide both the start and end of a +/// match, *two* DFAs are required. This functionality is provided by a +/// [`Regex`](struct.Regex.html), which can be built with its basic +/// constructor, [`Regex::new`](struct.Regex.html#method.new), or with +/// a [`RegexBuilder`](struct.RegexBuilder.html). +/// +/// # State size +/// +/// A `DenseDFA` has two type parameters, `T` and `S`. `T` corresponds to +/// the type of the DFA's transition table while `S` corresponds to the +/// representation used for the DFA's state identifiers as described by the +/// [`StateID`](trait.StateID.html) trait. This type parameter is typically +/// `usize`, but other valid choices provided by this crate include `u8`, +/// `u16`, `u32` and `u64`. The primary reason for choosing a different state +/// identifier representation than the default is to reduce the amount of +/// memory used by a DFA. Note though, that if the chosen representation cannot +/// accommodate the size of your DFA, then building the DFA will fail and +/// return an error. +/// +/// While the reduction in heap memory used by a DFA is one reason for choosing +/// a smaller state identifier representation, another possible reason is for +/// decreasing the serialization size of a DFA, as returned by +/// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian), +/// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian) +/// or +/// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian). +/// +/// The type of the transition table is typically either `Vec<S>` or `&[S]`, +/// depending on where the transition table is stored. 
+/// +/// # Variants +/// +/// This DFA is defined as a non-exhaustive enumeration of different types of +/// dense DFAs. All of these dense DFAs use the same internal representation +/// for the transition table, but they vary in how the transition table is +/// read. A DFA's specific variant depends on the configuration options set via +/// [`dense::Builder`](dense/struct.Builder.html). The default variant is +/// `PremultipliedByteClass`. +/// +/// # The `DFA` trait +/// +/// This type implements the [`DFA`](trait.DFA.html) trait, which means it +/// can be used for searching. For example: +/// +/// ``` +/// use regex_automata::{DFA, DenseDFA}; +/// +/// # fn example() -> Result<(), regex_automata::Error> { +/// let dfa = DenseDFA::new("foo[0-9]+")?; +/// assert_eq!(Some(8), dfa.find(b"foo12345")); +/// # Ok(()) }; example().unwrap() +/// ``` +/// +/// The `DFA` trait also provides an assortment of other lower level methods +/// for DFAs, such as `start_state` and `next_state`. While these are correctly +/// implemented, it is an anti-pattern to use them in performance sensitive +/// code on the `DenseDFA` type directly. Namely, each implementation requires +/// a branch to determine which type of dense DFA is being used. Instead, +/// this branch should be pushed up a layer in the code since walking the +/// transitions of a DFA is usually a hot path. If you do need to use these +/// lower level methods in performance critical code, then you should match on +/// the variants of this DFA and use each variant's implementation of the `DFA` +/// trait directly. +#[derive(Clone, Debug)] +pub enum DenseDFA<T: AsRef<[S]>, S: StateID> { + /// A standard DFA that does not use premultiplication or byte classes. + Standard(Standard<T, S>), + /// A DFA that shrinks its alphabet to a set of equivalence classes instead + /// of using all possible byte values. 
Any two bytes belong to the same + /// equivalence class if and only if they can be used interchangeably + /// anywhere in the DFA while never discriminating between a match and a + /// non-match. + /// + /// This type of DFA can result in significant space reduction with a very + /// small match time performance penalty. + ByteClass(ByteClass<T, S>), + /// A DFA that premultiplies all of its state identifiers in its + /// transition table. This saves an instruction per byte at match time + /// which improves search performance. + /// + /// The only downside of premultiplication is that it may prevent one from + /// using a smaller state identifier representation than you otherwise + /// could. + Premultiplied(Premultiplied<T, S>), + /// The default configuration of a DFA, which uses byte classes and + /// premultiplies its state identifiers. + PremultipliedByteClass(PremultipliedByteClass<T, S>), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> { + /// Return the internal DFA representation. + /// + /// All variants share the same internal representation. + fn repr(&self) -> &Repr<T, S> { + match *self { + DenseDFA::Standard(ref r) => &r.0, + DenseDFA::ByteClass(ref r) => &r.0, + DenseDFA::Premultiplied(ref r) => &r.0, + DenseDFA::PremultipliedByteClass(ref r) => &r.0, + DenseDFA::__Nonexhaustive => unreachable!(), + } + } +} + +#[cfg(feature = "std")] +impl DenseDFA<Vec<usize>, usize> { + /// Parse the given regular expression using a default configuration and + /// return the corresponding DFA. + /// + /// The default configuration uses `usize` for state IDs, premultiplies + /// them and reduces the alphabet size by splitting bytes into equivalence + /// classes. 
The DFA is *not* minimized. + /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](dense/struct.Builder.html) + /// to set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{DFA, DenseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa = DenseDFA::new("foo[0-9]+bar")?; + /// assert_eq!(Some(11), dfa.find(b"foo12345bar")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn new(pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> { + Builder::new().build(pattern) + } +} + +#[cfg(feature = "std")] +impl<S: StateID> DenseDFA<Vec<S>, S> { + /// Create a new empty DFA that never matches any input. + /// + /// # Example + /// + /// In order to build an empty DFA, callers must provide a type hint + /// indicating their choice of state identifier representation. + /// + /// ``` + /// use regex_automata::{DFA, DenseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa: DenseDFA<Vec<usize>, usize> = DenseDFA::empty(); + /// assert_eq!(None, dfa.find(b"")); + /// assert_eq!(None, dfa.find(b"foo")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn empty() -> DenseDFA<Vec<S>, S> { + Repr::empty().into_dense_dfa() + } +} + +impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> { + /// Cheaply return a borrowed version of this dense DFA. Specifically, the + /// DFA returned always uses `&[S]` for its transition table while keeping + /// the same state identifier representation. 
+ pub fn as_ref<'a>(&'a self) -> DenseDFA<&'a [S], S> { + match *self { + DenseDFA::Standard(ref r) => { + DenseDFA::Standard(Standard(r.0.as_ref())) + } + DenseDFA::ByteClass(ref r) => { + DenseDFA::ByteClass(ByteClass(r.0.as_ref())) + } + DenseDFA::Premultiplied(ref r) => { + DenseDFA::Premultiplied(Premultiplied(r.0.as_ref())) + } + DenseDFA::PremultipliedByteClass(ref r) => { + let inner = PremultipliedByteClass(r.0.as_ref()); + DenseDFA::PremultipliedByteClass(inner) + } + DenseDFA::__Nonexhaustive => unreachable!(), + } + } + + /// Return an owned version of this sparse DFA. Specifically, the DFA + /// returned always uses `Vec<u8>` for its transition table while keeping + /// the same state identifier representation. + /// + /// Effectively, this returns a sparse DFA whose transition table lives + /// on the heap. + #[cfg(feature = "std")] + pub fn to_owned(&self) -> DenseDFA<Vec<S>, S> { + match *self { + DenseDFA::Standard(ref r) => { + DenseDFA::Standard(Standard(r.0.to_owned())) + } + DenseDFA::ByteClass(ref r) => { + DenseDFA::ByteClass(ByteClass(r.0.to_owned())) + } + DenseDFA::Premultiplied(ref r) => { + DenseDFA::Premultiplied(Premultiplied(r.0.to_owned())) + } + DenseDFA::PremultipliedByteClass(ref r) => { + let inner = PremultipliedByteClass(r.0.to_owned()); + DenseDFA::PremultipliedByteClass(inner) + } + DenseDFA::__Nonexhaustive => unreachable!(), + } + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA's transition table. This corresponds to heap memory + /// usage. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, used `std::mem::size_of::<DenseDFA>()`. 
+ pub fn memory_usage(&self) -> usize { + self.repr().memory_usage() + } +} + +/// Routines for converting a dense DFA to other representations, such as +/// sparse DFAs, smaller state identifiers or raw bytes suitable for persistent +/// storage. +#[cfg(feature = "std")] +impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> { + /// Convert this dense DFA to a sparse DFA. + /// + /// This is a convenience routine for `to_sparse_sized` that fixes the + /// state identifier representation of the sparse DFA to the same + /// representation used for this dense DFA. + /// + /// If the chosen state identifier representation is too small to represent + /// all states in the sparse DFA, then this returns an error. In most + /// cases, if a dense DFA is constructable with `S` then a sparse DFA will + /// be as well. However, it is not guaranteed. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{DFA, DenseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dense = DenseDFA::new("foo[0-9]+")?; + /// let sparse = dense.to_sparse()?; + /// assert_eq!(Some(8), sparse.find(b"foo12345")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn to_sparse(&self) -> Result<SparseDFA<Vec<u8>, S>> { + self.to_sparse_sized() + } + + /// Convert this dense DFA to a sparse DFA. + /// + /// Using this routine requires supplying a type hint to choose the state + /// identifier representation for the resulting sparse DFA. + /// + /// If the chosen state identifier representation is too small to represent + /// all states in the sparse DFA, then this returns an error. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{DFA, DenseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dense = DenseDFA::new("foo[0-9]+")?; + /// let sparse = dense.to_sparse_sized::<u8>()?; + /// assert_eq!(Some(8), sparse.find(b"foo12345")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn to_sparse_sized<A: StateID>( + &self, + ) -> Result<SparseDFA<Vec<u8>, A>> { + self.repr().to_sparse_sized() + } + + /// Create a new DFA whose match semantics are equivalent to this DFA, + /// but attempt to use `u8` for the representation of state identifiers. + /// If `u8` is insufficient to represent all state identifiers in this + /// DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u8>()`. + pub fn to_u8(&self) -> Result<DenseDFA<Vec<u8>, u8>> { + self.to_sized() + } + + /// Create a new DFA whose match semantics are equivalent to this DFA, + /// but attempt to use `u16` for the representation of state identifiers. + /// If `u16` is insufficient to represent all state identifiers in this + /// DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u16>()`. + pub fn to_u16(&self) -> Result<DenseDFA<Vec<u16>, u16>> { + self.to_sized() + } + + /// Create a new DFA whose match semantics are equivalent to this DFA, + /// but attempt to use `u32` for the representation of state identifiers. + /// If `u32` is insufficient to represent all state identifiers in this + /// DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u32>()`. + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub fn to_u32(&self) -> Result<DenseDFA<Vec<u32>, u32>> { + self.to_sized() + } + + /// Create a new DFA whose match semantics are equivalent to this DFA, + /// but attempt to use `u64` for the representation of state identifiers. 
/// If `u64` is insufficient to represent all state identifiers in this
    /// DFA, then this returns an error.
    ///
    /// This is a convenience routine for `to_sized::<u64>()`.
    #[cfg(target_pointer_width = "64")]
    pub fn to_u64(&self) -> Result<DenseDFA<Vec<u64>, u64>> {
        self.to_sized::<u64>()
    }

    /// Create a new DFA whose match semantics are equivalent to this DFA, but
    /// attempt to use `A` for the representation of state identifiers. If `A`
    /// is insufficient to represent all state identifiers in this DFA, then
    /// this returns an error.
    ///
    /// An alternative way to construct such a DFA is to use
    /// [`dense::Builder::build_with_size`](dense/struct.Builder.html#method.build_with_size).
    /// In general, using the builder is preferred since it will use the given
    /// state identifier representation throughout determinization (and
    /// minimization, if done), and thereby using less memory throughout the
    /// entire construction process. However, these routines are necessary
    /// in cases where, say, a minimized DFA could fit in a smaller state
    /// identifier representation, but the initial determinized DFA would not.
    pub fn to_sized<A: StateID>(&self) -> Result<DenseDFA<Vec<A>, A>> {
        // Resize the shared representation first, then pick the matching
        // `DenseDFA` variant for it.
        let resized = self.repr().to_sized::<A>()?;
        Ok(resized.into_dense_dfa())
    }

    /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in little
    /// endian format.
    ///
    /// If the state identifier representation of this DFA has a size different
    /// than 1, 2, 4 or 8 bytes, then this returns an error. All
    /// implementations of `StateID` provided by this crate satisfy this
    /// requirement.
    pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
        self.repr().to_bytes::<LittleEndian>()
    }

    /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in big
    /// endian format.
    ///
    /// If the state identifier representation of this DFA has a size different
    /// than 1, 2, 4 or 8 bytes, then this returns an error. All
    /// implementations of `StateID` provided by this crate satisfy this
    /// requirement.
    pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
        self.repr().to_bytes::<BigEndian>()
    }

    /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in native
    /// endian format. Generally, it is better to pick an explicit endianness
    /// using either `to_bytes_little_endian` or `to_bytes_big_endian`. This
    /// routine is useful in tests where the DFA is serialized and deserialized
    /// on the same platform.
    ///
    /// If the state identifier representation of this DFA has a size different
    /// than 1, 2, 4 or 8 bytes, then this returns an error. All
    /// implementations of `StateID` provided by this crate satisfy this
    /// requirement.
    pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
        self.repr().to_bytes::<NativeEndian>()
    }
}

impl<'a, S: StateID> DenseDFA<&'a [S], S> {
    /// Deserialize a DFA with a specific state identifier representation.
    ///
    /// Deserializing a DFA using this routine will never allocate heap memory.
    /// This is also guaranteed to be a constant time operation that does not
    /// vary with the size of the DFA.
    ///
    /// The bytes given should be generated by the serialization of a DFA with
    /// either the
    /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
    /// method or the
    /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
    /// method, depending on the endianness of the machine you are
    /// deserializing this DFA from.
    ///
    /// If the state identifier representation is `usize`, then deserialization
    /// is dependent on the pointer size. For this reason, it is best to
    /// serialize DFAs using a fixed size representation for your state
    /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
    ///
    /// # Panics
    ///
    /// The bytes given should be *trusted*. In particular, if the bytes
    /// are not a valid serialization of a DFA, or if the given bytes are
    /// not aligned to an 8 byte boundary, or if the endianness of the
    /// serialized bytes is different than the endianness of the machine that
    /// is deserializing the DFA, then this routine will panic. Moreover, it is
    /// possible for this deserialization routine to succeed even if the given
    /// bytes do not represent a valid serialized dense DFA.
    ///
    /// # Safety
    ///
    /// This routine is unsafe because it permits callers to provide an
    /// arbitrary transition table with possibly incorrect transitions. While
    /// the various serialization routines will never return an incorrect
    /// transition table, there is no guarantee that the bytes provided here
    /// are correct. While deserialization does many checks (as documented
    /// above in the panic conditions), this routine does not check that the
    /// transition table is correct. Given an incorrect transition table, it is
    /// possible for the search routines to access out-of-bounds memory because
    /// of explicit bounds check elision.
    ///
    /// # Example
    ///
    /// This example shows how to serialize a DFA to raw bytes, deserialize it
    /// and then use it for searching. Note that we first convert the DFA to
    /// using `u16` for its state identifier representation before serializing
    /// it. While this isn't strictly necessary, it's good practice in order to
    /// decrease the size of the DFA and to avoid platform specific pitfalls
    /// such as differing pointer sizes.
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let initial = DenseDFA::new("foo[0-9]+")?;
    /// let bytes = initial.to_u16()?.to_bytes_native_endian()?;
    /// let dfa: DenseDFA<&[u16], u16> = unsafe {
    ///     DenseDFA::from_bytes(&bytes)
    /// };
    ///
    /// assert_eq!(Some(8), dfa.find(b"foo12345"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub unsafe fn from_bytes(buf: &'a [u8]) -> DenseDFA<&'a [S], S> {
        let repr = Repr::from_bytes(buf);
        repr.into_dense_dfa()
    }
}

#[cfg(feature = "std")]
impl<S: StateID> DenseDFA<Vec<S>, S> {
    /// Minimize this DFA in place.
    ///
    /// This is not part of the public API. It is only exposed to allow for
    /// more granular external benchmarking.
    #[doc(hidden)]
    pub fn minimize(&mut self) {
        let repr = self.repr_mut();
        repr.minimize();
    }

    /// Return a mutable reference to the internal DFA representation,
    /// regardless of which variant currently wraps it.
    fn repr_mut(&mut self) -> &mut Repr<Vec<S>, S> {
        match *self {
            DenseDFA::Standard(ref mut inner) => &mut inner.0,
            DenseDFA::ByteClass(ref mut inner) => &mut inner.0,
            DenseDFA::Premultiplied(ref mut inner) => &mut inner.0,
            DenseDFA::PremultipliedByteClass(ref mut inner) => &mut inner.0,
            DenseDFA::__Nonexhaustive => unreachable!(),
        }
    }
}

impl<T: AsRef<[S]>, S: StateID> DFA for DenseDFA<T, S> {
    type ID = S;

    // The state predicates below do not depend on how the transition table
    // is accessed, so they all go straight to the shared representation.

    #[inline]
    fn start_state(&self) -> S {
        self.repr().start_state()
    }

    #[inline]
    fn is_match_state(&self, id: S) -> bool {
        self.repr().is_match_state(id)
    }

    #[inline]
    fn is_dead_state(&self, id: S) -> bool {
        self.repr().is_dead_state(id)
    }

    #[inline]
    fn is_match_or_dead_state(&self, id: S) -> bool {
        self.repr().is_match_or_dead_state(id)
    }

    #[inline]
    fn is_anchored(&self) -> bool {
        self.repr().is_anchored()
    }

    // Transitions depend on the variant, so dispatch to the wrapped type.

    #[inline]
    fn next_state(&self, current: S, input: u8) -> S {
        match *self {
            DenseDFA::Standard(ref dfa) => dfa.next_state(current, input),
            DenseDFA::ByteClass(ref dfa) => dfa.next_state(current, input),
            DenseDFA::Premultiplied(ref dfa) => {
                dfa.next_state(current, input)
            }
            DenseDFA::PremultipliedByteClass(ref dfa) => {
                dfa.next_state(current, input)
            }
            DenseDFA::__Nonexhaustive => unreachable!(),
        }
    }

    #[inline]
    unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
        match *self {
            DenseDFA::Standard(ref dfa) => {
                dfa.next_state_unchecked(current, input)
            }
            DenseDFA::ByteClass(ref dfa) => {
                dfa.next_state_unchecked(current, input)
            }
            DenseDFA::Premultiplied(ref dfa) => {
                dfa.next_state_unchecked(current, input)
            }
            DenseDFA::PremultipliedByteClass(ref dfa) => {
                dfa.next_state_unchecked(current, input)
            }
            DenseDFA::__Nonexhaustive => unreachable!(),
        }
    }

    // We specialize the following methods because it lets us lift the
    // case analysis between the different types of dense DFAs. Instead of
    // doing the case analysis for every transition, we do it once before
    // searching.

    #[inline]
    fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
        match *self {
            DenseDFA::Standard(ref dfa) => dfa.is_match_at(bytes, start),
            DenseDFA::ByteClass(ref dfa) => dfa.is_match_at(bytes, start),
            DenseDFA::Premultiplied(ref dfa) => {
                dfa.is_match_at(bytes, start)
            }
            DenseDFA::PremultipliedByteClass(ref dfa) => {
                dfa.is_match_at(bytes, start)
            }
            DenseDFA::__Nonexhaustive => unreachable!(),
        }
    }

    #[inline]
    fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        match *self {
            DenseDFA::Standard(ref dfa) => {
                dfa.shortest_match_at(bytes, start)
            }
            DenseDFA::ByteClass(ref dfa) => {
                dfa.shortest_match_at(bytes, start)
            }
            DenseDFA::Premultiplied(ref dfa) => {
                dfa.shortest_match_at(bytes, start)
            }
            DenseDFA::PremultipliedByteClass(ref dfa) => {
                dfa.shortest_match_at(bytes, start)
            }
            DenseDFA::__Nonexhaustive => unreachable!(),
        }
    }

    #[inline]
    fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        match *self {
            DenseDFA::Standard(ref dfa) => dfa.find_at(bytes, start),
            DenseDFA::ByteClass(ref dfa) => dfa.find_at(bytes, start),
            DenseDFA::Premultiplied(ref dfa) => dfa.find_at(bytes, start),
            DenseDFA::PremultipliedByteClass(ref dfa) => {
                dfa.find_at(bytes, start)
            }
            DenseDFA::__Nonexhaustive => unreachable!(),
        }
    }

    #[inline]
    fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        match *self {
            DenseDFA::Standard(ref dfa) => dfa.rfind_at(bytes, start),
            DenseDFA::ByteClass(ref dfa) => dfa.rfind_at(bytes, start),
            DenseDFA::Premultiplied(ref dfa) => dfa.rfind_at(bytes, start),
            DenseDFA::PremultipliedByteClass(ref dfa) => {
                dfa.rfind_at(bytes, start)
            }
            DenseDFA::__Nonexhaustive => unreachable!(),
        }
    }
}

/// A standard dense DFA that does not use premultiplication or byte classes.
///
/// Generally, it isn't necessary to use this type directly, since a `DenseDFA`
/// can be used for searching directly. One possible reason why one might want
/// to use this type directly is if you are implementing your own search
/// routines by walking a DFA's transitions directly. In that case, you'll want
/// to use this type (or any of the other DFA variant types) directly, since
/// they implement `next_state` more efficiently.
+#[derive(Clone, Debug)] +pub struct Standard<T: AsRef<[S]>, S: StateID>(Repr<T, S>); + +impl<T: AsRef<[S]>, S: StateID> DFA for Standard<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.0.start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.0.is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.0.is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.0.is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.0.is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + let o = current.to_usize() * ALPHABET_LEN + input as usize; + self.0.trans()[o] + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + let o = current.to_usize() * ALPHABET_LEN + input as usize; + *self.0.trans().get_unchecked(o) + } +} + +/// A dense DFA that shrinks its alphabet. +/// +/// Alphabet shrinking is achieved by using a set of equivalence classes +/// instead of using all possible byte values. Any two bytes belong to the same +/// equivalence class if and only if they can be used interchangeably anywhere +/// in the DFA while never discriminating between a match and a non-match. +/// +/// This type of DFA can result in significant space reduction with a very +/// small match time performance penalty. +/// +/// Generally, it isn't necessary to use this type directly, since a `DenseDFA` +/// can be used for searching directly. One possible reason why one might want +/// to use this type directly is if you are implementing your own search +/// routines by walking a DFA's transitions directly. In that case, you'll want +/// to use this type (or any of the other DFA variant types) directly, since +/// they implement `next_state` more efficiently. 
+#[derive(Clone, Debug)] +pub struct ByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>); + +impl<T: AsRef<[S]>, S: StateID> DFA for ByteClass<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.0.start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.0.is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.0.is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.0.is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.0.is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + let input = self.0.byte_classes().get(input); + let o = current.to_usize() * self.0.alphabet_len() + input as usize; + self.0.trans()[o] + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + let input = self.0.byte_classes().get_unchecked(input); + let o = current.to_usize() * self.0.alphabet_len() + input as usize; + *self.0.trans().get_unchecked(o) + } +} + +/// A dense DFA that premultiplies all of its state identifiers in its +/// transition table. +/// +/// This saves an instruction per byte at match time which improves search +/// performance. +/// +/// The only downside of premultiplication is that it may prevent one from +/// using a smaller state identifier representation than you otherwise could. +/// +/// Generally, it isn't necessary to use this type directly, since a `DenseDFA` +/// can be used for searching directly. One possible reason why one might want +/// to use this type directly is if you are implementing your own search +/// routines by walking a DFA's transitions directly. In that case, you'll want +/// to use this type (or any of the other DFA variant types) directly, since +/// they implement `next_state` more efficiently. 
+#[derive(Clone, Debug)] +pub struct Premultiplied<T: AsRef<[S]>, S: StateID>(Repr<T, S>); + +impl<T: AsRef<[S]>, S: StateID> DFA for Premultiplied<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.0.start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.0.is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.0.is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.0.is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.0.is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + let o = current.to_usize() + input as usize; + self.0.trans()[o] + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + let o = current.to_usize() + input as usize; + *self.0.trans().get_unchecked(o) + } +} + +/// The default configuration of a dense DFA, which uses byte classes and +/// premultiplies its state identifiers. +/// +/// Generally, it isn't necessary to use this type directly, since a `DenseDFA` +/// can be used for searching directly. One possible reason why one might want +/// to use this type directly is if you are implementing your own search +/// routines by walking a DFA's transitions directly. In that case, you'll want +/// to use this type (or any of the other DFA variant types) directly, since +/// they implement `next_state` more efficiently. 
+#[derive(Clone, Debug)] +pub struct PremultipliedByteClass<T: AsRef<[S]>, S: StateID>(Repr<T, S>); + +impl<T: AsRef<[S]>, S: StateID> DFA for PremultipliedByteClass<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.0.start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.0.is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.0.is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.0.is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.0.is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + let input = self.0.byte_classes().get(input); + let o = current.to_usize() + input as usize; + self.0.trans()[o] + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + let input = self.0.byte_classes().get_unchecked(input); + let o = current.to_usize() + input as usize; + *self.0.trans().get_unchecked(o) + } +} + +/// The internal representation of a dense DFA. +/// +/// This representation is shared by all DFA variants. +#[derive(Clone)] +#[cfg_attr(not(feature = "std"), derive(Debug))] +pub(crate) struct Repr<T, S> { + /// Whether the state identifiers in the transition table have been + /// premultiplied or not. + /// + /// Premultiplied identifiers means that instead of your matching loop + /// looking something like this: + /// + /// state = dfa.start + /// for byte in haystack: + /// next = dfa.transitions[state * len(alphabet) + byte] + /// if dfa.is_match(next): + /// return true + /// return false + /// + /// it can instead look like this: + /// + /// state = dfa.start + /// for byte in haystack: + /// next = dfa.transitions[state + byte] + /// if dfa.is_match(next): + /// return true + /// return false + /// + /// In other words, we save a multiplication instruction in the critical + /// path. This turns out to be a decent performance win. 
The cost of using + /// premultiplied state ids is that they can require a bigger state id + /// representation. + premultiplied: bool, + /// Whether this DFA can only match at the beginning of input or not. + /// + /// When true, a match should only be reported if it begins at the 0th + /// index of the haystack. + anchored: bool, + /// The initial start state ID. + start: S, + /// The total number of states in this DFA. Note that a DFA always has at + /// least one state---the dead state---even the empty DFA. In particular, + /// the dead state always has ID 0 and is correspondingly always the first + /// state. The dead state is never a match state. + state_count: usize, + /// States in a DFA have a *partial* ordering such that a match state + /// always precedes any non-match state (except for the special dead + /// state). + /// + /// `max_match` corresponds to the last state that is a match state. This + /// encoding has two critical benefits. Firstly, we are not required to + /// store any additional per-state information about whether it is a match + /// state or not. Secondly, when searching with the DFA, we can do a single + /// comparison with `max_match` for each byte instead of two comparisons + /// for each byte (one testing whether it is a match and the other testing + /// whether we've reached a dead state). Namely, to determine the status + /// of the next state, we can do this: + /// + /// next_state = transition[cur_state * alphabet_len + cur_byte] + /// if next_state <= max_match: + /// // next_state is either dead (no-match) or a match + /// return next_state != dead + max_match: S, + /// A set of equivalence classes, where a single equivalence class + /// represents a set of bytes that never discriminate between a match + /// and a non-match in the DFA. Each equivalence class corresponds to + /// a single letter in this DFA's alphabet, where the maximum number of + /// letters is 256 (each possible value of a byte). 
Consequently, the + /// number of equivalence classes corresponds to the number of transitions + /// for each DFA state. + /// + /// The only time the number of equivalence classes is fewer than 256 is + /// if the DFA's kind uses byte classes. If the DFA doesn't use byte + /// classes, then this vector is empty. + byte_classes: ByteClasses, + /// A contiguous region of memory representing the transition table in + /// row-major order. The representation is dense. That is, every state has + /// precisely the same number of transitions. The maximum number of + /// transitions is 256. If a DFA has been instructed to use byte classes, + /// then the number of transitions can be much less. + /// + /// In practice, T is either Vec<S> or &[S]. + trans: T, +} + +#[cfg(feature = "std")] +impl<S: StateID> Repr<Vec<S>, S> { + /// Create a new empty DFA with singleton byte classes (every byte is its + /// own equivalence class). + pub fn empty() -> Repr<Vec<S>, S> { + Repr::empty_with_byte_classes(ByteClasses::singletons()) + } + + /// Create a new empty DFA with the given set of byte equivalence classes. + /// An empty DFA never matches any input. + pub fn empty_with_byte_classes( + byte_classes: ByteClasses, + ) -> Repr<Vec<S>, S> { + let mut dfa = Repr { + premultiplied: false, + anchored: true, + start: dead_id(), + state_count: 0, + max_match: S::from_usize(0), + byte_classes, + trans: vec![], + }; + // Every state ID repr must be able to fit at least one state. + dfa.add_empty_state().unwrap(); + dfa + } + + /// Sets whether this DFA is anchored or not. + pub fn anchored(mut self, yes: bool) -> Repr<Vec<S>, S> { + self.anchored = yes; + self + } +} + +impl<T: AsRef<[S]>, S: StateID> Repr<T, S> { + /// Convert this internal DFA representation to a DenseDFA based on its + /// transition table access pattern. 
+ pub fn into_dense_dfa(self) -> DenseDFA<T, S> { + match (self.premultiplied, self.byte_classes().is_singleton()) { + // no premultiplication, no byte classes + (false, true) => DenseDFA::Standard(Standard(self)), + // no premultiplication, yes byte classes + (false, false) => DenseDFA::ByteClass(ByteClass(self)), + // yes premultiplication, no byte classes + (true, true) => DenseDFA::Premultiplied(Premultiplied(self)), + // yes premultiplication, yes byte classes + (true, false) => { + DenseDFA::PremultipliedByteClass(PremultipliedByteClass(self)) + } + } + } + + fn as_ref<'a>(&'a self) -> Repr<&'a [S], S> { + Repr { + premultiplied: self.premultiplied, + anchored: self.anchored, + start: self.start, + state_count: self.state_count, + max_match: self.max_match, + byte_classes: self.byte_classes().clone(), + trans: self.trans(), + } + } + + #[cfg(feature = "std")] + fn to_owned(&self) -> Repr<Vec<S>, S> { + Repr { + premultiplied: self.premultiplied, + anchored: self.anchored, + start: self.start, + state_count: self.state_count, + max_match: self.max_match, + byte_classes: self.byte_classes().clone(), + trans: self.trans().to_vec(), + } + } + + /// Return the starting state of this DFA. + /// + /// All searches using this DFA must begin at this state. There is exactly + /// one starting state for every DFA. A starting state may be a dead state + /// or a matching state or neither. + pub fn start_state(&self) -> S { + self.start + } + + /// Returns true if and only if the given identifier corresponds to a match + /// state. + pub fn is_match_state(&self, id: S) -> bool { + id <= self.max_match && id != dead_id() + } + + /// Returns true if and only if the given identifier corresponds to a dead + /// state. + pub fn is_dead_state(&self, id: S) -> bool { + id == dead_id() + } + + /// Returns true if and only if the given identifier could correspond to + /// either a match state or a dead state. 
If this returns false, then the + /// given identifier does not correspond to either a match state or a dead + /// state. + pub fn is_match_or_dead_state(&self, id: S) -> bool { + id <= self.max_match_state() + } + + /// Returns the maximum identifier for which a match state can exist. + /// + /// More specifically, the return identifier always corresponds to either + /// a match state or a dead state. Namely, either + /// `is_match_state(returned)` or `is_dead_state(returned)` is guaranteed + /// to be true. + pub fn max_match_state(&self) -> S { + self.max_match + } + + /// Returns true if and only if this DFA is anchored. + pub fn is_anchored(&self) -> bool { + self.anchored + } + + /// Return the byte classes used by this DFA. + pub fn byte_classes(&self) -> &ByteClasses { + &self.byte_classes + } + + /// Returns an iterator over all states in this DFA. + /// + /// This iterator yields a tuple for each state. The first element of the + /// tuple corresponds to a state's identifier, and the second element + /// corresponds to the state itself (comprised of its transitions). + /// + /// If this DFA is premultiplied, then the state identifiers are in + /// turn premultiplied as well, making them usable without additional + /// modification. + #[cfg(feature = "std")] + pub fn states(&self) -> StateIter<T, S> { + let it = self.trans().chunks(self.alphabet_len()); + StateIter { dfa: self, it: it.enumerate() } + } + + /// Return the total number of states in this DFA. Every DFA has at least + /// 1 state, even the empty DFA. + #[cfg(feature = "std")] + pub fn state_count(&self) -> usize { + self.state_count + } + + /// Return the number of elements in this DFA's alphabet. + /// + /// If this DFA doesn't use byte classes, then this is always equivalent + /// to 256. Otherwise, it is guaranteed to be some value less than or equal + /// to 256. 
pub fn alphabet_len(&self) -> usize {
        self.byte_classes().alphabet_len()
    }

    /// Returns the memory usage, in bytes, of this DFA.
    pub fn memory_usage(&self) -> usize {
        // Only the transition table is counted.
        mem::size_of::<S>() * self.trans().len()
    }

    /// Convert the given state identifier to the state's index. The state's
    /// index corresponds to the position in which it appears in the transition
    /// table. When a DFA is NOT premultiplied, then a state's identifier is
    /// also its index. When a DFA is premultiplied, then a state's identifier
    /// is equal to `index * alphabet_len`. This routine reverses that.
    #[cfg(feature = "std")]
    pub fn state_id_to_index(&self, id: S) -> usize {
        let raw = id.to_usize();
        if !self.premultiplied {
            return raw;
        }
        raw / self.alphabet_len()
    }

    /// Return this DFA's transition table as a slice.
    fn trans(&self) -> &[S] {
        self.trans.as_ref()
    }

    /// Create a sparse DFA from the internal representation of a dense DFA.
    #[cfg(feature = "std")]
    pub fn to_sparse_sized<A: StateID>(
        &self,
    ) -> Result<SparseDFA<Vec<u8>, A>> {
        SparseDFA::from_dense_sized(self)
    }

    /// Create a new DFA whose match semantics are equivalent to this DFA, but
    /// attempt to use `A` for the representation of state identifiers. If `A`
    /// is insufficient to represent all state identifiers in this DFA, then
    /// this returns an error.
    #[cfg(feature = "std")]
    pub fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<A>, A>> {
        // Check that this DFA can fit into A's representation. The largest
        // identifier in use is that of the last state, scaled by the
        // alphabet length when identifiers are premultiplied.
        let mut last_state_id = self.state_count - 1;
        if self.premultiplied {
            last_state_id *= self.alphabet_len();
        }
        if last_state_id > A::max_id() {
            return Err(Error::state_id_overflow(A::max_id()));
        }

        // We're off to the races. The new DFA is the same as the old one,
        // with every identifier re-encoded as an `A`.
        let trans: Vec<A> = self
            .trans()
            .iter()
            .map(|id| A::from_usize(id.to_usize()))
            .collect();
        Ok(Repr {
            premultiplied: self.premultiplied,
            anchored: self.anchored,
            start: A::from_usize(self.start.to_usize()),
            state_count: self.state_count,
            max_match: A::from_usize(self.max_match.to_usize()),
            byte_classes: self.byte_classes().clone(),
            trans,
        })
    }

    /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary.
    ///
    /// If the state identifier representation of this DFA has a size different
    /// than 1, 2, 4 or 8 bytes, then this returns an error. All
    /// implementations of `StateID` provided by this crate satisfy this
    /// requirement.
    #[cfg(feature = "std")]
    pub(crate) fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
        let label = b"rust-regex-automata-dfa\x00";
        assert_eq!(24, label.len());

        let trans_size = mem::size_of::<S>() * self.trans().len();
        let header_size =
            // For human readable label.
            label.len()
            // endianness check, must be equal to 0xFEFF for native endian
            + 2
            // For version number.
            + 2
            // Size of state ID representation, in bytes.
            // Must be 1, 2, 4 or 8.
            + 2
            // For DFA misc options.
            + 2
            // For start state.
            + 8
            // For state count.
            + 8
            // For max match state.
            + 8
            // For byte class map.
            + 256;
        // The transition table follows the header.
        let size = header_size + trans_size;
        // sanity check, this can be updated if need be
        assert_eq!(312 + trans_size, size);
        // This must always pass. It checks that the transition table is at
        // a properly aligned address.
        assert_eq!(0, (size - trans_size) % 8);

        let mut buf = vec![0; size];

        // write label
        buf[..label.len()].copy_from_slice(&label[..]);
        let mut i = label.len();
        // endianness check
        A::write_u16(&mut buf[i..], 0xFEFF);
        i += 2;
        // version number
        A::write_u16(&mut buf[i..], 1);
        i += 2;
        // size of state ID
        let state_size = mem::size_of::<S>();
        if ![1, 2, 4, 8].contains(&state_size) {
            return Err(Error::serialize(&format!(
                "state size of {} not supported, must be 1, 2, 4 or 8",
                state_size
            )));
        }
        A::write_u16(&mut buf[i..], state_size as u16);
        i += 2;
        // DFA misc options
        let mut options = 0u16;
        if self.premultiplied {
            options |= MASK_PREMULTIPLIED;
        }
        if self.anchored {
            options |= MASK_ANCHORED;
        }
        A::write_u16(&mut buf[i..], options);
        i += 2;
        // start state
        A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
        i += 8;
        // state count
        A::write_u64(&mut buf[i..], self.state_count as u64);
        i += 8;
        // max match state
        A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
        i += 8;
        // byte class map: one entry for every possible byte value
        for b in 0..256 {
            buf[i + b] = self.byte_classes().get(b as u8);
        }
        i += 256;
        // transition table
        for &id in self.trans() {
            write_state_id_bytes::<A, _>(&mut buf[i..], id);
            i += state_size;
        }
        assert_eq!(size, i, "expected to consume entire buffer");

        Ok(buf)
    }
}

impl<'a, S: StateID> Repr<&'a [S], S> {
    /// The implementation for deserializing a DFA from raw bytes.
+ unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [S], S> { + assert_eq!( + 0, + buf.as_ptr() as usize % mem::align_of::<S>(), + "DenseDFA starting at address {} is not aligned to {} bytes", + buf.as_ptr() as usize, + mem::align_of::<S>() + ); + + // skip over label + match buf.iter().position(|&b| b == b'\x00') { + None => panic!("could not find label"), + Some(i) => buf = &buf[i + 1..], + } + + // check that current endianness is same as endianness of DFA + let endian_check = NativeEndian::read_u16(buf); + buf = &buf[2..]; + if endian_check != 0xFEFF { + panic!( + "endianness mismatch, expected 0xFEFF but got 0x{:X}. \ + are you trying to load a DenseDFA serialized with a \ + different endianness?", + endian_check, + ); + } + + // check that the version number is supported + let version = NativeEndian::read_u16(buf); + buf = &buf[2..]; + if version != 1 { + panic!( + "expected version 1, but found unsupported version {}", + version, + ); + } + + // read size of state + let state_size = NativeEndian::read_u16(buf) as usize; + if state_size != mem::size_of::<S>() { + panic!( + "state size of DenseDFA ({}) does not match \ + requested state size ({})", + state_size, + mem::size_of::<S>(), + ); + } + buf = &buf[2..]; + + // read miscellaneous options + let opts = NativeEndian::read_u16(buf); + buf = &buf[2..]; + + // read start state + let start = S::from_usize(NativeEndian::read_u64(buf) as usize); + buf = &buf[8..]; + + // read state count + let state_count = NativeEndian::read_u64(buf) as usize; + buf = &buf[8..]; + + // read max match state + let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize); + buf = &buf[8..]; + + // read byte classes + let byte_classes = ByteClasses::from_slice(&buf[..256]); + buf = &buf[256..]; + + let len = state_count * byte_classes.alphabet_len(); + let len_bytes = len * state_size; + assert!( + buf.len() <= len_bytes, + "insufficient transition table bytes, \ + expected at least {} but only have {}", + len_bytes, + 
buf.len() + ); + assert_eq!( + 0, + buf.as_ptr() as usize % mem::align_of::<S>(), + "DenseDFA transition table is not properly aligned" + ); + + // SAFETY: This is the only actual not-safe thing in this entire + // routine. The key things we need to worry about here are alignment + // and size. The two asserts above should cover both conditions. + let trans = slice::from_raw_parts(buf.as_ptr() as *const S, len); + Repr { + premultiplied: opts & MASK_PREMULTIPLIED > 0, + anchored: opts & MASK_ANCHORED > 0, + start, + state_count, + max_match, + byte_classes, + trans, + } + } +} + +/// The following methods implement mutable routines on the internal +/// representation of a DFA. As such, we must fix the first type parameter to +/// a `Vec<S>` since a generic `T: AsRef<[S]>` does not permit mutation. We +/// can get away with this because these methods are internal to the crate and +/// are exclusively used during construction of the DFA. +#[cfg(feature = "std")] +impl<S: StateID> Repr<Vec<S>, S> { + pub fn premultiply(&mut self) -> Result<()> { + if self.premultiplied || self.state_count <= 1 { + return Ok(()); + } + + let alpha_len = self.alphabet_len(); + premultiply_overflow_error( + S::from_usize(self.state_count - 1), + alpha_len, + )?; + + for id in (0..self.state_count).map(S::from_usize) { + for (_, next) in self.get_state_mut(id).iter_mut() { + *next = S::from_usize(next.to_usize() * alpha_len); + } + } + self.premultiplied = true; + self.start = S::from_usize(self.start.to_usize() * alpha_len); + self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len); + Ok(()) + } + + /// Minimize this DFA using Hopcroft's algorithm. + /// + /// This cannot be called on a premultiplied DFA. + pub fn minimize(&mut self) { + assert!(!self.premultiplied, "can't minimize premultiplied DFA"); + + Minimizer::new(self).run(); + } + + /// Set the start state of this DFA. + /// + /// Note that a start state cannot be set on a premultiplied DFA. 
Instead, + /// DFAs should first be completely constructed and then premultiplied. + pub fn set_start_state(&mut self, start: S) { + assert!(!self.premultiplied, "can't set start on premultiplied DFA"); + assert!(start.to_usize() < self.state_count, "invalid start state"); + + self.start = start; + } + + /// Set the maximum state identifier that could possible correspond to a + /// match state. + /// + /// Callers must uphold the invariant that any state identifier less than + /// or equal to the identifier given is either a match state or the special + /// dead state (which always has identifier 0 and whose transitions all + /// lead back to itself). + /// + /// This cannot be called on a premultiplied DFA. + pub fn set_max_match_state(&mut self, id: S) { + assert!(!self.premultiplied, "can't set match on premultiplied DFA"); + assert!(id.to_usize() < self.state_count, "invalid max match state"); + + self.max_match = id; + } + + /// Add the given transition to this DFA. Both the `from` and `to` states + /// must already exist. + /// + /// This cannot be called on a premultiplied DFA. + pub fn add_transition(&mut self, from: S, byte: u8, to: S) { + assert!(!self.premultiplied, "can't add trans to premultiplied DFA"); + assert!(from.to_usize() < self.state_count, "invalid from state"); + assert!(to.to_usize() < self.state_count, "invalid to state"); + + let class = self.byte_classes().get(byte); + let offset = from.to_usize() * self.alphabet_len() + class as usize; + self.trans[offset] = to; + } + + /// An an empty state (a state where all transitions lead to a dead state) + /// and return its identifier. The identifier returned is guaranteed to + /// not point to any other existing state. + /// + /// If adding a state would exhaust the state identifier space (given by + /// `S`), then this returns an error. In practice, this means that the + /// state identifier representation chosen is too small. + /// + /// This cannot be called on a premultiplied DFA. 
+ pub fn add_empty_state(&mut self) -> Result<S> { + assert!(!self.premultiplied, "can't add state to premultiplied DFA"); + + let id = if self.state_count == 0 { + S::from_usize(0) + } else { + next_state_id(S::from_usize(self.state_count - 1))? + }; + let alphabet_len = self.alphabet_len(); + self.trans.extend(iter::repeat(dead_id::<S>()).take(alphabet_len)); + // This should never panic, since state_count is a usize. The + // transition table size would have run out of room long ago. + self.state_count = self.state_count.checked_add(1).unwrap(); + Ok(id) + } + + /// Return a mutable representation of the state corresponding to the given + /// id. This is useful for implementing routines that manipulate DFA states + /// (e.g., swapping states). + /// + /// This cannot be called on a premultiplied DFA. + pub fn get_state_mut(&mut self, id: S) -> StateMut<S> { + assert!(!self.premultiplied, "can't get state in premultiplied DFA"); + + let alphabet_len = self.alphabet_len(); + let offset = id.to_usize() * alphabet_len; + StateMut { + transitions: &mut self.trans[offset..offset + alphabet_len], + } + } + + /// Swap the two states given in the transition table. + /// + /// This routine does not do anything to check the correctness of this + /// swap. Callers must ensure that other states pointing to id1 and id2 are + /// updated appropriately. + /// + /// This cannot be called on a premultiplied DFA. + pub fn swap_states(&mut self, id1: S, id2: S) { + assert!(!self.premultiplied, "can't swap states in premultiplied DFA"); + + let o1 = id1.to_usize() * self.alphabet_len(); + let o2 = id2.to_usize() * self.alphabet_len(); + for b in 0..self.alphabet_len() { + self.trans.swap(o1 + b, o2 + b); + } + } + + /// Truncate the states in this DFA to the given count. + /// + /// This routine does not do anything to check the correctness of this + /// truncation. Callers must ensure that other states pointing to truncated + /// states are updated appropriately. 
+ /// + /// This cannot be called on a premultiplied DFA. + pub fn truncate_states(&mut self, count: usize) { + assert!(!self.premultiplied, "can't truncate in premultiplied DFA"); + + let alphabet_len = self.alphabet_len(); + self.trans.truncate(count * alphabet_len); + self.state_count = count; + } + + /// This routine shuffles all match states in this DFA---according to the + /// given map---to the beginning of the DFA such that every non-match state + /// appears after every match state. (With one exception: the special dead + /// state remains as the first state.) The given map should have length + /// exactly equivalent to the number of states in this DFA. + /// + /// The purpose of doing this shuffling is to avoid the need to store + /// additional state to determine whether a state is a match state or not. + /// It also enables a single conditional in the core matching loop instead + /// of two. + /// + /// This updates `self.max_match` to point to the last matching state as + /// well as `self.start` if the starting state was moved. 
+ pub fn shuffle_match_states(&mut self, is_match: &[bool]) { + assert!( + !self.premultiplied, + "cannot shuffle match states of premultiplied DFA" + ); + assert_eq!(self.state_count, is_match.len()); + + if self.state_count <= 1 { + return; + } + + let mut first_non_match = 1; + while first_non_match < self.state_count && is_match[first_non_match] { + first_non_match += 1; + } + + let mut swaps: Vec<S> = vec![dead_id(); self.state_count]; + let mut cur = self.state_count - 1; + while cur > first_non_match { + if is_match[cur] { + self.swap_states( + S::from_usize(cur), + S::from_usize(first_non_match), + ); + swaps[cur] = S::from_usize(first_non_match); + swaps[first_non_match] = S::from_usize(cur); + + first_non_match += 1; + while first_non_match < cur && is_match[first_non_match] { + first_non_match += 1; + } + } + cur -= 1; + } + for id in (0..self.state_count).map(S::from_usize) { + for (_, next) in self.get_state_mut(id).iter_mut() { + if swaps[next.to_usize()] != dead_id() { + *next = swaps[next.to_usize()]; + } + } + } + if swaps[self.start.to_usize()] != dead_id() { + self.start = swaps[self.start.to_usize()]; + } + self.max_match = S::from_usize(first_non_match - 1); + } +} + +#[cfg(feature = "std")] +impl<T: AsRef<[S]>, S: StateID> fmt::Debug for Repr<T, S> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn state_status<T: AsRef<[S]>, S: StateID>( + dfa: &Repr<T, S>, + id: S, + ) -> &'static str { + if id == dead_id() { + if dfa.is_match_state(id) { + "D*" + } else { + "D " + } + } else if id == dfa.start_state() { + if dfa.is_match_state(id) { + ">*" + } else { + "> " + } + } else { + if dfa.is_match_state(id) { + " *" + } else { + " " + } + } + } + + writeln!(f, "DenseDFA(")?; + for (id, state) in self.states() { + let status = state_status(self, id); + writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?; + } + writeln!(f, ")")?; + Ok(()) + } +} + +/// An iterator over all states in a DFA. 
/// Yields `(identifier, state)` pairs, one per chunk produced by the
/// underlying chunked iterator over the transition table.
///
/// When the DFA is premultiplied, the position of the chunk is multiplied
/// by the alphabet length so that the yielded identifier is premultiplied
/// too.
#[cfg(feature = "std")]
impl<'a, T: AsRef<[S]>, S: StateID> Iterator for StateIter<'a, T, S> {
    type Item = (S, State<'a, S>);

    fn next(&mut self) -> Option<(S, State<'a, S>)> {
        let (index, row) = self.it.next()?;
        let raw_id = if self.dfa.premultiplied {
            index * self.dfa.alphabet_len()
        } else {
            index
        };
        Some((S::from_usize(raw_id), State { transitions: row }))
    }
}
/// Formats a state as a comma separated list of its sparse transitions.
///
/// A range covering a single byte is written as `byte => target`; any
/// other range is written as `start-end => target`. Bytes are rendered
/// with `escape` and targets as plain integers.
#[cfg(feature = "std")]
impl<'a, S: StateID> fmt::Debug for State<'a, S> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let rendered: Vec<String> = self
            .sparse_transitions()
            .map(|(start, end, next_id)| {
                if start == end {
                    format!("{} => {}", escape(start), next_id.to_usize())
                } else {
                    format!(
                        "{}-{} => {}",
                        escape(start),
                        escape(end),
                        next_id.to_usize(),
                    )
                }
            })
            .collect();
        write!(f, "{}", rendered.join(", "))
    }
}
/// Coalesces the dense transitions of a state into inclusive ranges.
///
/// Consecutive dense entries that share a target are merged into a single
/// `(start, end, target)` triple. Ranges whose target is the dead state
/// are dropped instead of being yielded.
#[cfg(feature = "std")]
impl<'a, S: StateID> Iterator for StateSparseTransitionIter<'a, S> {
    type Item = (u8, u8, S);

    fn next(&mut self) -> Option<(u8, u8, S)> {
        for (byte, target) in self.dense.by_ref() {
            match self.cur.take() {
                // First entry seen: open a range of length one.
                None => self.cur = Some((byte, byte, target)),
                // Same target as the open range: extend it.
                Some((lo, _, run)) if run == target => {
                    self.cur = Some((lo, byte, run));
                }
                // Different target: open a new range and report the
                // finished one unless it led to the dead state.
                Some((lo, hi, run)) => {
                    self.cur = Some((byte, byte, target));
                    if run != dead_id() {
                        return Some((lo, hi, run));
                    }
                }
            }
        }
        // The dense iterator is exhausted; flush the last open range.
        match self.cur.take() {
            Some((lo, hi, run)) if run != dead_id() => Some((lo, hi, run)),
            _ => None,
        }
    }
}
/// Yields each transition slot of a state together with its position.
///
/// The position within the state's row is narrowed to a `u8` with an `as`
/// cast and returned next to a mutable reference to the slot.
#[cfg(feature = "std")]
impl<'a, S: StateID> Iterator for StateTransitionIterMut<'a, S> {
    type Item = (u8, &'a mut S);

    fn next(&mut self) -> Option<(u8, &'a mut S)> {
        let (index, slot) = self.it.next()?;
        Some((index as u8, slot))
    }
}
For that information, use a +/// [`Regex`](struct.Regex.html), which can be similarly configured using +/// [`RegexBuilder`](struct.RegexBuilder.html). +#[cfg(feature = "std")] +#[derive(Clone, Debug)] +pub struct Builder { + parser: ParserBuilder, + nfa: nfa::Builder, + anchored: bool, + minimize: bool, + premultiply: bool, + byte_classes: bool, + reverse: bool, + longest_match: bool, +} + +#[cfg(feature = "std")] +impl Builder { + /// Create a new DenseDFA builder with the default configuration. + pub fn new() -> Builder { + let mut nfa = nfa::Builder::new(); + // This is enabled by default, but we set it here anyway. Since we're + // building a DFA, shrinking the NFA is always a good idea. + nfa.shrink(true); + Builder { + parser: ParserBuilder::new(), + nfa, + anchored: false, + minimize: false, + premultiply: true, + byte_classes: true, + reverse: false, + longest_match: false, + } + } + + /// Build a DFA from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> { + self.build_with_size::<usize>(pattern) + } + + /// Build a DFA from the given pattern using a specific representation for + /// the DFA's state IDs. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + /// + /// The representation of state IDs is determined by the `S` type + /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64` + /// or `usize`, where `usize` is the default used for `build`. The purpose + /// of specifying a representation for state IDs is to reduce the memory + /// footprint of a DFA. + /// + /// When using this routine, the chosen state ID representation will be + /// used throughout determinization and minimization, if minimization + /// was requested. 
Even if the minimized DFA can fit into the chosen + /// state ID representation but the initial determinized DFA cannot, + /// then this will still return an error. To get a minimized DFA with a + /// smaller state ID representation, first build it with a bigger state ID + /// representation, and then shrink the size of the DFA using one of its + /// conversion routines, such as + /// [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). + pub fn build_with_size<S: StateID>( + &self, + pattern: &str, + ) -> Result<DenseDFA<Vec<S>, S>> { + self.build_from_nfa(&self.build_nfa(pattern)?) + } + + /// An internal only (for now) API for building a dense DFA directly from + /// an NFA. + pub(crate) fn build_from_nfa<S: StateID>( + &self, + nfa: &NFA, + ) -> Result<DenseDFA<Vec<S>, S>> { + if self.longest_match && !self.anchored { + return Err(Error::unsupported_longest_match()); + } + + let mut dfa = if self.byte_classes { + Determinizer::new(nfa) + .with_byte_classes() + .longest_match(self.longest_match) + .build() + } else { + Determinizer::new(nfa).longest_match(self.longest_match).build() + }?; + if self.minimize { + dfa.minimize(); + } + if self.premultiply { + dfa.premultiply()?; + } + Ok(dfa.into_dense_dfa()) + } + + /// Builds an NFA from the given pattern. + pub(crate) fn build_nfa(&self, pattern: &str) -> Result<NFA> { + let hir = self.parser.build().parse(pattern).map_err(Error::syntax)?; + Ok(self.nfa.build(&hir)?) + } + + /// Set whether matching must be anchored at the beginning of the input. + /// + /// When enabled, a match must begin at the start of the input. When + /// disabled, the DFA will act as if the pattern started with a `.*?`, + /// which enables a match to appear anywhere. + /// + /// By default this is disabled. + pub fn anchored(&mut self, yes: bool) -> &mut Builder { + self.anchored = yes; + self.nfa.anchored(yes); + self + } + + /// Enable or disable the case insensitive flag by default. + /// + /// By default this is disabled. 
It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut Builder { + self.parser.case_insensitive(yes); + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insigificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder { + self.parser.ignore_whitespace(yes); + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder { + self.parser.dot_matches_new_line(yes); + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut Builder { + self.parser.swap_greed(yes); + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. 
+ pub fn unicode(&mut self, yes: bool) -> &mut Builder { + self.parser.unicode(yes); + self + } + + /// When enabled, the builder will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the builder is guaranteed to produce a + /// regex that will only ever match valid UTF-8 (otherwise, the builder + /// will return an error). + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { + self.parser.allow_invalid_utf8(yes); + self.nfa.allow_invalid_utf8(yes); + self + } + + /// Set the nesting limit used for the regular expression parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow when building a finite automaton from a regular expression's + /// abstract syntax tree. In particular, construction currently uses + /// recursion. In the future, the implementation may stop using recursion + /// and this option will no longer be necessary. + /// + /// This limit is not checked until the entire AST is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since the parser will + /// limit itself to heap space proportional to the lenth of the pattern + /// string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation AST item, which results + /// in a nest depth of `1`. 
In general, a nest limit is not something that + /// manifests in an obvious way in the concrete syntax, therefore, it + /// should not be used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut Builder { + self.parser.nest_limit(limit); + self + } + + /// Minimize the DFA. + /// + /// When enabled, the DFA built will be minimized such that it is as small + /// as possible. + /// + /// Whether one enables minimization or not depends on the types of costs + /// you're willing to pay and how much you care about its benefits. In + /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)` + /// space, where `n` is the number of DFA states and `k` is the alphabet + /// size. In practice, minimization can be quite costly in terms of both + /// space and time, so it should only be done if you're willing to wait + /// longer to produce a DFA. In general, you might want a minimal DFA in + /// the following circumstances: + /// + /// 1. You would like to optimize for the size of the automaton. This can + /// manifest in one of two ways. Firstly, if you're converting the + /// DFA into Rust code (or a table embedded in the code), then a minimal + /// DFA will translate into a corresponding reduction in code size, and + /// thus, also the final compiled binary size. Secondly, if you are + /// building many DFAs and putting them on the heap, you'll be able to + /// fit more if they are smaller. Note though that building a minimal + /// DFA itself requires additional space; you only realize the space + /// savings once the minimal DFA is constructed (at which point, the + /// space used for minimization is freed). + /// 2. You've observed that a smaller DFA results in faster match + /// performance. Naively, this isn't guaranteed since there is no + /// inherent difference between matching with a bigger-than-minimal + /// DFA and a minimal DFA. However, a smaller DFA may make use of your + /// CPU's cache more efficiently. + /// 3. 
You are trying to establish an equivalence between regular + /// languages. The standard method for this is to build a minimal DFA + /// for each language and then compare them. If the DFAs are equivalent + /// (up to state renaming), then the languages are equivalent. + /// + /// This option is disabled by default. + pub fn minimize(&mut self, yes: bool) -> &mut Builder { + self.minimize = yes; + self + } + + /// Premultiply state identifiers in the DFA's transition table. + /// + /// When enabled, state identifiers are premultiplied to point to their + /// corresponding row in the DFA's transition table. That is, given the + /// `i`th state, its corresponding premultiplied identifier is `i * k` + /// where `k` is the alphabet size of the DFA. (The alphabet size is at + /// most 256, but is in practice smaller if byte classes is enabled.) + /// + /// When state identifiers are not premultiplied, then the identifier of + /// the `i`th state is `i`. + /// + /// The advantage of premultiplying state identifiers is that is saves + /// a multiplication instruction per byte when searching with the DFA. + /// This has been observed to lead to a 20% performance benefit in + /// micro-benchmarks. + /// + /// The primary disadvantage of premultiplying state identifiers is + /// that they require a larger integer size to represent. For example, + /// if your DFA has 200 states, then its premultiplied form requires + /// 16 bits to represent every possible state identifier, where as its + /// non-premultiplied form only requires 8 bits. + /// + /// This option is enabled by default. + pub fn premultiply(&mut self, yes: bool) -> &mut Builder { + self.premultiply = yes; + self + } + + /// Shrink the size of the DFA's alphabet by mapping bytes to their + /// equivalence classes. + /// + /// When enabled, each DFA will use a map from all possible bytes to their + /// corresponding equivalence class. 
Each equivalence class represents a + /// set of bytes that does not discriminate between a match and a non-match + /// in the DFA. For example, the pattern `[ab]+` has at least two + /// equivalence classes: a set containing `a` and `b` and a set containing + /// every byte except for `a` and `b`. `a` and `b` are in the same + /// equivalence classes because they never discriminate between a match + /// and a non-match. + /// + /// The advantage of this map is that the size of the transition table can + /// be reduced drastically from `#states * 256 * sizeof(id)` to + /// `#states * k * sizeof(id)` where `k` is the number of equivalence + /// classes. As a result, total space usage can decrease substantially. + /// Moreover, since a smaller alphabet is used, compilation becomes faster + /// as well. + /// + /// The disadvantage of this map is that every byte searched must be + /// passed through this map before it can be used to determine the next + /// transition. This has a small match time performance cost. + /// + /// This option is enabled by default. + pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { + self.byte_classes = yes; + self + } + + /// Reverse the DFA. + /// + /// A DFA reversal is performed by reversing all of the concatenated + /// sub-expressions in the original pattern, recursively. The resulting + /// DFA can be used to match the pattern starting from the end of a string + /// instead of the beginning of a string. + /// + /// Generally speaking, a reversed DFA is most useful for finding the start + /// of a match, since a single forward DFA is only capable of finding the + /// end of a match. This start of match handling is done for you + /// automatically if you build a [`Regex`](struct.Regex.html). + pub fn reverse(&mut self, yes: bool) -> &mut Builder { + self.reverse = yes; + self.nfa.reverse(yes); + self + } + + /// Find the longest possible match. 
+ /// + /// This is distinct from the default leftmost-first match semantics in + /// that it treats all NFA states as having equivalent priority. In other + /// words, the longest possible match is always found and it is not + /// possible to implement non-greedy match semantics when this is set. That + /// is, `a+` and `a+?` are equivalent when this is enabled. + /// + /// In particular, a practical issue with this option at the moment is that + /// it prevents unanchored searches from working correctly, since + /// unanchored searches are implemented by prepending a non-greedy `.*?` + /// to the beginning of the pattern. As stated above, non-greedy match + /// semantics aren't supported. Therefore, if this option is enabled and + /// an unanchored search is requested, then building a DFA will return an + /// error. + /// + /// This option is principally useful when building a reverse DFA for + /// finding the start of a match. If you are building a regex with + /// [`RegexBuilder`](struct.RegexBuilder.html), then this is handled for + /// you automatically. The reason why this is necessary for start of match + /// handling is because we want to find the earliest possible starting + /// position of a match to satisfy leftmost-first match semantics. When + /// matching in reverse, this means finding the longest possible match, + /// hence, this option. + /// + /// By default this is disabled. + pub fn longest_match(&mut self, yes: bool) -> &mut Builder { + // There is prior art in RE2 that shows how this can support unanchored + // searches. Instead of treating all NFA states as having equivalent + // priority, we instead group NFA states into sets, and treat members + // of each set as having equivalent priority, but having greater + // priority than all following members of different sets. We then + // essentially assign a higher priority to everything over the prefix + // `.*?`. 
/// Return the given byte as its escaped string form.
///
/// The escaping is exactly that of `std::ascii::escape_default`: tab,
/// carriage return, line feed, single quote, double quote and backslash
/// are backslash-escaped, printable ASCII is passed through unchanged, and
/// every other byte is written as `\xNN`.
fn escape(b: u8) -> String {
    use std::ascii;

    // `escape_default` yields ASCII bytes only, so each byte converts
    // losslessly to a `char`. Collecting the chars directly avoids both
    // the intermediate `Vec<u8>` and a fallible UTF-8 validation step.
    ascii::escape_default(b).map(char::from).collect()
}
but with premultiplication overflows u8 + builder.premultiply(true); + assert!(builder.build_with_size::<u8>(pattern).is_err()); + } + + // let data = ::std::fs::read_to_string("/usr/share/dict/words").unwrap(); + // let mut words: Vec<&str> = data.lines().collect(); + // println!("{} words", words.len()); + // words.sort_by(|w1, w2| w1.len().cmp(&w2.len()).reverse()); + // let pattern = words.join("|"); + // print_automata_counts(&pattern); + // print_automata(&pattern); + + // print_automata(r"[01]*1[01]{5}"); + // print_automata(r"X(.?){0,8}Y"); + // print_automata_counts(r"\p{alphabetic}"); + // print_automata(r"a*b+|cdefg"); + // print_automata(r"(..)*(...)*"); + + // let pattern = r"\p{any}*?\p{Other_Uppercase}"; + // let pattern = r"\p{any}*?\w+"; + // print_automata_counts(pattern); + // print_automata_counts(r"(?-u:\w)"); + + // let pattern = r"\p{Greek}"; + // let pattern = r"zZzZzZzZzZ"; + // let pattern = grapheme_pattern(); + // let pattern = r"\p{Ideographic}"; + // let pattern = r"\w{10}"; // 51784 --> 41264 + // let pattern = r"\w"; // 5182 + // let pattern = r"a*"; + // print_automata(pattern); + // let (_, _, dfa) = build_automata(pattern); +} diff --git a/vendor/regex-automata/src/determinize.rs b/vendor/regex-automata/src/determinize.rs new file mode 100644 index 000000000..cf0c28585 --- /dev/null +++ b/vendor/regex-automata/src/determinize.rs @@ -0,0 +1,286 @@ +use std::collections::HashMap; +use std::mem; +use std::rc::Rc; + +use dense; +use error::Result; +use nfa::{self, NFA}; +use sparse_set::SparseSet; +use state_id::{dead_id, StateID}; + +type DFARepr<S> = dense::Repr<Vec<S>, S>; + +/// A determinizer converts an NFA to a DFA. +/// +/// This determinizer follows the typical powerset construction, where each +/// DFA state is comprised of one or more NFA states. In the worst case, there +/// is one DFA state for every possible combination of NFA states. 
In practice, +/// this only happens in certain conditions, typically when there are bounded +/// repetitions. +/// +/// The type variable `S` refers to the chosen state identifier representation +/// used for the DFA. +/// +/// The lifetime variable `'a` refers to the lifetime of the NFA being +/// converted to a DFA. +#[derive(Debug)] +pub(crate) struct Determinizer<'a, S: StateID> { + /// The NFA we're converting into a DFA. + nfa: &'a NFA, + /// The DFA we're building. + dfa: DFARepr<S>, + /// Each DFA state being built is defined as an *ordered* set of NFA + /// states, along with a flag indicating whether the state is a match + /// state or not. + /// + /// This is never empty. The first state is always a dummy state such that + /// a state id == 0 corresponds to a dead state. + builder_states: Vec<Rc<State>>, + /// A cache of DFA states that already exist and can be easily looked up + /// via ordered sets of NFA states. + cache: HashMap<Rc<State>, S>, + /// Scratch space for a stack of NFA states to visit, for depth first + /// visiting without recursion. + stack: Vec<nfa::StateID>, + /// Scratch space for storing an ordered sequence of NFA states, for + /// amortizing allocation. + scratch_nfa_states: Vec<nfa::StateID>, + /// Whether to build a DFA that finds the longest possible match. + longest_match: bool, +} + +/// An intermediate representation for a DFA state during determinization. +#[derive(Debug, Eq, Hash, PartialEq)] +struct State { + /// Whether this state is a match state or not. + is_match: bool, + /// An ordered sequence of NFA states that make up this DFA state. + nfa_states: Vec<nfa::StateID>, +} + +impl<'a, S: StateID> Determinizer<'a, S> { + /// Create a new determinizer for converting the given NFA to a DFA. 
+ pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { + let dead = Rc::new(State::dead()); + let mut cache = HashMap::default(); + cache.insert(dead.clone(), dead_id()); + + Determinizer { + nfa, + dfa: DFARepr::empty().anchored(nfa.is_anchored()), + builder_states: vec![dead], + cache, + stack: vec![], + scratch_nfa_states: vec![], + longest_match: false, + } + } + + /// Instruct the determinizer to use equivalence classes as the transition + /// alphabet instead of all possible byte values. + pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { + let byte_classes = self.nfa.byte_classes().clone(); + self.dfa = DFARepr::empty_with_byte_classes(byte_classes) + .anchored(self.nfa.is_anchored()); + self + } + + /// Instruct the determinizer to build a DFA that recognizes the longest + /// possible match instead of the leftmost first match. This is useful when + /// constructing reverse DFAs for finding the start of a match. + pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { + self.longest_match = yes; + self + } + + /// Build the DFA. If there was a problem constructing the DFA (e.g., if + /// the chosen state identifier representation is too small), then an error + /// is returned. + pub fn build(mut self) -> Result<DFARepr<S>> { + let representative_bytes: Vec<u8> = + self.dfa.byte_classes().representatives().collect(); + let mut sparse = self.new_sparse_set(); + let mut uncompiled = vec![self.add_start(&mut sparse)?]; + while let Some(dfa_id) = uncompiled.pop() { + for &b in &representative_bytes { + let (next_dfa_id, is_new) = + self.cached_state(dfa_id, b, &mut sparse)?; + self.dfa.add_transition(dfa_id, b, next_dfa_id); + if is_new { + uncompiled.push(next_dfa_id); + } + } + } + + // At this point, we shuffle the matching states in the final DFA to + // the beginning. 
This permits a DFA's match loop to detect a match + // condition by merely inspecting the current state's identifier, and + // avoids the need for any additional auxiliary storage. + let is_match: Vec<bool> = + self.builder_states.iter().map(|s| s.is_match).collect(); + self.dfa.shuffle_match_states(&is_match); + Ok(self.dfa) + } + + /// Return the identifier for the next DFA state given an existing DFA + /// state and an input byte. If the next DFA state already exists, then + /// return its identifier from the cache. Otherwise, build the state, cache + /// it and return its identifier. + /// + /// The given sparse set is used for scratch space. It must have a capacity + /// equivalent to the total number of NFA states, but its contents are + /// otherwise unspecified. + /// + /// This routine returns a boolean indicating whether a new state was + /// built. If a new state is built, then the caller needs to add it to its + /// frontier of uncompiled DFA states to compute transitions for. + fn cached_state( + &mut self, + dfa_id: S, + b: u8, + sparse: &mut SparseSet, + ) -> Result<(S, bool)> { + sparse.clear(); + // Compute the set of all reachable NFA states, including epsilons. + self.next(dfa_id, b, sparse); + // Build a candidate state and check if it has already been built. + let state = self.new_state(sparse); + if let Some(&cached_id) = self.cache.get(&state) { + // Since we have a cached state, put the constructed state's + // memory back into our scratch space, so that it can be reused. + let _ = + mem::replace(&mut self.scratch_nfa_states, state.nfa_states); + return Ok((cached_id, false)); + } + // Nothing was in the cache, so add this state to the cache. + self.add_state(state).map(|s| (s, true)) + } + + /// Compute the set of all reachable NFA states, including the full epsilon + /// closure, from a DFA state for a single byte of input. 
+ fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) { + next_nfa_states.clear(); + for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { + let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; + match *self.nfa.state(nfa_id) { + nfa::State::Union { .. } + | nfa::State::Fail + | nfa::State::Match => {} + nfa::State::Range { range: ref r } => { + if r.start <= b && b <= r.end { + self.epsilon_closure(r.next, next_nfa_states); + } + } + nfa::State::Sparse { ref ranges } => { + for r in ranges.iter() { + if r.start > b { + break; + } else if r.start <= b && b <= r.end { + self.epsilon_closure(r.next, next_nfa_states); + break; + } + } + } + } + } + } + + /// Compute the epsilon closure for the given NFA state. + fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { + if !self.nfa.state(start).is_epsilon() { + set.insert(start); + return; + } + + self.stack.push(start); + while let Some(mut id) = self.stack.pop() { + loop { + if set.contains(id) { + break; + } + set.insert(id); + match *self.nfa.state(id) { + nfa::State::Range { .. } + | nfa::State::Sparse { .. } + | nfa::State::Fail + | nfa::State::Match => break, + nfa::State::Union { ref alternates } => { + id = match alternates.get(0) { + None => break, + Some(&id) => id, + }; + self.stack.extend(alternates[1..].iter().rev()); + } + } + } + } + } + + /// Compute the initial DFA state and return its identifier. + /// + /// The sparse set given is used for scratch space, and must have capacity + /// equal to the total number of NFA states. Its contents are unspecified. + fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> { + sparse.clear(); + self.epsilon_closure(self.nfa.start(), sparse); + let state = self.new_state(&sparse); + let id = self.add_state(state)?; + self.dfa.set_start_state(id); + Ok(id) + } + + /// Add the given state to the DFA and make it available in the cache. + /// + /// The state initially has no transitions. 
That is, it transitions to the + /// dead state for all possible inputs. + fn add_state(&mut self, state: State) -> Result<S> { + let id = self.dfa.add_empty_state()?; + let rstate = Rc::new(state); + self.builder_states.push(rstate.clone()); + self.cache.insert(rstate, id); + Ok(id) + } + + /// Convert the given set of ordered NFA states to a DFA state. + fn new_state(&mut self, set: &SparseSet) -> State { + let mut state = State { + is_match: false, + nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), + }; + state.nfa_states.clear(); + + for &id in set { + match *self.nfa.state(id) { + nfa::State::Range { .. } => { + state.nfa_states.push(id); + } + nfa::State::Sparse { .. } => { + state.nfa_states.push(id); + } + nfa::State::Fail => { + break; + } + nfa::State::Match => { + state.is_match = true; + if !self.longest_match { + break; + } + } + nfa::State::Union { .. } => {} + } + } + state + } + + /// Create a new sparse set with enough capacity to hold all NFA states. + fn new_sparse_set(&self) -> SparseSet { + SparseSet::new(self.nfa.len()) + } +} + +impl State { + /// Create a new empty dead state. + fn dead() -> State { + State { nfa_states: vec![], is_match: false } + } +} diff --git a/vendor/regex-automata/src/dfa.rs b/vendor/regex-automata/src/dfa.rs new file mode 100644 index 000000000..43de3461f --- /dev/null +++ b/vendor/regex-automata/src/dfa.rs @@ -0,0 +1,363 @@ +use state_id::StateID; + +/// A trait describing the interface of a deterministic finite automaton (DFA). +/// +/// Every DFA has exactly one start state and at least one dead state (which +/// may be the same, as in the case of an empty DFA). In all cases, a state +/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)` +/// always returns `true`. +/// +/// Every DFA also has zero or more match states, such that +/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to +/// a match state. 
+/// +/// In general, users of this trait likely will only need to use the search +/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other +/// methods are lower level and are used for walking the transitions of a DFA +/// manually. In particular, the aforementioned search routines are implemented +/// generically in terms of the lower level transition walking routines. +pub trait DFA { + /// The representation used for state identifiers in this DFA. + /// + /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. + type ID: StateID; + + /// Return the identifier of this DFA's start state. + fn start_state(&self) -> Self::ID; + + /// Returns true if and only if the given identifier corresponds to a match + /// state. + fn is_match_state(&self, id: Self::ID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a dead + /// state. When a DFA enters a dead state, it is impossible to leave and + /// thus can never lead to a match. + fn is_dead_state(&self, id: Self::ID) -> bool; + + /// Returns true if and only if the given identifier corresponds to either + /// a dead state or a match state, such that one of `is_match_state(id)` + /// or `is_dead_state(id)` must return true. + /// + /// Depending on the implementation of the DFA, this routine can be used + /// to save a branch in the core matching loop. Nevertheless, + /// `is_match_state(id) || is_dead_state(id)` is always a valid + /// implementation. + fn is_match_or_dead_state(&self, id: Self::ID) -> bool; + + /// Returns true if and only if this DFA is anchored. + /// + /// When a DFA is anchored, it is only allowed to report matches that + /// start at index `0`. + fn is_anchored(&self) -> bool; + + /// Given the current state that this DFA is in and the next input byte, + /// this method returns the identifier of the next state. The identifier + /// returned is always valid, but it may correspond to a dead state. 
+ fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; + + /// Like `next_state`, but its implementation may look up the next state + /// without memory safety checks such as bounds checks. As such, callers + /// must ensure that the given identifier corresponds to a valid DFA + /// state. Implementors must, in turn, ensure that this routine is safe + /// for all valid state identifiers and for all possible `u8` values. + unsafe fn next_state_unchecked( + &self, + current: Self::ID, + input: u8, + ) -> Self::ID; + + /// Returns true if and only if the given bytes match this DFA. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if a DFA enters + /// a match state or a dead state, then this routine will return `true` or + /// `false`, respectively, without inspecting any future input. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`DenseDFA`](enum.DenseDFA.html). + /// + /// ``` + /// use regex_automata::{DFA, DenseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa = DenseDFA::new("foo[0-9]+bar")?; + /// assert_eq!(true, dfa.is_match(b"foo12345bar")); + /// assert_eq!(false, dfa.is_match(b"foobar")); + /// # Ok(()) }; example().unwrap() + /// ``` + #[inline] + fn is_match(&self, bytes: &[u8]) -> bool { + self.is_match_at(bytes, 0) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`DenseDFA`](enum.DenseDFA.html). 
+ /// + /// ``` + /// use regex_automata::{DFA, DenseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa = DenseDFA::new("foo[0-9]+")?; + /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345")); + /// + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the shortest match semantics detect a match earlier. + /// let dfa = DenseDFA::new("abc|a")?; + /// assert_eq!(Some(1), dfa.shortest_match(b"abc")); + /// # Ok(()) }; example().unwrap() + /// ``` + #[inline] + fn shortest_match(&self, bytes: &[u8]) -> Option<usize> { + self.shortest_match_at(bytes, 0) + } + + /// Returns the end offset of the longest match. If no match exists, + /// then `None` is returned. + /// + /// Implementors of this trait are not required to implement any particular + /// match semantics (such as leftmost-first), which are instead manifest in + /// the DFA's topology itself. + /// + /// In particular, this method must continue searching even after it + /// enters a match state. The search should only terminate once it has + /// reached the end of the input or when it has entered a dead state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses + /// "leftmost first" match semantics. + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. 
This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. + /// + /// ``` + /// use regex_automata::{DFA, DenseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa = DenseDFA::new("foo[0-9]+")?; + /// assert_eq!(Some(8), dfa.find(b"foo12345")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. + /// let dfa = DenseDFA::new("abc|a")?; + /// assert_eq!(Some(3), dfa.find(b"abc")); + /// # Ok(()) }; example().unwrap() + /// ``` + #[inline] + fn find(&self, bytes: &[u8]) -> Option<usize> { + self.find_at(bytes, 0) + } + + /// Returns the start offset of the longest match in reverse, by searching + /// from the end of the input towards the start of the input. If no match + /// exists, then `None` is returned. In other words, this has the same + /// match semantics as `find`, but in reverse. + /// + /// # Example + /// + /// This example shows how to use this method with a + /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine + /// is principally useful when used in conjunction with the + /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse) + /// configuration knob. In general, it's unlikely to be correct to use both + /// `find` and `rfind` with the same DFA since any particular DFA will only + /// support searching in one direction. 
+ /// + /// ``` + /// use regex_automata::{dense, DFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?; + /// assert_eq!(Some(0), dfa.rfind(b"foo12345")); + /// # Ok(()) }; example().unwrap() + /// ``` + #[inline] + fn rfind(&self, bytes: &[u8]) -> Option<usize> { + self.rfind_at(bytes, bytes.len()) + } + + /// Returns the same as `is_match`, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, if the DFA is anchored, then + /// a match can only occur when `start == 0`. + #[inline] + fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { + if self.is_anchored() && start > 0 { + return false; + } + + let mut state = self.start_state(); + if self.is_match_or_dead_state(state) { + return self.is_match_state(state); + } + for &b in bytes[start..].iter() { + state = unsafe { self.next_state_unchecked(state, b) }; + if self.is_match_or_dead_state(state) { + return self.is_match_state(state); + } + } + false + } + + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, if the DFA is anchored, then + /// a match can only occur when `start == 0`. 
+ #[inline] + fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + if self.is_anchored() && start > 0 { + return None; + } + + let mut state = self.start_state(); + if self.is_match_or_dead_state(state) { + return if self.is_dead_state(state) { None } else { Some(start) }; + } + for (i, &b) in bytes[start..].iter().enumerate() { + state = unsafe { self.next_state_unchecked(state, b) }; + if self.is_match_or_dead_state(state) { + return if self.is_dead_state(state) { + None + } else { + Some(start + i + 1) + }; + } + } + None + } + + /// Returns the same as `find`, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, if the DFA is anchored, then + /// a match can only occur when `start == 0`. + #[inline] + fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + if self.is_anchored() && start > 0 { + return None; + } + + let mut state = self.start_state(); + let mut last_match = if self.is_dead_state(state) { + return None; + } else if self.is_match_state(state) { + Some(start) + } else { + None + }; + for (i, &b) in bytes[start..].iter().enumerate() { + state = unsafe { self.next_state_unchecked(state, b) }; + if self.is_match_or_dead_state(state) { + if self.is_dead_state(state) { + return last_match; + } + last_match = Some(start + i + 1); + } + } + last_match + } + + /// Returns the same as `rfind`, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, if the DFA is anchored, then + /// a match can only occur when `start == bytes.len()`. 
+ #[inline(never)] + fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + if self.is_anchored() && start < bytes.len() { + return None; + } + + let mut state = self.start_state(); + let mut last_match = if self.is_dead_state(state) { + return None; + } else if self.is_match_state(state) { + Some(start) + } else { + None + }; + for (i, &b) in bytes[..start].iter().enumerate().rev() { + state = unsafe { self.next_state_unchecked(state, b) }; + if self.is_match_or_dead_state(state) { + if self.is_dead_state(state) { + return last_match; + } + last_match = Some(i); + } + } + last_match + } +} + +impl<'a, T: DFA> DFA for &'a T { + type ID = T::ID; + + #[inline] + fn start_state(&self) -> Self::ID { + (**self).start_state() + } + + #[inline] + fn is_match_state(&self, id: Self::ID) -> bool { + (**self).is_match_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: Self::ID) -> bool { + (**self).is_match_or_dead_state(id) + } + + #[inline] + fn is_dead_state(&self, id: Self::ID) -> bool { + (**self).is_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + (**self).is_anchored() + } + + #[inline] + fn next_state(&self, current: Self::ID, input: u8) -> Self::ID { + (**self).next_state(current, input) + } + + #[inline] + unsafe fn next_state_unchecked( + &self, + current: Self::ID, + input: u8, + ) -> Self::ID { + (**self).next_state_unchecked(current, input) + } +} diff --git a/vendor/regex-automata/src/error.rs b/vendor/regex-automata/src/error.rs new file mode 100644 index 000000000..70fe436ea --- /dev/null +++ b/vendor/regex-automata/src/error.rs @@ -0,0 +1,150 @@ +use std::error; +use std::fmt; +use std::result; + +use regex_syntax; + +pub type Result<T> = result::Result<T, Error>; + +/// An error that occurred during the construction of a DFA. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +/// The kind of error that occurred. 
+#[derive(Clone, Debug)] +pub enum ErrorKind { + /// An error that occurred while parsing a regular expression. Note that + /// this error may be printed over multiple lines, and is generally + /// intended to be end user readable on its own. + Syntax(String), + /// An error that occurred because an unsupported regex feature was used. + /// The message string describes which unsupported feature was used. + /// + /// The primary regex features that are unsupported are those that require + /// look-around, such as the `^` and `$` anchors and the word boundary + /// assertion `\b`. These may be supported in the future. + Unsupported(String), + /// An error that occurred when attempting to serialize a DFA to bytes. + Serialize(String), + /// An error that occurs when constructing a DFA would require the use of + /// a state ID that overflows the chosen state ID representation. For + /// example, if one is using `u8` for state IDs and builds a DFA with + /// 257 states, then the last state's ID will be `256` which cannot be + /// represented with `u8`. + /// + /// Typically, this error occurs in the determinization process of building + /// a DFA (the conversion step from NFA to DFA). It can also occur when + /// trying to build a smaller DFA from an existing one. + StateIDOverflow { + /// The maximum possible state ID. + max: usize, + }, + /// An error that occurs when premultiplication of state IDs is requested, + /// but doing so would overflow the chosen state ID representation. + /// + /// When `max == requested_max`, then the state ID would overflow `usize`. + PremultiplyOverflow { + /// The maximum possible state id. + max: usize, + /// The maximum ID required by premultiplication. + requested_max: usize, + }, +} + +impl Error { + /// Return the kind of this error. 
+ pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub(crate) fn syntax(err: regex_syntax::Error) -> Error { + Error { kind: ErrorKind::Syntax(err.to_string()) } + } + + pub(crate) fn unsupported_anchor() -> Error { + let msg = r"anchors such as ^, $, \A and \z are not supported"; + Error { kind: ErrorKind::Unsupported(msg.to_string()) } + } + + pub(crate) fn unsupported_word() -> Error { + let msg = r"word boundary assertions (\b and \B) are not supported"; + Error { kind: ErrorKind::Unsupported(msg.to_string()) } + } + + pub(crate) fn unsupported_longest_match() -> Error { + let msg = "unachored searches with longest match \ + semantics are not supported"; + Error { kind: ErrorKind::Unsupported(msg.to_string()) } + } + + pub(crate) fn serialize(message: &str) -> Error { + Error { kind: ErrorKind::Serialize(message.to_string()) } + } + + pub(crate) fn state_id_overflow(max: usize) -> Error { + Error { kind: ErrorKind::StateIDOverflow { max } } + } + + pub(crate) fn premultiply_overflow( + max: usize, + requested_max: usize, + ) -> Error { + Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } } + } +} + +impl error::Error for Error { + fn description(&self) -> &str { + match self.kind { + ErrorKind::Syntax(_) => "syntax error", + ErrorKind::Unsupported(_) => "unsupported syntax", + ErrorKind::Serialize(_) => "serialization error", + ErrorKind::StateIDOverflow { .. } => { + "state id representation too small" + } + ErrorKind::PremultiplyOverflow { .. 
} => { + "state id representation too small for premultiplication" + } + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + ErrorKind::Syntax(ref msg) => write!(f, "{}", msg), + ErrorKind::Unsupported(ref msg) => write!(f, "{}", msg), + ErrorKind::Serialize(ref msg) => { + write!(f, "DFA serialization error: {}", msg) + } + ErrorKind::StateIDOverflow { max } => write!( + f, + "building the DFA failed because it required building \ + more states than can be identified, where the maximum \ + ID for the chosen representation is {}", + max, + ), + ErrorKind::PremultiplyOverflow { max, requested_max } => { + if max == requested_max { + write!( + f, + "premultiplication of states requires the ability to \ + represent a state ID greater than what can fit on \ + this platform's usize, which is {}", + ::std::usize::MAX, + ) + } else { + write!( + f, + "premultiplication of states requires the ability to \ + represent at least a state ID of {}, but the chosen \ + representation only permits a maximum state ID of {}", + requested_max, max, + ) + } + } + } + } +} diff --git a/vendor/regex-automata/src/lib.rs b/vendor/regex-automata/src/lib.rs new file mode 100644 index 000000000..7894eccea --- /dev/null +++ b/vendor/regex-automata/src/lib.rs @@ -0,0 +1,360 @@ +/*! +A low level regular expression library that uses deterministic finite automata. +It supports a rich syntax with Unicode support, has extensive options for +configuring the best space vs time trade off for your use case and provides +support for cheap deserialization of automata for use in `no_std` environments. + +# Overview + +This section gives a brief overview of the primary types in this crate: + +* A [`Regex`](struct.Regex.html) provides a way to search for matches of a + regular expression. This includes iterating over matches with both the start + and end positions of each match. 
+* A [`RegexBuilder`](struct.RegexBuilder.html) provides a way to configure many + compilation options for a regex. +* A [`DenseDFA`](enum.DenseDFA.html) provides low level access to a DFA that + uses a dense representation (uses lots of space, but fast searching). +* A [`SparseDFA`](enum.SparseDFA.html) provides the same API as a `DenseDFA`, + but uses a sparse representation (uses less space, but slower matching). +* A [`DFA`](trait.DFA.html) trait that defines an interface that all DFAs must + implement. +* Both dense DFAs and sparse DFAs support + [serialization to raw bytes](enum.DenseDFA.html#method.to_bytes_little_endian) + and + [cheap deserialization](enum.DenseDFA.html#method.from_bytes). + +# Example: basic regex searching + +This example shows how to compile a regex using the default configuration +and then use it to find matches in a byte string: + +``` +use regex_automata::Regex; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); +assert_eq!(matches, vec![(0, 10), (11, 21)]); +``` + +# Example: use sparse DFAs + +By default, compiling a regex will use dense DFAs internally. This uses more +memory, but executes searches more quickly. If you can abide slower searches +(somewhere around 3-5x), then sparse DFAs might make more sense since they can +use significantly less space. + +Using sparse DFAs is as easy as using `Regex::new_sparse` instead of +`Regex::new`: + +``` +use regex_automata::Regex; + +# fn example() -> Result<(), regex_automata::Error> { +let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); +assert_eq!(matches, vec![(0, 10), (11, 21)]); +# Ok(()) }; example().unwrap() +``` + +If you already have dense DFAs for some reason, they can be converted to sparse +DFAs and used to build a new `Regex`. 
For example: + +``` +use regex_automata::Regex; + +# fn example() -> Result<(), regex_automata::Error> { +let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let sparse_re = Regex::from_dfas( + dense_re.forward().to_sparse()?, + dense_re.reverse().to_sparse()?, +); +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<(usize, usize)> = sparse_re.find_iter(text).collect(); +assert_eq!(matches, vec![(0, 10), (11, 21)]); +# Ok(()) }; example().unwrap() +``` + +# Example: deserialize a DFA + +This shows how to first serialize a DFA into raw bytes, and then deserialize +those raw bytes back into a DFA. While this particular example is a bit +contrived, this same technique can be used in your program to deserialize a +DFA at start up time or by memory mapping a file. In particular, +deserialization is guaranteed to be cheap because it will always be a constant +time operation. + +``` +use regex_automata::{DenseDFA, Regex}; + +# fn example() -> Result<(), regex_automata::Error> { +let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +// serialize both the forward and reverse DFAs, see note below +let fwd_bytes = re1.forward().to_u16()?.to_bytes_native_endian()?; +let rev_bytes = re1.reverse().to_u16()?.to_bytes_native_endian()?; +// now deserialize both---we need to specify the correct type! +let fwd: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&fwd_bytes) }; +let rev: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&rev_bytes) }; +// finally, reconstruct our regex +let re2 = Regex::from_dfas(fwd, rev); + +// we can use it like normal +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<(usize, usize)> = re2.find_iter(text).collect(); +assert_eq!(matches, vec![(0, 10), (11, 21)]); +# Ok(()) }; example().unwrap() +``` + +There are a few points worth noting here: + +* We need to extract the raw DFAs used by the regex and serialize those. 
You + can build the DFAs manually yourself using + [`dense::Builder`](dense/struct.Builder.html), but using the DFAs from a + `Regex` guarantees that the DFAs are built correctly. +* We specifically convert the dense DFA to a representation that uses `u16` + for its state identifiers using + [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). While this isn't + strictly necessary, if we skipped this step, then the serialized bytes would + use `usize` for state identifiers, which does not have a fixed size. Using + `u16` ensures that we can deserialize this DFA even on platforms with a + smaller pointer size. If our DFA is too big for `u16` state identifiers, then + one can use `u32` or `u64`. +* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` + method. In practice, you'll want to use either + [`DenseDFA::to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian) + or + [`DenseDFA::to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian), + depending on which platform you're deserializing your DFA from. If you intend + to deserialize on either platform, then you'll need to serialize both and + deserialize the right one depending on your target's endianness. +* Deserializing a DFA requires the use of `unsafe` because the raw bytes must + be *trusted*. In particular, while some degree of sanity checks are + performed, nothing guarantees the integrity of the DFA's transition table + since deserialization is a constant time operation. Since searching with a + DFA must be able to follow transitions blindly for performance reasons, + giving incorrect bytes to the deserialization API can result in memory + unsafety. 
+ +The same process can be achieved with sparse DFAs as well: + +``` +use regex_automata::{SparseDFA, Regex}; + +# fn example() -> Result<(), regex_automata::Error> { +let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +// serialize both +let fwd_bytes = re1.forward().to_u16()?.to_sparse()?.to_bytes_native_endian()?; +let rev_bytes = re1.reverse().to_u16()?.to_sparse()?.to_bytes_native_endian()?; +// now deserialize both---we need to specify the correct type! +let fwd: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&fwd_bytes) }; +let rev: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&rev_bytes) }; +// finally, reconstruct our regex +let re2 = Regex::from_dfas(fwd, rev); + +// we can use it like normal +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<(usize, usize)> = re2.find_iter(text).collect(); +assert_eq!(matches, vec![(0, 10), (11, 21)]); +# Ok(()) }; example().unwrap() +``` + +Note that unlike dense DFAs, sparse DFAs have no alignment requirements. +Conversely, dense DFAs must be aligned to the same alignment as their +state identifier representation. + +# Support for `no_std` + +This crate comes with a `std` feature that is enabled by default. When the +`std` feature is enabled, the API of this crate will include the facilities +necessary for compiling, serializing, deserializing and searching with regular +expressions. When the `std` feature is disabled, the API of this crate will +shrink such that it only includes the facilities necessary for deserializing +and searching with regular expressions. + +The intended workflow for `no_std` environments is thus as follows: + +* Write a program with the `std` feature that compiles and serializes a + regular expression. Serialization should only happen after first converting + the DFAs to use a fixed size state identifier instead of the default `usize`. + You may also need to serialize both little and big endian versions of each + DFA. 
(So that's 4 DFAs in total for each regex.) +* In your `no_std` environment, follow the examples above for deserializing + your previously serialized DFAs into regexes. You can then search with them + as you would any regex. + +Deserialization can happen anywhere. For example, with bytes embedded into a +binary or with a file memory mapped at runtime. + +Note that the +[`ucd-generate`](https://github.com/BurntSushi/ucd-generate) +tool will do the first step for you with its `dfa` or `regex` sub-commands. + +# Syntax + +This crate supports the same syntax as the `regex` crate, since they share the +same parser. You can find an exhaustive list of supported syntax in the +[documentation for the `regex` crate](https://docs.rs/regex/1.1/regex/#syntax). + +Currently, there are a couple limitations. In general, this crate does not +support zero-width assertions, although they may be added in the future. This +includes: + +* Anchors such as `^`, `$`, `\A` and `\z`. +* Word boundary assertions such as `\b` and `\B`. + +It is possible to run a search that is anchored at the beginning of the input. +To do that, set the +[`RegexBuilder::anchored`](struct.RegexBuilder.html#method.anchored) +option when building a regex. By default, all searches are unanchored. + +# Differences with the regex crate + +The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a +general purpose regular expression engine. It aims to automatically balance low +compile times, fast search times and low memory usage, while also providing +a convenient API for users. In contrast, this crate provides a lower level +regular expression interface that is a bit less convenient while providing more +explicit control over memory usage and search times. + +Here are some specific negative differences: + +* **Compilation can take an exponential amount of time and space** in the size + of the regex pattern. While most patterns do not exhibit worst case + exponential time, such patterns do exist. 
For example, `[01]*1[01]{N}` will + build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should + not be compiled with this library. (In the future, the API may expose an + option to return an error if the DFA gets too big.) +* This crate does not support sub-match extraction, which can be achieved with + the regex crate's "captures" API. This may be added in the future, but is + unlikely. +* While the regex crate doesn't necessarily sport fast compilation times, the + regexes in this crate are almost universally slow to compile, especially when + they contain large Unicode character classes. For example, on my system, + compiling `\w{3}` with byte classes enabled takes just over 1 second and + almost 5MB of memory! (Compiling a sparse regex takes about the same time + but only uses about 500KB of memory.) Conversely, compiling the same regex + without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and + less than 5KB of memory. For this reason, you should only use Unicode + character classes if you absolutely need them! +* This crate does not support regex sets. +* This crate does not support zero-width assertions such as `^`, `$`, `\b` or + `\B`. +* As a lower level crate, this library does not do literal optimizations. In + exchange, you get predictable performance regardless of input. The + philosophy here is that literal optimizations should be applied at a higher + level, although there is no easy support for this in the ecosystem yet. +* There is no `&str` API like in the regex crate. In this crate, all APIs + operate on `&[u8]`. By default, match indices are guaranteed to fall on + UTF-8 boundaries, unless + [`RegexBuilder::allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8) + is enabled. + +With some of the downsides out of the way, here are some positive differences: + +* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply + deserialized.
Deserialization always takes constant time since searching can + be performed directly on the raw serialized bytes of a DFA. +* This crate was specifically designed so that the searching phase of a DFA has + minimal runtime requirements, and can therefore be used in `no_std` + environments. While `no_std` environments cannot compile regexes, they can + deserialize pre-compiled regexes. +* Since this crate builds DFAs ahead of time, it will generally out-perform + the `regex` crate on equivalent tasks. The performance difference is likely + not large. However, because of a complex set of optimizations in the regex + crate (like literal optimizations), an accurate performance comparison may be + difficult to do. +* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search + performance a small amount, but uses much less storage space. Potentially + even less than what the regex crate uses. +* This crate exposes DFAs directly, such as + [`DenseDFA`](enum.DenseDFA.html) and [`SparseDFA`](enum.SparseDFA.html), + which enables one to do less work in some cases. For example, if you only + need the end of a match and not the start of a match, then you can use a DFA + directly without building a `Regex`, which always requires a second DFA to + find the start of a match. +* Aside from choosing between dense and sparse DFAs, there are several options + for configuring the space usage vs search time trade off. These include + things like choosing a smaller state identifier representation, to + premultiplying state identifiers and splitting a DFA's alphabet into + equivalence classes. Finally, DFA minimization is also provided, but can + increase compilation times dramatically. 
+*/ + +#![deny(missing_docs)] +#![cfg_attr(not(feature = "std"), no_std)] + +#[cfg(feature = "std")] +extern crate core; + +#[cfg(all(test, feature = "transducer"))] +extern crate bstr; +#[cfg(feature = "transducer")] +extern crate fst; +#[cfg(feature = "std")] +extern crate regex_syntax; + +pub use dense::DenseDFA; +pub use dfa::DFA; +#[cfg(feature = "std")] +pub use error::{Error, ErrorKind}; +pub use regex::Regex; +#[cfg(feature = "std")] +pub use regex::RegexBuilder; +pub use sparse::SparseDFA; +pub use state_id::StateID; + +mod byteorder; +mod classes; +#[path = "dense.rs"] +mod dense_imp; +#[cfg(feature = "std")] +mod determinize; +mod dfa; +#[cfg(feature = "std")] +mod error; +#[cfg(feature = "std")] +mod minimize; +#[cfg(feature = "std")] +#[doc(hidden)] +pub mod nfa; +mod regex; +#[path = "sparse.rs"] +mod sparse_imp; +#[cfg(feature = "std")] +mod sparse_set; +mod state_id; +#[cfg(feature = "transducer")] +mod transducer; + +/// Types and routines specific to dense DFAs. +/// +/// This module is the home of [`DenseDFA`](enum.DenseDFA.html) and each of its +/// corresponding variant DFA types, such as [`Standard`](struct.Standard.html) +/// and [`ByteClass`](struct.ByteClass.html). +/// +/// This module also contains a [builder](struct.Builder.html) for +/// configuring the construction of a dense DFA. +pub mod dense { + pub use dense_imp::*; +} + +/// Types and routines specific to sparse DFAs. +/// +/// This module is the home of [`SparseDFA`](enum.SparseDFA.html) and each of +/// its corresponding variant DFA types, such as +/// [`Standard`](struct.Standard.html) and +/// [`ByteClass`](struct.ByteClass.html). +/// +/// Unlike the [`dense`](../dense/index.html) module, this module does not +/// contain a builder specific for sparse DFAs. 
Instead, the intended way to +/// build a sparse DFA is either by using a default configuration with its +/// [constructor](enum.SparseDFA.html#method.new), +/// or by first +/// [configuring the construction of a dense DFA](../dense/struct.Builder.html) +/// and then calling +/// [`DenseDFA::to_sparse`](../enum.DenseDFA.html#method.to_sparse). +pub mod sparse { + pub use sparse_imp::*; +} diff --git a/vendor/regex-automata/src/minimize.rs b/vendor/regex-automata/src/minimize.rs new file mode 100644 index 000000000..ededa5f66 --- /dev/null +++ b/vendor/regex-automata/src/minimize.rs @@ -0,0 +1,373 @@ +use std::cell::RefCell; +use std::fmt; +use std::mem; +use std::rc::Rc; + +use dense; +use state_id::{dead_id, StateID}; + +type DFARepr<S> = dense::Repr<Vec<S>, S>; + +/// An implementation of Hopcroft's algorithm for minimizing DFAs. +/// +/// The algorithm implemented here is mostly taken from Wikipedia: +/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm +/// +/// This code has had some light optimization attention paid to it, +/// particularly in the form of reducing allocation as much as possible. +/// However, it is still generally slow. Future optimization work should +/// probably focus on the bigger picture rather than micro-optimizations. For +/// example: +/// +/// 1. Figure out how to more intelligently create initial partitions. That is, +/// Hopcroft's algorithm starts by creating two partitions of DFA states +/// that are known to NOT be equivalent: match states and non-match states. +/// The algorithm proceeds by progressively refining these partitions into +/// smaller partitions. If we could start with more partitions, then we +/// could reduce the amount of work that Hopcroft's algorithm needs to do. +/// 2. For every partition that we visit, we find all incoming transitions to +/// every state in the partition for *every* element in the alphabet. 
(This +/// is why using byte classes can significantly decrease minimization times, +/// since byte classes shrink the alphabet.) This is quite costly and there +/// is perhaps some redundant work being performed depending on the specific +/// states in the set. For example, we might be able to only visit some +/// elements of the alphabet based on the transitions. +/// 3. Move parts of minimization into determinization. If minimization has +/// fewer states to deal with, then it should run faster. A prime example +/// of this might be large Unicode classes, which are generated in a way that +/// can create a lot of redundant states. (Some work has been done on this +/// point during NFA compilation via the algorithm described in the +/// "Incremental Construction of Minimal Acyclic Finite-State Automata" +/// paper.) +pub(crate) struct Minimizer<'a, S: 'a> { + dfa: &'a mut DFARepr<S>, + in_transitions: Vec<Vec<Vec<S>>>, + partitions: Vec<StateSet<S>>, + waiting: Vec<StateSet<S>>, +} + +impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Minimizer") + .field("dfa", &self.dfa) + .field("in_transitions", &self.in_transitions) + .field("partitions", &self.partitions) + .field("waiting", &self.waiting) + .finish() + } +} + +/// A set of states. A state set makes up a single partition in Hopcroft's +/// algorithm. +/// +/// It is represented by an ordered set of state identifiers. We use shared +/// ownership so that a single state set can be in both the set of partitions +/// and in the set of waiting sets simultaneously without an additional +/// allocation. Generally, once a state set is built, it becomes immutable. +/// +/// We use this representation because it avoids the overhead of more +/// traditional set data structures (HashSet/BTreeSet), and also because +/// computing intersection/subtraction on this representation is especially +/// fast.
+#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +struct StateSet<S>(Rc<RefCell<Vec<S>>>); + +impl<'a, S: StateID> Minimizer<'a, S> { + pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> { + let in_transitions = Minimizer::incoming_transitions(dfa); + let partitions = Minimizer::initial_partitions(dfa); + let waiting = vec![partitions[0].clone()]; + + Minimizer { dfa, in_transitions, partitions, waiting } + } + + pub fn run(mut self) { + let mut incoming = StateSet::empty(); + let mut scratch1 = StateSet::empty(); + let mut scratch2 = StateSet::empty(); + let mut newparts = vec![]; + + while let Some(set) = self.waiting.pop() { + for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) { + self.find_incoming_to(b, &set, &mut incoming); + + for p in 0..self.partitions.len() { + self.partitions[p].intersection(&incoming, &mut scratch1); + if scratch1.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + self.partitions[p].subtract(&incoming, &mut scratch2); + if scratch2.is_empty() { + newparts.push(self.partitions[p].clone()); + continue; + } + + let (x, y) = + (scratch1.deep_clone(), scratch2.deep_clone()); + newparts.push(x.clone()); + newparts.push(y.clone()); + match self.find_waiting(&self.partitions[p]) { + Some(i) => { + self.waiting[i] = x; + self.waiting.push(y); + } + None => { + if x.len() <= y.len() { + self.waiting.push(x); + } else { + self.waiting.push(y); + } + } + } + } + newparts = mem::replace(&mut self.partitions, newparts); + newparts.clear(); + } + } + + // At this point, we now have a minimal partitioning of states, where + // each partition is an equivalence class of DFA states. Now we need to + // use this partitioning to update the DFA to only contain one state for + // each partition. + + // Create a map from DFA state ID to the representative ID of the + // equivalence class to which it belongs. The representative ID of an + // equivalence class of states is the minimum ID in that class.
+ let mut state_to_part = vec![dead_id(); self.dfa.state_count()]; + for p in &self.partitions { + p.iter(|id| state_to_part[id.to_usize()] = p.min()); + } + + // Generate a new contiguous sequence of IDs for minimal states, and + // create a map from equivalence IDs to the new IDs. Thus, the new + // minimal ID of *any* state in the unminimized DFA can be obtained + // with minimal_ids[state_to_part[old_id]]. + let mut minimal_ids = vec![dead_id(); self.dfa.state_count()]; + let mut new_id = S::from_usize(0); + for (id, _) in self.dfa.states() { + if state_to_part[id.to_usize()] == id { + minimal_ids[id.to_usize()] = new_id; + new_id = S::from_usize(new_id.to_usize() + 1); + } + } + // The total number of states in the minimal DFA. + let minimal_count = new_id.to_usize(); + + // Re-map this DFA in place such that the only states remaining + // correspond to the representative states of every equivalence class. + for id in (0..self.dfa.state_count()).map(S::from_usize) { + // If this state isn't a representative for an equivalence class, + // then we skip it since it won't appear in the minimal DFA. + if state_to_part[id.to_usize()] != id { + continue; + } + for (_, next) in self.dfa.get_state_mut(id).iter_mut() { + *next = minimal_ids[state_to_part[next.to_usize()].to_usize()]; + } + self.dfa.swap_states(id, minimal_ids[id.to_usize()]); + } + // Trim off all unused states from the pre-minimized DFA. This + // represents all states that were merged into a non-singleton + // equivalence class of states, and appeared after the first state + // in each such class. (Because the state with the smallest ID in each + // equivalence class is its representative ID.) + self.dfa.truncate_states(minimal_count); + + // Update the new start state, which is now just the minimal ID of + // whatever state the old start state was collapsed into.
+ let old_start = self.dfa.start_state(); + self.dfa.set_start_state( + minimal_ids[state_to_part[old_start.to_usize()].to_usize()], + ); + + // In order to update the ID of the maximum match state, we need to + // find the maximum ID among all of the match states in the minimized + // DFA. This is not necessarily the new ID of the unminimized maximum + // match state, since that could have been collapsed with a much + // earlier match state. Therefore, to find the new max match state, + // we iterate over all previous match states, find their corresponding + // new minimal ID, and take the maximum of those. + let old_max = self.dfa.max_match_state(); + self.dfa.set_max_match_state(dead_id()); + for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) { + let part = state_to_part[id.to_usize()]; + let new_id = minimal_ids[part.to_usize()]; + if new_id > self.dfa.max_match_state() { + self.dfa.set_max_match_state(new_id); + } + } + } + + fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> { + self.waiting.iter().position(|s| s == set) + } + + fn find_incoming_to( + &self, + b: u8, + set: &StateSet<S>, + incoming: &mut StateSet<S>, + ) { + incoming.clear(); + set.iter(|id| { + for &inid in &self.in_transitions[id.to_usize()][b as usize] { + incoming.add(inid); + } + }); + incoming.canonicalize(); + } + + fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> { + let mut is_match = StateSet::empty(); + let mut no_match = StateSet::empty(); + for (id, _) in dfa.states() { + if dfa.is_match_state(id) { + is_match.add(id); + } else { + no_match.add(id); + } + } + + let mut sets = vec![is_match]; + if !no_match.is_empty() { + sets.push(no_match); + } + sets.sort_by_key(|s| s.len()); + sets + } + + fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> { + let mut incoming = vec![]; + for _ in dfa.states() { + incoming.push(vec![vec![]; dfa.alphabet_len()]); + } + for (id, state) in dfa.states() { + for (b, next) in state.transitions() { + 
incoming[next.to_usize()][b as usize].push(id); + } + } + incoming + } +} + +impl<S: StateID> StateSet<S> { + fn empty() -> StateSet<S> { + StateSet(Rc::new(RefCell::new(vec![]))) + } + + fn add(&mut self, id: S) { + self.0.borrow_mut().push(id); + } + + fn min(&self) -> S { + self.0.borrow()[0] + } + + fn canonicalize(&mut self) { + self.0.borrow_mut().sort(); + self.0.borrow_mut().dedup(); + } + + fn clear(&mut self) { + self.0.borrow_mut().clear(); + } + + fn len(&self) -> usize { + self.0.borrow().len() + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } + + fn deep_clone(&self) -> StateSet<S> { + let ids = self.0.borrow().iter().cloned().collect(); + StateSet(Rc::new(RefCell::new(ids))) + } + + fn iter<F: FnMut(S)>(&self, mut f: F) { + for &id in self.0.borrow().iter() { + f(id); + } + } + + fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) { + dest.clear(); + if self.is_empty() || other.is_empty() { + return; + } + + let (seta, setb) = (self.0.borrow(), other.0.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => break, + Some(b) => b, + }; + } else if a < b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => break, + Some(b) => b, + }; + } + } + } + + fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) { + dest.clear(); + if self.is_empty() || other.is_empty() { + self.iter(|s| dest.add(s)); + return; + } + + let (seta, setb) = (self.0.borrow(), other.0.borrow()); + let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); + let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); + loop { + if a == b { + a = match ita.next() { + None => break, + Some(a) => a, + }; + b = match itb.next() { + None => { + 
dest.add(a); + break; + } + Some(b) => b, + }; + } else if a < b { + dest.add(a); + a = match ita.next() { + None => break, + Some(a) => a, + }; + } else { + b = match itb.next() { + None => { + dest.add(a); + break; + } + Some(b) => b, + }; + } + } + for a in ita { + dest.add(a); + } + } +} diff --git a/vendor/regex-automata/src/nfa/compiler.rs b/vendor/regex-automata/src/nfa/compiler.rs new file mode 100644 index 000000000..d9b3945b3 --- /dev/null +++ b/vendor/regex-automata/src/nfa/compiler.rs @@ -0,0 +1,1193 @@ +// This module provides an NFA compiler using Thompson's construction +// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA +// graph as output. The NFA graph is structured in a way that permits it to be +// executed by a virtual machine and also used to efficiently build a DFA. +// +// The compiler deals with a slightly expanded set of NFA states that notably +// includes an empty node that has exactly one epsilon transition to the next +// state. In other words, it's a "goto" instruction if one views Thompson's NFA +// as a set of bytecode instructions. These goto instructions are removed in +// a subsequent phase before returning the NFA to the caller. The purpose of +// these empty nodes is that they make the construction algorithm substantially +// simpler to implement. We remove them before returning to the caller because +// they can represent substantial overhead when traversing the NFA graph +// (either while searching using the NFA directly or while building a DFA). +// +// In the future, it would be nice to provide a Glushkov compiler as well, +// as it would work well as a bit-parallel NFA for smaller regexes. But +// the Thompson construction is one I'm more familiar with and seems more +// straight-forward to deal with when it comes to large Unicode character +// classes. +// +// Internally, the compiler uses interior mutability to improve composition +// in the face of the borrow checker. 
In particular, we'd really like to be +// able to write things like this: +// +// self.c_concat(exprs.iter().map(|e| self.c(e))) +// +// Which elegantly uses iterators to build up a sequence of compiled regex +// sub-expressions and then hands it off to the concatenating compiler +// routine. Without interior mutability, the borrow checker won't let us +// borrow `self` mutably both inside and outside the closure at the same +// time. + +use std::cell::RefCell; +use std::mem; + +use regex_syntax::hir::{self, Hir, HirKind}; +use regex_syntax::utf8::{Utf8Range, Utf8Sequences}; + +use classes::ByteClassSet; +use error::{Error, Result}; +use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap}; +use nfa::range_trie::RangeTrie; +use nfa::{State, StateID, Transition, NFA}; + +/// Config knobs for the NFA compiler. See the builder's methods for more +/// docs on each one. +#[derive(Clone, Copy, Debug)] +struct Config { + anchored: bool, + allow_invalid_utf8: bool, + reverse: bool, + shrink: bool, +} + +impl Default for Config { + fn default() -> Config { + Config { + anchored: false, + allow_invalid_utf8: false, + reverse: false, + shrink: true, + } + } +} + +/// A builder for compiling an NFA. +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, +} + +impl Builder { + /// Create a new NFA builder with its default configuration. + pub fn new() -> Builder { + Builder { config: Config::default() } + } + + /// Compile the given high level intermediate representation of a regular + /// expression into an NFA. + /// + /// If there was a problem building the NFA, then an error is returned. + /// For example, if the regex uses unsupported features (such as zero-width + /// assertions), then an error is returned. 
+ pub fn build(&self, expr: &Hir) -> Result<NFA> { + let mut nfa = NFA::always_match(); + self.build_with(&mut Compiler::new(), &mut nfa, expr)?; + Ok(nfa) + } + + /// Compile the given high level intermediate representation of a regular + /// expression into the NFA given using the given compiler. Callers may + /// prefer this over `build` if they would like to reuse allocations while + /// compiling many regular expressions. + /// + /// On success, the given NFA is completely overwritten with the NFA + /// produced by the compiler. + /// + /// If there was a problem building the NFA, then an error is returned. For + /// example, if the regex uses unsupported features (such as zero-width + /// assertions), then an error is returned. When an error is returned, + /// the contents of `nfa` are unspecified and should not be relied upon. + /// However, it can still be reused in subsequent calls to this method. + pub fn build_with( + &self, + compiler: &mut Compiler, + nfa: &mut NFA, + expr: &Hir, + ) -> Result<()> { + compiler.clear(); + compiler.configure(self.config); + compiler.compile(nfa, expr) + } + + /// Set whether matching must be anchored at the beginning of the input. + /// + /// When enabled, a match must begin at the start of the input. When + /// disabled, the NFA will act as if the pattern started with a `.*?`, + /// which enables a match to appear anywhere. + /// + /// By default this is disabled. + pub fn anchored(&mut self, yes: bool) -> &mut Builder { + self.config.anchored = yes; + self + } + + /// When enabled, the builder will permit the construction of an NFA that + /// may match invalid UTF-8. + /// + /// When disabled (the default), the builder is guaranteed to produce a + /// regex that will only ever match valid UTF-8 (otherwise, the builder + /// will return an error). + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { + self.config.allow_invalid_utf8 = yes; + self + } + + /// Reverse the NFA. 
+ /// + /// An NFA reversal is performed by reversing all of the concatenated + /// sub-expressions in the original pattern, recursively. The resulting + /// NFA can be used to match the pattern starting from the end of a string + /// instead of the beginning of a string. + /// + /// Reversing the NFA is useful for building a reverse DFA, which is most + /// useful for finding the start of a match. + pub fn reverse(&mut self, yes: bool) -> &mut Builder { + self.config.reverse = yes; + self + } + + /// Apply best effort heuristics to shrink the NFA at the expense of more + /// time/memory. + /// + /// This is enabled by default. Generally speaking, if one is using an NFA + /// to compile a DFA, then the extra time used to shrink the NFA will be + /// more than made up for during DFA construction (potentially by a lot). + /// In other words, enabling this can substantially decrease the overall + /// amount of time it takes to build a DFA. + /// + /// The only reason to disable this is if you want to compile an NFA and start + /// using it as quickly as possible without needing to build a DFA. + pub fn shrink(&mut self, yes: bool) -> &mut Builder { + self.config.shrink = yes; + self + } +} + +/// A compiler that converts a regex abstract syntax to an NFA via Thompson's +/// construction. Namely, this compiler permits epsilon transitions between +/// states. +/// +/// Users of this crate cannot use a compiler directly. Instead, all one can +/// do is create one and use it via the +/// [`Builder::build_with`](struct.Builder.html#method.build_with) +/// method. This permits callers to reuse compilers in order to amortize +/// allocations. +#[derive(Clone, Debug)] +pub struct Compiler { + /// The set of compiled NFA states. Once a state is compiled, it is + /// assigned a state ID equivalent to its index in this list. Subsequent + /// compilation can modify previous states by adding new transitions. + states: RefCell<Vec<CState>>, + /// The configuration from the builder.
+ config: Config, + /// State used for compiling character classes to UTF-8 byte automata. + /// State is not retained between character class compilations. This just + /// serves to amortize allocation to the extent possible. + utf8_state: RefCell<Utf8State>, + /// State used for arranging character classes in reverse into a trie. + trie_state: RefCell<RangeTrie>, + /// State used for caching common suffixes when compiling reverse UTF-8 + /// automata (for Unicode character classes). + utf8_suffix: RefCell<Utf8SuffixMap>, + /// A map used to re-map state IDs when translating the compiler's internal + /// NFA state representation to the external NFA representation. + remap: RefCell<Vec<StateID>>, + /// A set of compiler internal state IDs that correspond to states that are + /// exclusively epsilon transitions, i.e., goto instructions, combined with + /// the state that they point to. This is used to record said states while + /// transforming the compiler's internal NFA representation to the external + /// form. + empties: RefCell<Vec<(StateID, StateID)>>, +} + +/// A compiler intermediate state representation for an NFA that is only used +/// during compilation. Once compilation is done, `CState`s are converted to +/// `State`s, which have a much simpler representation. +#[derive(Clone, Debug, Eq, PartialEq)] +enum CState { + /// An empty state whose only purpose is to forward the automaton to + /// another state via an epsilon transition. These are useful during + /// compilation but are otherwise removed at the end. + Empty { next: StateID }, + /// A state that only transitions to `next` if the current input byte is + /// in the range `[start, end]` (inclusive on both ends). + Range { range: Transition }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. Transitions are ordered lexicographically by input range. + /// As such, this may only be used when every transition has equal + /// priority.
(In practice, this is only used for encoding large UTF-8 + /// automata.) + Sparse { ranges: Vec<Transition> }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. + Union { alternates: Vec<StateID> }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via later transitions + /// are preferred over earlier transitions. + /// + /// This "reverse" state exists for convenience during compilation that + /// permits easy construction of non-greedy combinations of NFA states. + /// At the end of compilation, Union and UnionReverse states are merged + /// into one Union type of state, where the latter has its epsilon + /// transitions reversed to reflect the priority inversion. + UnionReverse { alternates: Vec<StateID> }, + /// A match state. There is exactly one such occurrence of this state in + /// an NFA. + Match, +} + +/// A value that represents the result of compiling a sub-expression of a +/// regex's HIR. Specifically, this represents a sub-graph of the NFA that +/// has an initial state at `start` and a final state at `end`. +#[derive(Clone, Copy, Debug)] +pub struct ThompsonRef { + start: StateID, + end: StateID, +} + +impl Compiler { + /// Create a new compiler. + pub fn new() -> Compiler { + Compiler { + states: RefCell::new(vec![]), + config: Config::default(), + utf8_state: RefCell::new(Utf8State::new()), + trie_state: RefCell::new(RangeTrie::new()), + utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), + remap: RefCell::new(vec![]), + empties: RefCell::new(vec![]), + } + } + + /// Clear any memory used by this compiler such that it is ready to compile + /// a new regex. + /// + /// It is preferable to reuse a compiler if possible in order to reuse + /// allocations.
+ fn clear(&self) { + self.states.borrow_mut().clear(); + // We don't need to clear anything else since they are cleared on + // their own and only when they are used. + } + + /// Configure this compiler from the builder's knobs. + /// + /// The compiler is always reconfigured by the builder before using it to + /// build an NFA. + fn configure(&mut self, config: Config) { + self.config = config; + } + + /// Convert the current intermediate NFA to its final compiled form. + fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> { + nfa.anchored = self.config.anchored; + + let mut start = self.add_empty(); + if !nfa.anchored { + let compiled = if self.config.allow_invalid_utf8 { + self.c_unanchored_prefix_invalid_utf8()? + } else { + self.c_unanchored_prefix_valid_utf8()? + }; + self.patch(start, compiled.start); + start = compiled.end; + } + let compiled = self.c(&expr)?; + let match_id = self.add_match(); + self.patch(start, compiled.start); + self.patch(compiled.end, match_id); + self.finish(nfa); + Ok(()) + } + + /// Finishes the compilation process and populates the provided NFA with + /// the final graph. + fn finish(&self, nfa: &mut NFA) { + let mut bstates = self.states.borrow_mut(); + let mut remap = self.remap.borrow_mut(); + remap.resize(bstates.len(), 0); + let mut empties = self.empties.borrow_mut(); + empties.clear(); + + // We don't reuse allocations here because this is what we're + // returning. + nfa.states.clear(); + let mut byteset = ByteClassSet::new(); + + // The idea here is to convert our intermediate states to their final + // form. The only real complexity here is the process of converting + // transitions, which are expressed in terms of state IDs. The new + // set of states will be smaller because of partial epsilon removal, + // so the state IDs will not be the same.
+ for (id, bstate) in bstates.iter_mut().enumerate() { + match *bstate { + CState::Empty { next } => { + // Since we're removing empty states, we need to handle + // them later since we don't yet know which new state this + // empty state will be mapped to. + empties.push((id, next)); + } + CState::Range { ref range } => { + remap[id] = nfa.states.len(); + byteset.set_range(range.start, range.end); + nfa.states.push(State::Range { range: range.clone() }); + } + CState::Sparse { ref mut ranges } => { + remap[id] = nfa.states.len(); + + let ranges = mem::replace(ranges, vec![]); + for r in &ranges { + byteset.set_range(r.start, r.end); + } + nfa.states.push(State::Sparse { + ranges: ranges.into_boxed_slice(), + }); + } + CState::Union { ref mut alternates } => { + remap[id] = nfa.states.len(); + + let alternates = mem::replace(alternates, vec![]); + nfa.states.push(State::Union { + alternates: alternates.into_boxed_slice(), + }); + } + CState::UnionReverse { ref mut alternates } => { + remap[id] = nfa.states.len(); + + let mut alternates = mem::replace(alternates, vec![]); + alternates.reverse(); + nfa.states.push(State::Union { + alternates: alternates.into_boxed_slice(), + }); + } + CState::Match => { + remap[id] = nfa.states.len(); + nfa.states.push(State::Match); + } + } + } + for &(empty_id, mut empty_next) in empties.iter() { + // empty states can point to other empty states, forming a chain. + // So we must follow the chain until the end, which must end at + // a non-empty state, and therefore, a state that is correctly + // remapped. We are guaranteed to terminate because our compiler + // never builds a loop among empty states. + while let CState::Empty { next } = bstates[empty_next] { + empty_next = next; + } + remap[empty_id] = remap[empty_next]; + } + for state in &mut nfa.states { + state.remap(&remap); + } + // The compiler always begins the NFA at the first state. 
+ nfa.start = remap[0]; + nfa.byte_classes = byteset.byte_classes(); + } + + fn c(&self, expr: &Hir) -> Result<ThompsonRef> { + match *expr.kind() { + HirKind::Empty => { + let id = self.add_empty(); + Ok(ThompsonRef { start: id, end: id }) + } + HirKind::Literal(hir::Literal::Unicode(ch)) => { + let mut buf = [0; 4]; + let it = ch + .encode_utf8(&mut buf) + .as_bytes() + .iter() + .map(|&b| Ok(self.c_range(b, b))); + self.c_concat(it) + } + HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)), + HirKind::Class(hir::Class::Bytes(ref cls)) => { + self.c_byte_class(cls) + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + self.c_unicode_class(cls) + } + HirKind::Repetition(ref rep) => self.c_repetition(rep), + HirKind::Group(ref group) => self.c(&*group.hir), + HirKind::Concat(ref exprs) => { + self.c_concat(exprs.iter().map(|e| self.c(e))) + } + HirKind::Alternation(ref exprs) => { + self.c_alternation(exprs.iter().map(|e| self.c(e))) + } + HirKind::Anchor(_) => Err(Error::unsupported_anchor()), + HirKind::WordBoundary(_) => Err(Error::unsupported_word()), + } + } + + fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef> + where + I: DoubleEndedIterator<Item = Result<ThompsonRef>>, + { + let first = + if self.config.reverse { it.next_back() } else { it.next() }; + let ThompsonRef { start, mut end } = match first { + Some(result) => result?, + None => return Ok(self.c_empty()), + }; + loop { + let next = + if self.config.reverse { it.next_back() } else { it.next() }; + let compiled = match next { + Some(result) => result?, + None => break, + }; + self.patch(end, compiled.start); + end = compiled.end; + } + Ok(ThompsonRef { start, end }) + } + + fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef> + where + I: Iterator<Item = Result<ThompsonRef>>, + { + let first = it.next().expect("alternations must be non-empty")?; + let second = match it.next() { + None => return Ok(first), + Some(result) => result?, + }; + + let union = 
self.add_union(); + let end = self.add_empty(); + self.patch(union, first.start); + self.patch(first.end, end); + self.patch(union, second.start); + self.patch(second.end, end); + for result in it { + let compiled = result?; + self.patch(union, compiled.start); + self.patch(compiled.end, end); + } + Ok(ThompsonRef { start: union, end }) + } + + fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> { + match rep.kind { + hir::RepetitionKind::ZeroOrOne => { + self.c_zero_or_one(&rep.hir, rep.greedy) + } + hir::RepetitionKind::ZeroOrMore => { + self.c_at_least(&rep.hir, rep.greedy, 0) + } + hir::RepetitionKind::OneOrMore => { + self.c_at_least(&rep.hir, rep.greedy, 1) + } + hir::RepetitionKind::Range(ref rng) => match *rng { + hir::RepetitionRange::Exactly(count) => { + self.c_exactly(&rep.hir, count) + } + hir::RepetitionRange::AtLeast(m) => { + self.c_at_least(&rep.hir, rep.greedy, m) + } + hir::RepetitionRange::Bounded(min, max) => { + self.c_bounded(&rep.hir, rep.greedy, min, max) + } + }, + } + } + + fn c_bounded( + &self, + expr: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> Result<ThompsonRef> { + let prefix = self.c_exactly(expr, min)?; + if min == max { + return Ok(prefix); + } + + // It is tempting to compile the rest here as a concatenation + // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it + // were `aaa?a?a?`. The problem here is that it leads to this program: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: alt(03, 04) + // 000003: 61 => 04 + // 000004: alt(05, 06) + // 000005: 61 => 06 + // 000006: alt(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // And effectively, once you hit state 2, the epsilon closure will + // include states 3, 4, 5, 6, 7 and 8, which is quite a bit. 
It is + // better to instead compile it like so: + // + // >000000: 61 => 01 + // 000001: 61 => 02 + // 000002: alt(03, 08) + // 000003: 61 => 04 + // 000004: alt(05, 08) + // 000005: 61 => 06 + // 000006: alt(07, 08) + // 000007: 61 => 08 + // 000008: MATCH + // + // So that the epsilon closure of state 2 is now just 3 and 8. + let empty = self.add_empty(); + let mut prev_end = prefix.end; + for _ in min..max { + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }; + let compiled = self.c(expr)?; + self.patch(prev_end, union); + self.patch(union, compiled.start); + self.patch(union, empty); + prev_end = compiled.end; + } + self.patch(prev_end, empty); + Ok(ThompsonRef { start: prefix.start, end: empty }) + } + + fn c_at_least( + &self, + expr: &Hir, + greedy: bool, + n: u32, + ) -> Result<ThompsonRef> { + if n == 0 { + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }; + let compiled = self.c(expr)?; + self.patch(union, compiled.start); + self.patch(compiled.end, union); + Ok(ThompsonRef { start: union, end: union }) + } else if n == 1 { + let compiled = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }; + self.patch(compiled.end, union); + self.patch(union, compiled.start); + Ok(ThompsonRef { start: compiled.start, end: union }) + } else { + let prefix = self.c_exactly(expr, n - 1)?; + let last = self.c(expr)?; + let union = if greedy { + self.add_union() + } else { + self.add_reverse_union() + }; + self.patch(prefix.end, last.start); + self.patch(last.end, union); + self.patch(union, last.start); + Ok(ThompsonRef { start: prefix.start, end: union }) + } + } + + fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> { + let union = + if greedy { self.add_union() } else { self.add_reverse_union() }; + let compiled = self.c(expr)?; + let empty = self.add_empty(); + self.patch(union, compiled.start); + self.patch(union, empty); + 
self.patch(compiled.end, empty); + Ok(ThompsonRef { start: union, end: empty }) + } + + fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> { + let it = (0..n).map(|_| self.c(expr)); + self.c_concat(it) + } + + fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> { + let end = self.add_empty(); + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + trans.push(Transition { + start: r.start(), + end: r.end(), + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans), end }) + } + + fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> { + // If all we have are ASCII ranges wrapped in a Unicode package, then + // there is zero reason to bring out the big guns. We can fit all ASCII + // ranges within a single sparse transition. + if cls.is_all_ascii() { + let end = self.add_empty(); + let mut trans = Vec::with_capacity(cls.ranges().len()); + for r in cls.iter() { + assert!(r.start() <= '\x7F'); + assert!(r.end() <= '\x7F'); + trans.push(Transition { + start: r.start() as u8, + end: r.end() as u8, + next: end, + }); + } + Ok(ThompsonRef { start: self.add_sparse(trans), end }) + } else if self.config.reverse { + if !self.config.shrink { + // When we don't want to spend the extra time shrinking, we + // compile the UTF-8 automaton in reverse using something like + // the "naive" approach, but will attempt to re-use common + // suffixes. + self.c_unicode_class_reverse_with_suffix(cls) + } else { + // When we want to shrink our NFA for reverse UTF-8 automata, + // we cannot feed UTF-8 sequences directly to the UTF-8 + // compiler, since the UTF-8 compiler requires all sequences + // to be lexicographically sorted. Instead, we organize our + // sequences into a range trie, which can then output our + // sequences in the correct order. Unfortunately, building the + // range trie is fairly expensive (but not nearly as expensive + // as building a DFA). 
Hence the reason why the 'shrink' option + // exists, so that this path can be toggled off. + let mut trie = self.trie_state.borrow_mut(); + trie.clear(); + + for rng in cls.iter() { + for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { + seq.reverse(); + trie.insert(seq.as_slice()); + } + } + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); + trie.iter(|seq| { + utf8c.add(&seq); + }); + Ok(utf8c.finish()) + } + } else { + // In the forward direction, we always shrink our UTF-8 automata + // because we can stream it right into the UTF-8 compiler. There + // is almost no downside (in either memory or time) to using this + // approach. + let mut utf8_state = self.utf8_state.borrow_mut(); + let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state); + for rng in cls.iter() { + for seq in Utf8Sequences::new(rng.start(), rng.end()) { + utf8c.add(seq.as_slice()); + } + } + Ok(utf8c.finish()) + } + + // For reference, the code below is the "naive" version of compiling a + // UTF-8 automaton. It is deliciously simple (and works for both the + // forward and reverse cases), but will unfortunately produce very + // large NFAs. When compiling a forward automaton, the size difference + // can sometimes be an order of magnitude. For example, the '\w' regex + // will generate about ~3000 NFA states using the naive approach below, + // but only 283 states when using the approach above. This is because + // the approach above actually compiles a *minimal* (or near minimal, + // because of the bounded hashmap) UTF-8 automaton. + // + // The code below is kept as a reference point in order to make it + // easier to understand the higher level goal here. 
+ /* + let it = cls + .iter() + .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) + .map(|seq| { + let it = seq + .as_slice() + .iter() + .map(|rng| Ok(self.c_range(rng.start, rng.end))); + self.c_concat(it) + }); + self.c_alternation(it); + */ + } + + fn c_unicode_class_reverse_with_suffix( + &self, + cls: &hir::ClassUnicode, + ) -> Result<ThompsonRef> { + // N.B. It would likely be better to cache common *prefixes* in the + // reverse direction, but it's not quite clear how to do that. The + // advantage of caching suffixes is that it does give us a win, and + // has a very small additional overhead. + let mut cache = self.utf8_suffix.borrow_mut(); + cache.clear(); + + let union = self.add_union(); + let alt_end = self.add_empty(); + for urng in cls.iter() { + for seq in Utf8Sequences::new(urng.start(), urng.end()) { + let mut end = alt_end; + for brng in seq.as_slice() { + let key = Utf8SuffixKey { + from: end, + start: brng.start, + end: brng.end, + }; + let hash = cache.hash(&key); + if let Some(id) = cache.get(&key, hash) { + end = id; + continue; + } + + let compiled = self.c_range(brng.start, brng.end); + self.patch(compiled.end, end); + end = compiled.start; + cache.set(key, hash, end); + } + self.patch(union, end); + } + } + Ok(ThompsonRef { start: union, end: alt_end }) + } + + fn c_range(&self, start: u8, end: u8) -> ThompsonRef { + let id = self.add_range(start, end); + ThompsonRef { start: id, end: id } + } + + fn c_empty(&self) -> ThompsonRef { + let id = self.add_empty(); + ThompsonRef { start: id, end: id } + } + + fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> { + self.c(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: false, + hir: Box::new(Hir::any(false)), + })) + } + + fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> { + self.c(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: false, + hir: Box::new(Hir::any(true)), + })) + } + 
+ fn patch(&self, from: StateID, to: StateID) { + match self.states.borrow_mut()[from] { + CState::Empty { ref mut next } => { + *next = to; + } + CState::Range { ref mut range } => { + range.next = to; + } + CState::Sparse { .. } => { + panic!("cannot patch from a sparse NFA state") + } + CState::Union { ref mut alternates } => { + alternates.push(to); + } + CState::UnionReverse { ref mut alternates } => { + alternates.push(to); + } + CState::Match => {} + } + } + + fn add_empty(&self) -> StateID { + let id = self.states.borrow().len(); + self.states.borrow_mut().push(CState::Empty { next: 0 }); + id + } + + fn add_range(&self, start: u8, end: u8) -> StateID { + let id = self.states.borrow().len(); + let trans = Transition { start, end, next: 0 }; + let state = CState::Range { range: trans }; + self.states.borrow_mut().push(state); + id + } + + fn add_sparse(&self, ranges: Vec<Transition>) -> StateID { + if ranges.len() == 1 { + let id = self.states.borrow().len(); + let state = CState::Range { range: ranges[0] }; + self.states.borrow_mut().push(state); + return id; + } + let id = self.states.borrow().len(); + let state = CState::Sparse { ranges }; + self.states.borrow_mut().push(state); + id + } + + fn add_union(&self) -> StateID { + let id = self.states.borrow().len(); + let state = CState::Union { alternates: vec![] }; + self.states.borrow_mut().push(state); + id + } + + fn add_reverse_union(&self) -> StateID { + let id = self.states.borrow().len(); + let state = CState::UnionReverse { alternates: vec![] }; + self.states.borrow_mut().push(state); + id + } + + fn add_match(&self) -> StateID { + let id = self.states.borrow().len(); + self.states.borrow_mut().push(CState::Match); + id + } +} + +#[derive(Debug)] +struct Utf8Compiler<'a> { + nfac: &'a Compiler, + state: &'a mut Utf8State, + target: StateID, +} + +#[derive(Clone, Debug)] +struct Utf8State { + compiled: Utf8BoundedMap, + uncompiled: Vec<Utf8Node>, +} + +#[derive(Clone, Debug)] +struct Utf8Node { + 
trans: Vec<Transition>, + last: Option<Utf8LastTransition>, +} + +#[derive(Clone, Debug)] +struct Utf8LastTransition { + start: u8, + end: u8, +} + +impl Utf8State { + fn new() -> Utf8State { + Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] } + } + + fn clear(&mut self) { + self.compiled.clear(); + self.uncompiled.clear(); + } +} + +impl<'a> Utf8Compiler<'a> { + fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> { + let target = nfac.add_empty(); + state.clear(); + let mut utf8c = Utf8Compiler { nfac, state, target }; + utf8c.add_empty(); + utf8c + } + + fn finish(&mut self) -> ThompsonRef { + self.compile_from(0); + let node = self.pop_root(); + let start = self.compile(node); + ThompsonRef { start, end: self.target } + } + + fn add(&mut self, ranges: &[Utf8Range]) { + let prefix_len = ranges + .iter() + .zip(&self.state.uncompiled) + .take_while(|&(range, node)| { + node.last.as_ref().map_or(false, |t| { + (t.start, t.end) == (range.start, range.end) + }) + }) + .count(); + assert!(prefix_len < ranges.len()); + self.compile_from(prefix_len); + self.add_suffix(&ranges[prefix_len..]); + } + + fn compile_from(&mut self, from: usize) { + let mut next = self.target; + while from + 1 < self.state.uncompiled.len() { + let node = self.pop_freeze(next); + next = self.compile(node); + } + self.top_last_freeze(next); + } + + fn compile(&mut self, node: Vec<Transition>) -> StateID { + let hash = self.state.compiled.hash(&node); + if let Some(id) = self.state.compiled.get(&node, hash) { + return id; + } + let id = self.nfac.add_sparse(node.clone()); + self.state.compiled.set(node, hash, id); + id + } + + fn add_suffix(&mut self, ranges: &[Utf8Range]) { + assert!(!ranges.is_empty()); + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + assert!(self.state.uncompiled[last].last.is_none()); + self.state.uncompiled[last].last = Some(Utf8LastTransition { + start: ranges[0].start, + end: 
ranges[0].end, + }); + for r in &ranges[1..] { + self.state.uncompiled.push(Utf8Node { + trans: vec![], + last: Some(Utf8LastTransition { start: r.start, end: r.end }), + }); + } + } + + fn add_empty(&mut self) { + self.state.uncompiled.push(Utf8Node { trans: vec![], last: None }); + } + + fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> { + let mut uncompiled = self.state.uncompiled.pop().unwrap(); + uncompiled.set_last_transition(next); + uncompiled.trans + } + + fn pop_root(&mut self) -> Vec<Transition> { + assert_eq!(self.state.uncompiled.len(), 1); + assert!(self.state.uncompiled[0].last.is_none()); + self.state.uncompiled.pop().expect("non-empty nodes").trans + } + + fn top_last_freeze(&mut self, next: StateID) { + let last = self + .state + .uncompiled + .len() + .checked_sub(1) + .expect("non-empty nodes"); + self.state.uncompiled[last].set_last_transition(next); + } +} + +impl Utf8Node { + fn set_last_transition(&mut self, next: StateID) { + if let Some(last) = self.last.take() { + self.trans.push(Transition { + start: last.start, + end: last.end, + next, + }); + } + } +} + +#[cfg(test)] +mod tests { + use regex_syntax::hir::Hir; + use regex_syntax::ParserBuilder; + + use super::{Builder, State, StateID, Transition, NFA}; + + fn parse(pattern: &str) -> Hir { + ParserBuilder::new().build().parse(pattern).unwrap() + } + + fn build(pattern: &str) -> NFA { + Builder::new().anchored(true).build(&parse(pattern)).unwrap() + } + + fn s_byte(byte: u8, next: StateID) -> State { + let trans = Transition { start: byte, end: byte, next }; + State::Range { range: trans } + } + + fn s_range(start: u8, end: u8, next: StateID) -> State { + let trans = Transition { start, end, next }; + State::Range { range: trans } + } + + fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State { + let ranges = ranges + .iter() + .map(|&(start, end, next)| Transition { start, end, next }) + .collect(); + State::Sparse { ranges } + } + + fn s_union(alts: &[StateID]) -> State { + 
State::Union { alternates: alts.to_vec().into_boxed_slice() } + } + + fn s_match() -> State { + State::Match + } + + #[test] + fn errors() { + // unsupported anchors + assert!(Builder::new().build(&parse(r"^")).is_err()); + assert!(Builder::new().build(&parse(r"$")).is_err()); + assert!(Builder::new().build(&parse(r"\A")).is_err()); + assert!(Builder::new().build(&parse(r"\z")).is_err()); + + // unsupported word boundaries + assert!(Builder::new().build(&parse(r"\b")).is_err()); + assert!(Builder::new().build(&parse(r"\B")).is_err()); + assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err()); + } + + // Test that building an unanchored NFA has an appropriate `.*?` prefix. + #[test] + fn compile_unanchored_prefix() { + // When the machine can only match valid UTF-8. + let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap(); + // There should be many states since the `.` in `.*?` matches any + // Unicode scalar value. + assert_eq!(11, nfa.len()); + assert_eq!(nfa.states[10], s_match()); + assert_eq!(nfa.states[9], s_byte(b'a', 10)); + + // When the machine can match invalid UTF-8. + let nfa = Builder::new() + .anchored(false) + .allow_invalid_utf8(true) + .build(&parse(r"a")) + .unwrap(); + assert_eq!( + nfa.states, + &[ + s_union(&[2, 1]), + s_range(0, 255, 0), + s_byte(b'a', 3), + s_match(), + ] + ); + } + + #[test] + fn compile_empty() { + assert_eq!(build("").states, &[s_match(),]); + } + + #[test] + fn compile_literal() { + assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]); + assert_eq!( + build("ab").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] + ); + assert_eq!( + build("☃").states, + &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),] + ); + + // Check that non-UTF-8 literals work. 
+ let hir = ParserBuilder::new() + .allow_invalid_utf8(true) + .build() + .parse(r"(?-u)\xFF") + .unwrap(); + let nfa = Builder::new() + .anchored(true) + .allow_invalid_utf8(true) + .build(&hir) + .unwrap(); + assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]); + } + + #[test] + fn compile_class() { + assert_eq!( + build(r"[a-z]").states, + &[s_range(b'a', b'z', 1), s_match(),] + ); + assert_eq!( + build(r"[x-za-c]").states, + &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()] + ); + assert_eq!( + build(r"[\u03B1-\u03B4]").states, + &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()] + ); + assert_eq!( + build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, + &[ + s_range(0xB1, 0xB4, 5), + s_range(0x99, 0x9E, 5), + s_byte(0xA4, 1), + s_byte(0x9F, 2), + s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), + s_match(), + ] + ); + assert_eq!( + build(r"[a-z☃]").states, + &[ + s_byte(0x83, 3), + s_byte(0x98, 0), + s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), + s_match(), + ] + ); + } + + #[test] + fn compile_repetition() { + assert_eq!( + build(r"a?").states, + &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),] + ); + assert_eq!( + build(r"a??").states, + &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),] + ); + } + + #[test] + fn compile_group() { + assert_eq!( + build(r"ab+").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),] + ); + assert_eq!( + build(r"(ab)").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),] + ); + assert_eq!( + build(r"(ab)+").states, + &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),] + ); + } + + #[test] + fn compile_alternation() { + assert_eq!( + build(r"a|b").states, + &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),] + ); + assert_eq!( + build(r"|b").states, + &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),] + ); + assert_eq!( + build(r"a|").states, + &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),] + ); + } +} diff --git a/vendor/regex-automata/src/nfa/map.rs 
b/vendor/regex-automata/src/nfa/map.rs new file mode 100644 index 000000000..e636c0dd3 --- /dev/null +++ b/vendor/regex-automata/src/nfa/map.rs @@ -0,0 +1,282 @@ +// This module contains a couple simple and purpose built hash maps. The key +// trade off they make is that they serve as caches rather than true maps. That +// is, inserting a new entry may cause eviction of another entry. This gives +// us two things. First, there's less overhead associated with inserts and +// lookups. Secondly, it lets us control our memory usage. +// +// These maps are used in some fairly hot code when generating NFA states for +// large Unicode character classes. +// +// Instead of exposing a rich hashmap entry API, we just permit the caller +// to produce a hash of the key directly. The hash can then be reused for both +// lookups and insertions at the cost of leaking things a bit. But these are +// for internal use only, so it's fine. +// +// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a +// (almost) minimal DFA for large Unicode character classes in linear time. +// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse +// NFAs, it's only used when the compiler is configured to 'shrink' the NFA, +// since there's a bit more expense in the reverse direction.) +// +// The Utf8SuffixMap is used when compiling large Unicode character classes for +// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive +// construction of UTF-8 automata by caching common suffixes. This doesn't +// get the same space savings as Daciuk's algorithm, but it's basically as +// fast as the naive approach and typically winds up using less memory (since +// it generates smaller NFAs) despite the presence of the cache. +// +// These maps effectively represent caching mechanisms for CState::Sparse and +// CState::Range, respectively. 
The former represents a single NFA state with +// many transitions of equivalent priority while the latter represents a single +// NFA state with a single transition. (Neither state ever has or is an +// epsilon transition.) Thus, they have different key types. It's likely we +// could make one generic map, but the machinery didn't seem worth it. They +// are simple enough. + +use nfa::{StateID, Transition}; + +// Basic FNV-1a hash constants as described in: +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +const PRIME: u64 = 1099511628211; +const INIT: u64 = 14695981039346656037; + +/// A bounded hash map where the key is a sequence of NFA transitions and the +/// value is a pre-existing NFA state ID. +/// +/// std's hashmap can be used for this, however, this map has two important +/// advantages. Firstly, it has lower overhead. Secondly, it permits us to +/// control our memory usage by limiting the number of slots. In general, the +/// cost here is that this map acts as a cache. That is, inserting a new entry +/// may remove an old entry. We are okay with this, since it does not impact +/// correctness in the cases where it is used. The only effect that dropping +/// states from the cache has is that the resulting NFA generated may be bigger +/// than it otherwise would be. +/// +/// This improves benchmarks that compile large Unicode character classes, +/// since it makes the generation of (almost) minimal UTF-8 automata faster. +/// Specifically, one could observe the difference with std's hashmap via +/// something like the following benchmark: +/// +/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'" +/// +/// But to observe that difference, you'd have to modify the code to use +/// std's hashmap. +/// +/// It is quite possible that there is a better way to approach this problem. 
+/// For example, if there happens to be a very common state that collides with +/// a lot of less frequent states, then we could wind up with very poor caching +/// behavior. Alas, the effectiveness of this cache has not been measured. +/// Instead, ad hoc experiments suggest that it is "good enough." Additional +/// smarts (such as an LRU eviction policy) have to be weighed against the +/// amount of extra time they cost. +#[derive(Clone, Debug)] +pub struct Utf8BoundedMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8BoundedEntry>, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8BoundedEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which is a sorted sequence of non-overlapping NFA transitions. + key: Vec<Transition>, + /// The state ID corresponding to the state containing the transitions in + /// this entry. + val: StateID, +} + +impl Utf8BoundedMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. 
+ pub fn new(capacity: usize) -> Utf8BoundedMap { + assert!(capacity > 0); + Utf8BoundedMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + if self.version == 0 { + self.map = vec![Utf8BoundedEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transitions. + pub fn hash(&self, key: &[Transition]) -> usize { + let mut h = INIT; + for t in key { + h = (h ^ (t.start as u64)).wrapping_mul(PRIME); + h = (h ^ (t.end as u64)).wrapping_mul(PRIME); + h = (h ^ (t.next as u64)).wrapping_mul(PRIME); + } + (h as usize) % self.map.len() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given transitions, then None is + /// returned. + pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + // There may be a hash collision, so we need to confirm real equality. + if entry.key != key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transitions given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. 
+ pub fn set( + &mut self, + key: Vec<Transition>, + hash: usize, + state_id: StateID, + ) { + self.map[hash] = + Utf8BoundedEntry { version: self.version, key, val: state_id }; + } +} + +/// A cache of suffixes used to modestly compress UTF-8 automata for large +/// Unicode character classes. +#[derive(Clone, Debug)] +pub struct Utf8SuffixMap { + /// The current version of this map. Only entries with matching versions + /// are considered during lookups. If an entry is found with a mismatched + /// version, then the map behaves as if the entry does not exist. + version: u16, + /// The total number of entries this map can store. + capacity: usize, + /// The actual entries, keyed by hash. Collisions between different states + /// result in the old state being dropped. + map: Vec<Utf8SuffixEntry>, +} + +/// A key that uniquely identifies an NFA state. It is a triple that represents +/// a transition from one state for a particular byte range. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Utf8SuffixKey { + pub from: StateID, + pub start: u8, + pub end: u8, +} + +/// An entry in this map. +#[derive(Clone, Debug, Default)] +struct Utf8SuffixEntry { + /// The version of the map used to produce this entry. If this entry's + /// version does not match the current version of the map, then the map + /// should behave as if this entry does not exist. + version: u16, + /// The key, which consists of a transition in a particular state. + key: Utf8SuffixKey, + /// The identifier that the transition in the key maps to. + val: StateID, +} + +impl Utf8SuffixMap { + /// Create a new bounded map with the given capacity. The map will never + /// grow beyond the given size. + /// + /// Note that this does not allocate. Instead, callers must call `clear` + /// before using this map. `clear` will allocate space if necessary. + /// + /// This avoids the need to pay for the allocation of this map when + /// compiling regexes that lack large Unicode character classes. 
+ pub fn new(capacity: usize) -> Utf8SuffixMap { + assert!(capacity > 0); + Utf8SuffixMap { version: 0, capacity, map: vec![] } + } + + /// Clear this map of all entries, but permit the reuse of allocation + /// if possible. + /// + /// This must be called before the map can be used. + pub fn clear(&mut self) { + if self.map.is_empty() { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } else { + self.version = self.version.wrapping_add(1); + if self.version == 0 { + self.map = vec![Utf8SuffixEntry::default(); self.capacity]; + } + } + } + + /// Return a hash of the given transition. + /// + /// The hash is reduced modulo the current table length, so calling this + /// before `clear` has allocated the table panics on the zero modulus. + pub fn hash(&self, key: &Utf8SuffixKey) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + const PRIME: u64 = 1099511628211; + const INIT: u64 = 14695981039346656037; + + let mut h = INIT; + h = (h ^ (key.from as u64)).wrapping_mul(PRIME); + h = (h ^ (key.start as u64)).wrapping_mul(PRIME); + h = (h ^ (key.end as u64)).wrapping_mul(PRIME); + (h as usize) % self.map.len() + } + + /// Retrieve the cached state ID corresponding to the given key. The hash + /// given must have been computed with `hash` using the same key value. + /// + /// If there is no cached state with the given key, then None is returned. + pub fn get( + &mut self, + key: &Utf8SuffixKey, + hash: usize, + ) -> Option<StateID> { + let entry = &self.map[hash]; + if entry.version != self.version { + return None; + } + if key != &entry.key { + return None; + } + Some(entry.val) + } + + /// Add a cached state to this map with the given key. Callers should + /// ensure that `state_id` points to a state that contains precisely the + /// NFA transition given. + /// + /// `hash` must have been computed using the `hash` method with the same + /// key. 
+ pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) { + self.map[hash] = + Utf8SuffixEntry { version: self.version, key, val: state_id }; + } +} diff --git a/vendor/regex-automata/src/nfa/mod.rs b/vendor/regex-automata/src/nfa/mod.rs new file mode 100644 index 000000000..02d0501de --- /dev/null +++ b/vendor/regex-automata/src/nfa/mod.rs @@ -0,0 +1,252 @@ +use std::fmt; + +use classes::ByteClasses; +pub use nfa::compiler::Builder; + +mod compiler; +mod map; +mod range_trie; + +/// The representation for an NFA state identifier. +pub type StateID = usize; + +/// A final compiled NFA. +/// +/// The states of the NFA are indexed by state IDs, which are how transitions +/// are expressed. +#[derive(Clone)] +pub struct NFA { + /// Whether this NFA can only match at the beginning of input or not. + /// + /// When true, a match should only be reported if it begins at the 0th + /// index of the haystack. + anchored: bool, + /// The starting state of this NFA. + start: StateID, + /// The state list. This list is guaranteed to be indexable by the starting + /// state ID, and it is also guaranteed to contain exactly one `Match` + /// state. + states: Vec<State>, + /// A mapping from any byte value to its corresponding equivalence class + /// identifier. Two bytes in the same equivalence class cannot discriminate + /// between a match and a non-match. This map can be used to shrink the + /// total size of a DFA's transition table with a small match-time cost. + /// + /// Note that the NFA's transitions are *not* defined in terms of these + /// equivalence classes. The NFA's transitions are defined on the original + /// byte values. For the most part, this is because they wouldn't really + /// help the NFA much since the NFA already uses a sparse representation + /// to represent transitions. Byte classes are most effective in a dense + /// representation. 
+ byte_classes: ByteClasses, +} + +impl NFA { + /// Returns an NFA that always matches at every position. + pub fn always_match() -> NFA { + NFA { + anchored: false, + start: 0, + states: vec![State::Match], + byte_classes: ByteClasses::empty(), + } + } + + /// Returns an NFA that never matches at any position. + pub fn never_match() -> NFA { + NFA { + anchored: false, + start: 0, + states: vec![State::Fail], + byte_classes: ByteClasses::empty(), + } + } + + /// Returns true if and only if this NFA is anchored. + pub fn is_anchored(&self) -> bool { + self.anchored + } + + /// Return the number of states in this NFA. + pub fn len(&self) -> usize { + self.states.len() + } + + /// Return the ID of the initial state of this NFA. + pub fn start(&self) -> StateID { + self.start + } + + /// Return the NFA state corresponding to the given ID. + /// + /// The state list is indexed directly, so an ID that is out of bounds + /// panics. + pub fn state(&self, id: StateID) -> &State { + &self.states[id] + } + + /// Return the set of equivalence classes for this NFA. The slice returned + /// always has length 256 and maps each possible byte value to its + /// corresponding equivalence class ID (which is never more than 255). + pub fn byte_classes(&self) -> &ByteClasses { + &self.byte_classes + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for (i, state) in self.states.iter().enumerate() { + let status = if i == self.start { '>' } else { ' ' }; + writeln!(f, "{}{:06}: {:?}", status, i, state)?; + } + Ok(()) + } +} + +/// A state in a final compiled NFA. +#[derive(Clone, Eq, PartialEq)] +pub enum State { + /// A state that transitions to `next` if and only if the current input + /// byte is in the range `[start, end]` (inclusive). + /// + /// This is a special case of Sparse in that it encodes only one transition + /// (and therefore avoids the allocation). + Range { range: Transition }, + /// A state with possibly many transitions, represented in a sparse + /// fashion. 
Transitions are ordered lexicographically by input range. + /// As such, this may only be used when every transition has equal + /// priority. (In practice, this is only used for encoding large UTF-8 + /// automata.) + Sparse { ranges: Box<[Transition]> }, + /// An alternation such that there exists an epsilon transition to all + /// states in `alternates`, where matches found via earlier transitions + /// are preferred over later transitions. + Union { alternates: Box<[StateID]> }, + /// A fail state. When encountered, the automaton is guaranteed to never + /// reach a match state. + Fail, + /// A match state. There is exactly one such occurrence of this state in + /// an NFA. + Match, +} + +/// A transition to another state, only if the given byte falls in the +/// inclusive range specified. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Transition { + /// The first byte of the inclusive range. + pub start: u8, + /// The last byte of the inclusive range. + pub end: u8, + /// The state to move to when the input byte falls in the range. + pub next: StateID, +} + +impl State { + /// Returns true if and only if this state contains one or more epsilon + /// transitions. + pub fn is_epsilon(&self) -> bool { + match *self { + State::Range { .. } + | State::Sparse { .. } + | State::Fail + | State::Match => false, + State::Union { .. } => true, + } + } + + /// Remap the transitions in this state using the given map. Namely, the + /// given map should be indexed according to the transitions currently + /// in this state. + /// + /// This is used during the final phase of the NFA compiler, which turns + /// its intermediate NFA into the final NFA. 
+ fn remap(&mut self, remap: &[StateID]) { + match *self { + State::Range { ref mut range } => range.next = remap[range.next], + State::Sparse { ref mut ranges } => { + for r in ranges.iter_mut() { + r.next = remap[r.next]; + } + } + State::Union { ref mut alternates } => { + for alt in alternates.iter_mut() { + *alt = remap[*alt]; + } + } + State::Fail => {} + State::Match => {} + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + State::Range { ref range } => range.fmt(f), + State::Sparse { ref ranges } => { + let rs = ranges + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "sparse({})", rs) + } + State::Union { ref alternates } => { + let alts = alternates + .iter() + .map(|id| format!("{}", id)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "alt({})", alts) + } + State::Fail => write!(f, "FAIL"), + State::Match => write!(f, "MATCH"), + } + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let Transition { start, end, next } = *self; + if self.start == self.end { + write!(f, "{} => {}", escape(start), next) + } else { + write!(f, "{}-{} => {}", escape(start), escape(end), next) + } + } +} + +/// Return the given byte as its escaped string form. +/// +/// The escaping itself is performed by `std::ascii::escape_default`. 
+fn escape(b: u8) -> String { + use std::ascii; + + String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + use dense; + use dfa::DFA; + + #[test] + fn always_match() { + let nfa = NFA::always_match(); + let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap(); + + assert_eq!(Some(0), dfa.find_at(b"", 0)); + assert_eq!(Some(0), dfa.find_at(b"a", 0)); + assert_eq!(Some(1), dfa.find_at(b"a", 1)); + assert_eq!(Some(0), dfa.find_at(b"ab", 0)); + assert_eq!(Some(1), dfa.find_at(b"ab", 1)); + assert_eq!(Some(2), dfa.find_at(b"ab", 2)); + } + + #[test] + fn never_match() { + let nfa = NFA::never_match(); + let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap(); + + assert_eq!(None, dfa.find_at(b"", 0)); + assert_eq!(None, dfa.find_at(b"a", 0)); + assert_eq!(None, dfa.find_at(b"a", 1)); + assert_eq!(None, dfa.find_at(b"ab", 0)); + assert_eq!(None, dfa.find_at(b"ab", 1)); + assert_eq!(None, dfa.find_at(b"ab", 2)); + } +} diff --git a/vendor/regex-automata/src/nfa/range_trie.rs b/vendor/regex-automata/src/nfa/range_trie.rs new file mode 100644 index 000000000..50767c7c6 --- /dev/null +++ b/vendor/regex-automata/src/nfa/range_trie.rs @@ -0,0 +1,1048 @@ +// I've called the primary data structure in this module a "range trie." As far +// as I can tell, there is no prior art on a data structure like this; however, +// it's likely someone somewhere has built something like it. Searching for +// "range trie" turns up the paper "Range Tries for Scalable Address Lookup," +// but it does not appear relevant. +// +// The range trie is just like a trie in that it is a special case of a +// deterministic finite state machine. It has states and each state has a set +// of transitions to other states. It is acyclic, and, like a normal trie, +// it makes no attempt to reuse common suffixes among its elements. 
The key +// difference between a normal trie and a range trie below is that a range trie +// operates on *contiguous sequences* of bytes instead of singleton bytes. +// One could say that our alphabet is ranges of bytes instead of bytes +// themselves, except a key part of range trie construction is splitting ranges +// apart to ensure there is at most one transition that can be taken for any +// byte in a given state. +// +// I've tried to explain the details of how the range trie works below, so +// for now, we are left with trying to understand what problem we're trying to +// solve. Which is itself fairly involved! +// +// At the highest level, here's what we want to do. We want to convert a +// sequence of Unicode codepoints into a finite state machine whose transitions +// are over *bytes* and *not* Unicode codepoints. We want this because it makes +// said finite state machines much smaller and much faster to execute. As a +// simple example, consider a byte oriented automaton for all Unicode scalar +// values (0x00 through 0x10FFFF, not including surrogate codepoints): +// +// [00-7F] +// [C2-DF][80-BF] +// [E0-E0][A0-BF][80-BF] +// [E1-EC][80-BF][80-BF] +// [ED-ED][80-9F][80-BF] +// [EE-EF][80-BF][80-BF] +// [F0-F0][90-BF][80-BF][80-BF] +// [F1-F3][80-BF][80-BF][80-BF] +// [F4-F4][80-8F][80-BF][80-BF] +// +// (These byte ranges are generated via the regex-syntax::utf8 module, which +// was based on Russ Cox's code in RE2, which was in turn based on Ken +// Thompson's implementation of the same idea in his Plan9 implementation of +// grep.) +// +// It should be fairly straight-forward to see how one could compile this into +// a DFA. The sequences are sorted and non-overlapping. Essentially, you could +// build a trie from this fairly easily. The problem comes when your initial +// range (in this case, 0x00-0x10FFFF) isn't so nice. 
For example, the class +// represented by '\w' contains only a tenth of the codepoints that +// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges +// as we did above, the list would stretch to 892 entries! This turns into +// quite a large NFA with a few thousand states. Turning this beast into a DFA +// takes quite a bit of time. We are thus left with trying to trim down the +// number of states we produce as early as possible. +// +// One approach (used by RE2 and still by the regex crate, at time of writing) +// is to try to find common suffixes while building NFA states for the above +// and reuse them. This is very cheap to do and one can control precisely how +// much extra memory you want to use for the cache. +// +// Another approach, however, is to reuse an algorithm for constructing a +// *minimal* DFA from a sorted sequence of inputs. I don't want to go into +// the full details here, but I explain it in more depth in my blog post on +// FSTs[1]. Note that the algorithm was not invented by me, but was published +// in a paper by Daciuk et al. in 2000 called "Incremental Construction of +// Minimal Acyclic Finite-State Automata."[2] Like the suffix cache approach above, +// it is also possible to control the amount of extra memory one uses, although +// this usually comes with the cost of sacrificing true minimality. (But it's +// typically close enough with a reasonably sized cache of states.) +// +// The catch is that Daciuk's algorithm only works if you add your keys in +// lexicographic ascending order. In our case, since we're dealing with ranges, +// we also need the additional requirement that ranges are either equivalent +// or do not overlap at all. For example, if one were given the following byte +// ranges: +// +// [BC-BF][80-BF] +// [BC-BF][90-BF] +// +// Then Daciuk's algorithm also would not work, since there is nothing to +// handle the fact that the ranges overlap. They would need to be split apart. 
+// Thankfully, Thompson's algorithm for producing byte ranges for Unicode +// codepoint ranges meets both of our requirements. +// +// ... however, we would also like to be able to compile UTF-8 automata in +// reverse. We want this because in order to find the starting location of a +// match using a DFA, we need to run a second DFA---a reversed version of the +// forward DFA---backwards to discover the match location. Unfortunately, if +// we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that +// can overlap, even if they are sorted: +// +// [00-7F] +// [80-BF][80-9F][ED-ED] +// [80-BF][80-BF][80-8F][F4-F4] +// [80-BF][80-BF][80-BF][F1-F3] +// [80-BF][80-BF][90-BF][F0-F0] +// [80-BF][80-BF][E1-EC] +// [80-BF][80-BF][EE-EF] +// [80-BF][A0-BF][E0-E0] +// [80-BF][C2-DF] +// +// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have +// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no +// simple way to apply Daciuk's algorithm. +// +// And thus, the range trie was born. The range trie's only purpose is to take +// sequences of byte ranges like the ones above, collect them into a trie and +// then spit them out in a sorted fashion with no overlapping ranges. For example, +// 0x00-0x10FFFF gets translated to: +// +// [0-7F] +// [80-BF][80-9F][80-8F][F1-F3] +// [80-BF][80-9F][80-8F][F4] +// [80-BF][80-9F][90-BF][F0] +// [80-BF][80-9F][90-BF][F1-F3] +// [80-BF][80-9F][E1-EC] +// [80-BF][80-9F][ED] +// [80-BF][80-9F][EE-EF] +// [80-BF][A0-BF][80-8F][F1-F3] +// [80-BF][A0-BF][80-8F][F4] +// [80-BF][A0-BF][90-BF][F0] +// [80-BF][A0-BF][90-BF][F1-F3] +// [80-BF][A0-BF][E0] +// [80-BF][A0-BF][E1-EC] +// [80-BF][A0-BF][EE-EF] +// [80-BF][C2-DF] +// +// We've thus satisfied our requirements for running Daciuk's algorithm. All +// sequences of ranges are sorted, and any corresponding ranges are either +// exactly equivalent or non-overlapping. 
+// +// In effect, a range trie is building a DFA from a sequence of arbitrary +// byte ranges. But it uses an algorithm custom tailored to its input, so it +// is not as costly as traditional DFA construction. While it is still quite +// a bit more costly than the forward's case (which only needs Daciuk's +// algorithm), it winds up saving a substantial amount of time if one is doing +// a full DFA powerset construction later by virtue of producing a much much +// smaller NFA. +// +// [1] - https://blog.burntsushi.net/transducers/ +// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 + +use std::cell::RefCell; +use std::fmt; +use std::mem; +use std::ops::RangeInclusive; +use std::u32; + +use regex_syntax::utf8::Utf8Range; + +/// A smaller state ID means more effective use of the CPU cache and less +/// time spent copying. The implementation below will panic if the state ID +/// space is exhausted, but in order for that to happen, the range trie itself +/// would use well over 100GB of memory. Moreover, it's likely impossible +/// for the state ID space to get that big. In fact, it's likely that even a +/// u16 would be good enough here. But it's not quite clear how to prove this. +type StateID = u32; + +/// There is only one final state in this trie. Every sequence of byte ranges +/// added shares the same final state. +const FINAL: StateID = 0; + +/// The root state of the trie. +const ROOT: StateID = 1; + +/// A range trie represents an ordered set of sequences of bytes. +/// +/// A range trie accepts as input a sequence of byte ranges and merges +/// them into the existing set such that the trie can produce a sorted +/// non-overlapping sequence of byte ranges. The sequence emitted corresponds +/// precisely to the sequence of bytes matched by the given keys, although the +/// byte ranges themselves may be split at different boundaries. +/// +/// The order complexity of this data structure seems difficult to analyze. 
+/// If the size of a byte is held as a constant, then insertion is clearly +/// O(n) where n is the number of byte ranges in the input key. However, if +/// k=256 is our alphabet size, then insertion could be O(k^2 * n). In +/// particular it seems possible for pathological inputs to cause insertion +/// to do a lot of work. However, for what we use this data structure for, +/// there should be no pathological inputs since the ultimate source is always +/// a sorted set of Unicode scalar value ranges. +/// +/// Internally, this trie is set up like a finite state machine. Note though +/// that it is acyclic. +#[derive(Clone)] +pub struct RangeTrie { + /// The states in this trie. The first is always the shared final state. + /// The second is always the root state. Otherwise, there is no + /// particular order. + states: Vec<State>, + /// A free-list of states. When a range trie is cleared, all of its states + /// are added to this list. Creating a new state reuses states from this list + /// before allocating a new one. + free: Vec<State>, + /// A stack for traversing this trie to yield sequences of byte ranges in + /// lexicographic order. + iter_stack: RefCell<Vec<NextIter>>, + /// A buffer that stores the current sequence during iteration. + iter_ranges: RefCell<Vec<Utf8Range>>, + /// A stack used for traversing the trie in order to (deeply) duplicate + /// a state. + dupe_stack: Vec<NextDupe>, + /// A stack used for traversing the trie during insertion of a new + /// sequence of byte ranges. + insert_stack: Vec<NextInsert>, +} + +/// A single state in this trie. +#[derive(Clone)] +struct State { + /// A sorted sequence of non-overlapping transitions to other states. Each + /// transition corresponds to a single range of bytes. + transitions: Vec<Transition>, +} + +/// A transition is a single range of bytes. If a particular byte is in this +/// range, then the corresponding machine may transition to the state pointed +/// to by `next_id`. 
+#[derive(Clone)] +struct Transition { + /// The byte range. + range: Utf8Range, + /// The next state to transition to. + next_id: StateID, +} + +impl RangeTrie { + /// Create a new empty range trie. + pub fn new() -> RangeTrie { + let mut trie = RangeTrie { + states: vec![], + free: vec![], + iter_stack: RefCell::new(vec![]), + iter_ranges: RefCell::new(vec![]), + dupe_stack: vec![], + insert_stack: vec![], + }; + trie.clear(); + trie + } + + /// Clear this range trie such that it is empty. Clearing a range trie + /// and reusing it can be beneficial because this may reuse allocations. + pub fn clear(&mut self) { + self.free.extend(self.states.drain(..)); + self.add_empty(); // final + self.add_empty(); // root + } + + /// Iterate over all of the sequences of byte ranges in this trie, and + /// call the provided function for each sequence. Iteration occurs in + /// lexicographic order. + pub fn iter<F: FnMut(&[Utf8Range])>(&self, mut f: F) { + let mut stack = self.iter_stack.borrow_mut(); + stack.clear(); + let mut ranges = self.iter_ranges.borrow_mut(); + ranges.clear(); + + // We do iteration in a way that permits us to use a single buffer + // for our keys. We iterate in a depth first fashion, while being + // careful to expand our frontier as we move deeper in the trie. + stack.push(NextIter { state_id: ROOT, tidx: 0 }); + while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() { + // This could be implemented more simply without an inner loop + // here, but at the cost of more stack pushes. + loop { + let state = self.state(state_id); + // If we've visited all transitions in this state, then pop + // back to the parent state. + if tidx >= state.transitions.len() { + ranges.pop(); + break; + } + + let t = &state.transitions[tidx]; + ranges.push(t.range); + if t.next_id == FINAL { + f(&ranges); + ranges.pop(); + tidx += 1; + } else { + // Expand our frontier. Once we come back to this state + // via the stack, start in on the next transition. 
+ stack.push(NextIter { state_id, tidx: tidx + 1 }); + // Otherwise, move to the first transition of the next + // state. + state_id = t.next_id; + tidx = 0; + } + } + } + } + + /// Inserts a new sequence of ranges into this trie. + /// + /// The sequence given must be non-empty and must not have a length + /// exceeding 4. + /// + /// # Panics + /// + /// Panics if `ranges` is empty or contains more than 4 ranges. + pub fn insert(&mut self, ranges: &[Utf8Range]) { + assert!(!ranges.is_empty()); + assert!(ranges.len() <= 4); + + let mut stack = mem::replace(&mut self.insert_stack, vec![]); + stack.clear(); + + stack.push(NextInsert::new(ROOT, ranges)); + while let Some(next) = stack.pop() { + let (state_id, ranges) = (next.state_id(), next.ranges()); + assert!(!ranges.is_empty()); + + let (mut new, rest) = (ranges[0], &ranges[1..]); + + // i corresponds to the position of the existing transition on + // which we are operating. Typically, the result is to remove the + // transition and replace it with two or more new transitions + // corresponding to the partitions generated by splitting the + // 'new' with the ith transition's range. + let mut i = self.state(state_id).find(new); + + // In this case, there is no overlap *and* the new range is greater + // than all existing ranges. So we can just add it to the end. + if i == self.state(state_id).transitions.len() { + let next_id = NextInsert::push(self, &mut stack, rest); + self.add_transition(state_id, new, next_id); + continue; + } + + // The need for this loop is a bit subtle, but basically, after + // we've handled the partitions from our initial split, it's + // possible that there will be a partition leftover that overlaps + // with a subsequent transition. If so, then we have to repeat + // the split process again with the leftovers and that subsequent + // transition. 
+ 'OUTER: loop { + let old = self.state(state_id).transitions[i].clone(); + let split = match Split::new(old.range, new) { + Some(split) => split, + None => { + let next_id = NextInsert::push(self, &mut stack, rest); + self.add_transition_at(i, state_id, new, next_id); + continue; + } + }; + let splits = split.as_slice(); + // If we only have one partition, then the ranges must be + // equivalent. There's nothing to do here for this state, so + // just move on to the next one. + if splits.len() == 1 { + // ... but only if we have anything left to do. + if !rest.is_empty() { + stack.push(NextInsert::new(old.next_id, rest)); + } + break; + } + // At this point, we know that 'split' is non-empty and there + // must be some overlap AND that the two ranges are not + // equivalent. Therefore, the existing range MUST be removed + // and split up somehow. Instead of actually doing the removal + // and then a subsequent insertion---with all the memory + // shuffling that entails---we simply overwrite the transition + // at position `i` for the first new transition we want to + // insert. After that, we're forced to do expensive inserts. + let mut first = true; + let mut add_trans = + |trie: &mut RangeTrie, pos, from, range, to| { + if first { + trie.set_transition_at(pos, from, range, to); + first = false; + } else { + trie.add_transition_at(pos, from, range, to); + } + }; + for (j, &srange) in splits.iter().enumerate() { + match srange { + SplitRange::Old(r) => { + // Deep clone the state pointed to by the ith + // transition. This is always necessary since 'old' + // is always coupled with at least a 'both' + // partition. We don't want any new changes made + // via the 'both' partition to impact the part of + // the transition that doesn't overlap with the + // new range. 
+ let dup_id = self.duplicate(old.next_id); + add_trans(self, i, state_id, r, dup_id); + } + SplitRange::New(r) => { + // This is a bit subtle, but if this happens to be + // the last partition in our split, it is possible + // that this overlaps with a subsequent transition. + // If it does, then we must repeat the whole + // splitting process over again with `r` and the + // subsequent transition. + { + let trans = &self.state(state_id).transitions; + if j + 1 == splits.len() + && i < trans.len() + && intersects(r, trans[i].range) + { + new = r; + continue 'OUTER; + } + } + + // ... otherwise, setup exploration for a new + // empty state and add a brand new transition for + // this new range. + let next_id = + NextInsert::push(self, &mut stack, rest); + add_trans(self, i, state_id, r, next_id); + } + SplitRange::Both(r) => { + // Continue adding the remaining ranges on this + // path and update the transition with the new + // range. + if !rest.is_empty() { + stack.push(NextInsert::new(old.next_id, rest)); + } + add_trans(self, i, state_id, r, old.next_id); + } + } + i += 1; + } + // If we've reached this point, then we know that there are + // no subsequent transitions with any overlap. Therefore, we + // can stop processing this range and move on to the next one. + break; + } + } + self.insert_stack = stack; + } + + /// Add a new state with no transitions and return its ID. A state from + /// the free list is reused when one is available. + /// + /// # Panics + /// + /// Panics if the state ID space is exhausted. + pub fn add_empty(&mut self) -> StateID { + if self.states.len() as u64 > u32::MAX as u64 { + // This generally should not happen since a range trie is only + // ever used to compile a single sequence of Unicode scalar values. + // If we ever got to this point, we would, at *minimum*, be using + // 96GB in just the range trie alone. 
+ panic!("too many sequences added to range trie"); + } + let id = self.states.len() as StateID; + // If we have some free states available, then use them to avoid + // more allocations. 
+ if let Some(mut state) = self.free.pop() { + state.clear(); + self.states.push(state); + } else { + self.states.push(State { transitions: vec![] }); + } + id + } + + /// Performs a deep clone of the given state and returns the duplicate's + /// state ID. + /// + /// A "deep clone" in this context means that the state given along with + /// recursively all states that it points to are copied. Once complete, + /// the given state ID and the returned state ID share nothing. + /// + /// This is useful during range trie insertion when a new range overlaps + /// with an existing range that is bigger than the new one. The part of + /// the existing range that does *not* overlap with the new one is then + /// duplicated so that adding the new range to the overlap doesn't disturb + /// the non-overlapping portion. + /// + /// There's one exception: if old_id is the final state, then it is not + /// duplicated and the same final state is returned. This is because all + /// final states in this trie are equivalent. + fn duplicate(&mut self, old_id: StateID) -> StateID { + if old_id == FINAL { + return FINAL; + } + + let mut stack = mem::replace(&mut self.dupe_stack, vec![]); + stack.clear(); + + let new_id = self.add_empty(); + // old_id is the state we're cloning and new_id is the ID of the + // duplicated state for old_id. + stack.push(NextDupe { old_id, new_id }); + while let Some(NextDupe { old_id, new_id }) = stack.pop() { + for i in 0..self.state(old_id).transitions.len() { + let t = self.state(old_id).transitions[i].clone(); + if t.next_id == FINAL { + // All final states are the same, so there's no need to + // duplicate it. + self.add_transition(new_id, t.range, FINAL); + continue; + } + + let new_child_id = self.add_empty(); + self.add_transition(new_id, t.range, new_child_id); + stack.push(NextDupe { + old_id: t.next_id, + new_id: new_child_id, + }); + } + } + self.dupe_stack = stack; + new_id + } + + /// Adds the given transition to the given state. 
+ /// + /// Callers must ensure that all previous transitions in this state + /// are lexicographically smaller than the given range. + fn add_transition( + &mut self, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id) + .transitions + .push(Transition { range, next_id }); + } + + /// Like `add_transition`, except this inserts the transition just before + /// the ith transition. + fn add_transition_at( + &mut self, + i: usize, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id) + .transitions + .insert(i, Transition { range, next_id }); + } + + /// Overwrites the transition at position i with the given transition. + /// + /// Position `i` must already hold a transition; the direct indexing + /// panics otherwise. + fn set_transition_at( + &mut self, + i: usize, + from_id: StateID, + range: Utf8Range, + next_id: StateID, + ) { + self.state_mut(from_id).transitions[i] = Transition { range, next_id }; + } + + /// Return an immutable borrow for the state with the given ID. + fn state(&self, id: StateID) -> &State { + &self.states[id as usize] + } + + /// Return a mutable borrow for the state with the given ID. + fn state_mut(&mut self, id: StateID) -> &mut State { + &mut self.states[id as usize] + } +} + +impl State { + /// Find the position at which the given range should be inserted in this + /// state. + /// + /// The position returned is always in the inclusive range + /// [0, transitions.len()]. If 'transitions.len()' is returned, then the + /// given range overlaps with no other range in this state *and* is greater + /// than all of them. + /// + /// For all other possible positions, the given range either overlaps + /// with the transition at that position or is otherwise less than it + /// with no overlap (and is greater than the previous transition). In the + /// former case, careful attention must be paid to inserting this range + /// as a new transition. 
In the latter case, the range can be inserted as + /// a new transition at the given position without disrupting any other + /// transitions. + fn find(&self, range: Utf8Range) -> usize { + /// Returns the position `i` at which `pred(xs[i])` first returns true + /// such that for all `j >= i`, `pred(xs[j]) == true`. If `pred` never + /// returns true, then `xs.len()` is returned. + /// + /// We roll our own binary search because it doesn't seem like the + /// standard library's binary search can be used here. Namely, if + /// there is an overlapping range, then we want to find the first such + /// occurrence, but there may be many. Or at least, it's not quite + /// clear to me how to do it. + fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize + where + F: FnMut(&T) -> bool, + { + let (mut left, mut right) = (0, xs.len()); + while left < right { + // Overflow is impossible because xs.len() <= 256. + let mid = (left + right) / 2; + if pred(&xs[mid]) { + right = mid; + } else { + left = mid + 1; + } + } + left + } + + // Benchmarks suggest that binary search is just a bit faster than + // straight linear search. Specifically when using the debug tool: + // + // hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'" + // + // The predicate first holds at the first transition whose end is not + // below `range.start`, i.e., the first one that overlaps with or follows + // the given range. + binary_search(&self.transitions, |t| range.start <= t.range.end) + } + + /// Clear this state such that it has zero transitions. + fn clear(&mut self) { + self.transitions.clear(); + } +} + +/// The next state to process during duplication. +#[derive(Clone, Debug)] +struct NextDupe { + /// The state we want to duplicate. + old_id: StateID, + /// The ID of the new state that is a duplicate of old_id. + new_id: StateID, +} + +/// The next state (and its corresponding transition) that we want to visit +/// during iteration in lexicographic order. 
+#[derive(Clone, Debug)] +struct NextIter { + state_id: StateID, + tidx: usize, +} + +/// The next state to process during insertion and any remaining ranges that we +/// want to add for a partcular sequence of ranges. The first such instance +/// is always the root state along with all ranges given. +#[derive(Clone, Debug)] +struct NextInsert { + /// The next state to begin inserting ranges. This state should be the + /// state at which `ranges[0]` should be inserted. + state_id: StateID, + /// The ranges to insert. We used a fixed-size array here to avoid an + /// allocation. + ranges: [Utf8Range; 4], + /// The number of valid ranges in the above array. + len: u8, +} + +impl NextInsert { + /// Create the next item to visit. The given state ID should correspond + /// to the state at which the first range in the given slice should be + /// inserted. The slice given must not be empty and it must be no longer + /// than 4. + fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert { + let len = ranges.len(); + assert!(len > 0); + assert!(len <= 4); + + let mut tmp = [Utf8Range { start: 0, end: 0 }; 4]; + tmp[..len].copy_from_slice(ranges); + NextInsert { state_id, ranges: tmp, len: len as u8 } + } + + /// Push a new empty state to visit along with any remaining ranges that + /// still need to be inserted. The ID of the new empty state is returned. + /// + /// If ranges is empty, then no new state is created and FINAL is returned. + fn push( + trie: &mut RangeTrie, + stack: &mut Vec<NextInsert>, + ranges: &[Utf8Range], + ) -> StateID { + if ranges.is_empty() { + FINAL + } else { + let next_id = trie.add_empty(); + stack.push(NextInsert::new(next_id, ranges)); + next_id + } + } + + /// Return the ID of the state to visit. + fn state_id(&self) -> StateID { + self.state_id + } + + /// Return the remaining ranges to insert. 
+ fn ranges(&self) -> &[Utf8Range] { + &self.ranges[..self.len as usize] + } +} + +/// Split represents a partitioning of two ranges into one or more ranges. This +/// is the secret sauce that makes a range trie work, as it's what tells us +/// how to deal with two overlapping but unequal ranges during insertion. +/// +/// Essentially, either two ranges overlap or they don't. If they don't, then +/// handling insertion is easy: just insert the new range into its +/// lexicographically correct position. Since it does not overlap with anything +/// else, no other transitions are impacted by the new range. +/// +/// If they do overlap though, there are generally three possible cases to +/// handle: +/// +/// 1. The part where the two ranges actually overlap. i.e., The intersection. +/// 2. The part of the existing range that is not in the the new range. +/// 3. The part of the new range that is not in the old range. +/// +/// (1) is guaranteed to always occur since all overlapping ranges have a +/// non-empty intersection. If the two ranges are not equivalent, then at +/// least one of (2) or (3) is guaranteed to occur as well. In some cases, +/// e.g., `[0-4]` and `[4-9]`, all three cases will occur. +/// +/// This `Split` type is responsible for providing (1), (2) and (3) for any +/// possible pair of byte ranges. +/// +/// As for insertion, for the overlap in (1), the remaining ranges to insert +/// should be added by following the corresponding transition. However, this +/// should only be done for the overlapping parts of the range. If there was +/// a part of the existing range that was not in the new range, then that +/// existing part must be split off from the transition and duplicated. The +/// remaining parts of the overlap can then be added to using the new ranges +/// without disturbing the existing range. +/// +/// Handling the case for the part of a new range that is not in an existing +/// range is seemingly easy. 
Just treat it as if it were a non-overlapping +/// range. The problem here is that if this new non-overlapping range occurs +/// after both (1) and (2), then it's possible that it can overlap with the +/// next transition in the current state. If it does, then the whole process +/// must be repeated! +/// +/// # Details of the 3 cases +/// +/// The following details the various cases that are implemented in code +/// below. It's plausible that the number of cases is not actually minimal, +/// but it's important for this code to remain at least somewhat readable. +/// +/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we define +/// the follow distinct relationships where at least one must apply. The order +/// of these matters, since multiple can match. The first to match applies. +/// +/// 1. b < x <=> [a,b] < [x,y] +/// 2. y < a <=> [x,y] < [a,b] +/// +/// In the case of (1) and (2), these are the only cases where there is no +/// overlap. Or otherwise, the intersection of [a,b] and [x,y] is empty. In +/// order to compute the intersection, one can do [max(a,x), min(b,y)]. The +/// intersection in all of the following cases is non-empty. +/// +/// 3. a = x && b = y <=> [a,b] == [x,y] +/// 4. a = x && b < y <=> [x,y] right-extends [a,b] +/// 5. b = y && a > x <=> [x,y] left-extends [a,b] +/// 6. x = a && y < b <=> [a,b] right-extends [x,y] +/// 7. y = b && x > a <=> [a,b] left-extends [x,y] +/// 8. a > x && b < y <=> [x,y] covers [a,b] +/// 9. x > a && y < b <=> [a,b] covers [x,y] +/// 10. b = x && a < y <=> [a,b] is left-adjacent to [x,y] +/// 11. y = a && x < b <=> [x,y] is left-adjacent to [a,b] +/// 12. b > x && b < y <=> [a,b] left-overlaps [x,y] +/// 13. y > a && y < b <=> [x,y] left-overlaps [a,b] +/// +/// In cases 3-13, we can form rules that partition the ranges into a +/// non-overlapping ordered sequence of ranges: +/// +/// 3. [a,b] +/// 4. [a,b], [b+1,y] +/// 5. [x,a-1], [a,b] +/// 6. [x,y], [y+1,b] +/// 7. [a,x-1], [x,y] +/// 8. 
[x,a-1], [a,b], [b+1,y] +/// 9. [a,x-1], [x,y], [y+1,b] +/// 10. [a,b-1], [b,b], [b+1,y] +/// 11. [x,y-1], [y,y], [y+1,b] +/// 12. [a,x-1], [x,b], [b+1,y] +/// 13. [x,a-1], [a,y], [y+1,b] +/// +/// In the code below, we go a step further and identify each of the above +/// outputs as belonging either to the overlap of the two ranges or to one +/// of [a,b] or [x,y] exclusively. +#[derive(Clone, Debug, Eq, PartialEq)] +struct Split { + partitions: [SplitRange; 3], + len: usize, +} + +/// A tagged range indicating how it was derived from a pair of ranges. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum SplitRange { + Old(Utf8Range), + New(Utf8Range), + Both(Utf8Range), +} + +impl Split { + /// Create a partitioning of the given ranges. + /// + /// If the given ranges have an empty intersection, then None is returned. + fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> { + let range = |r: RangeInclusive<u8>| Utf8Range { + start: *r.start(), + end: *r.end(), + }; + let old = |r| SplitRange::Old(range(r)); + let new = |r| SplitRange::New(range(r)); + let both = |r| SplitRange::Both(range(r)); + + // Use same names as the comment above to make it easier to compare. 
+ let (a, b, x, y) = (o.start, o.end, n.start, n.end); + + if b < x || y < a { + // case 1, case 2 + None + } else if a == x && b == y { + // case 3 + Some(Split::parts1(both(a..=b))) + } else if a == x && b < y { + // case 4 + Some(Split::parts2(both(a..=b), new(b + 1..=y))) + } else if b == y && a > x { + // case 5 + Some(Split::parts2(new(x..=a - 1), both(a..=b))) + } else if x == a && y < b { + // case 6 + Some(Split::parts2(both(x..=y), old(y + 1..=b))) + } else if y == b && x > a { + // case 7 + Some(Split::parts2(old(a..=x - 1), both(x..=y))) + } else if a > x && b < y { + // case 8 + Some(Split::parts3(new(x..=a - 1), both(a..=b), new(b + 1..=y))) + } else if x > a && y < b { + // case 9 + Some(Split::parts3(old(a..=x - 1), both(x..=y), old(y + 1..=b))) + } else if b == x && a < y { + // case 10 + Some(Split::parts3(old(a..=b - 1), both(b..=b), new(b + 1..=y))) + } else if y == a && x < b { + // case 11 + Some(Split::parts3(new(x..=y - 1), both(y..=y), old(y + 1..=b))) + } else if b > x && b < y { + // case 12 + Some(Split::parts3(old(a..=x - 1), both(x..=b), new(b + 1..=y))) + } else if y > a && y < b { + // case 13 + Some(Split::parts3(new(x..=a - 1), both(a..=y), old(y + 1..=b))) + } else { + unreachable!() + } + } + + /// Create a new split with a single partition. This only occurs when two + /// ranges are equivalent. + fn parts1(r1: SplitRange) -> Split { + // This value doesn't matter since it is never accessed. + let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 }); + Split { partitions: [r1, nada, nada], len: 1 } + } + + /// Create a new split with two partitions. + fn parts2(r1: SplitRange, r2: SplitRange) -> Split { + // This value doesn't matter since it is never accessed. + let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 }); + Split { partitions: [r1, r2, nada], len: 2 } + } + + /// Create a new split with three partitions. 
+ fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split { + Split { partitions: [r1, r2, r3], len: 3 } + } + + /// Return the partitions in this split as a slice. + fn as_slice(&self) -> &[SplitRange] { + &self.partitions[..self.len] + } +} + +impl fmt::Debug for RangeTrie { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "")?; + for (i, state) in self.states.iter().enumerate() { + let status = if i == FINAL as usize { '*' } else { ' ' }; + writeln!(f, "{}{:06}: {:?}", status, i, state)?; + } + Ok(()) + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let rs = self + .transitions + .iter() + .map(|t| format!("{:?}", t)) + .collect::<Vec<String>>() + .join(", "); + write!(f, "{}", rs) + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.range.start == self.range.end { + write!(f, "{:02X} => {:02X}", self.range.start, self.next_id) + } else { + write!( + f, + "{:02X}-{:02X} => {:02X}", + self.range.start, self.range.end, self.next_id + ) + } + } +} + +/// Returns true if and only if the given ranges intersect. 
+fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool { + !(r1.end < r2.start || r2.end < r1.start) +} + +#[cfg(test)] +mod tests { + use std::ops::RangeInclusive; + + use regex_syntax::utf8::Utf8Range; + + use super::*; + + fn r(range: RangeInclusive<u8>) -> Utf8Range { + Utf8Range { start: *range.start(), end: *range.end() } + } + + fn split_maybe( + old: RangeInclusive<u8>, + new: RangeInclusive<u8>, + ) -> Option<Split> { + Split::new(r(old), r(new)) + } + + fn split( + old: RangeInclusive<u8>, + new: RangeInclusive<u8>, + ) -> Vec<SplitRange> { + split_maybe(old, new).unwrap().as_slice().to_vec() + } + + #[test] + fn no_splits() { + // case 1 + assert_eq!(None, split_maybe(0..=1, 2..=3)); + // case 2 + assert_eq!(None, split_maybe(2..=3, 0..=1)); + } + + #[test] + fn splits() { + let range = |r: RangeInclusive<u8>| Utf8Range { + start: *r.start(), + end: *r.end(), + }; + let old = |r| SplitRange::Old(range(r)); + let new = |r| SplitRange::New(range(r)); + let both = |r| SplitRange::Both(range(r)); + + // case 3 + assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]); + assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]); + + // case 4 + assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]); + assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]); + assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]); + + // case 5 + assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]); + assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]); + assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]); + + // case 6 + assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]); + assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]); + assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]); + + // case 7 + assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]); + assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]); + assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]); + + // case 
8 + assert_eq!( + split(3..=6, 2..=7), + vec![new(2..=2), both(3..=6), new(7..=7)], + ); + assert_eq!( + split(3..=6, 1..=8), + vec![new(1..=2), both(3..=6), new(7..=8)], + ); + + // case 9 + assert_eq!( + split(2..=7, 3..=6), + vec![old(2..=2), both(3..=6), old(7..=7)], + ); + assert_eq!( + split(1..=8, 3..=6), + vec![old(1..=2), both(3..=6), old(7..=8)], + ); + + // case 10 + assert_eq!( + split(3..=6, 6..=7), + vec![old(3..=5), both(6..=6), new(7..=7)], + ); + assert_eq!( + split(3..=6, 6..=8), + vec![old(3..=5), both(6..=6), new(7..=8)], + ); + assert_eq!( + split(5..=6, 6..=7), + vec![old(5..=5), both(6..=6), new(7..=7)], + ); + + // case 11 + assert_eq!( + split(6..=7, 3..=6), + vec![new(3..=5), both(6..=6), old(7..=7)], + ); + assert_eq!( + split(6..=8, 3..=6), + vec![new(3..=5), both(6..=6), old(7..=8)], + ); + assert_eq!( + split(6..=7, 5..=6), + vec![new(5..=5), both(6..=6), old(7..=7)], + ); + + // case 12 + assert_eq!( + split(3..=7, 5..=9), + vec![old(3..=4), both(5..=7), new(8..=9)], + ); + assert_eq!( + split(3..=5, 4..=6), + vec![old(3..=3), both(4..=5), new(6..=6)], + ); + + // case 13 + assert_eq!( + split(5..=9, 3..=7), + vec![new(3..=4), both(5..=7), old(8..=9)], + ); + assert_eq!( + split(4..=6, 3..=5), + vec![new(3..=3), both(4..=5), old(6..=6)], + ); + } + + // Arguably there should be more tests here, but in practice, this data + // structure is well covered by the huge number of regex tests. +} diff --git a/vendor/regex-automata/src/regex.rs b/vendor/regex-automata/src/regex.rs new file mode 100644 index 000000000..47e1c5819 --- /dev/null +++ b/vendor/regex-automata/src/regex.rs @@ -0,0 +1,771 @@ +#[cfg(feature = "std")] +use dense::{self, DenseDFA}; +use dfa::DFA; +#[cfg(feature = "std")] +use error::Result; +#[cfg(feature = "std")] +use sparse::SparseDFA; +#[cfg(feature = "std")] +use state_id::StateID; + +/// A regular expression that uses deterministic finite automata for fast +/// searching. 
+/// +/// A regular expression is comprised of two DFAs, a "forward" DFA and a +/// "reverse" DFA. The forward DFA is responsible for detecting the end of a +/// match while the reverse DFA is responsible for detecting the start of a +/// match. Thus, in order to find the bounds of any given match, a forward +/// search must first be run followed by a reverse search. A match found by +/// the forward DFA guarantees that the reverse DFA will also find a match. +/// +/// The type of the DFA used by a `Regex` corresponds to the `D` type +/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, +/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a +/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but +/// search faster, while sparse DFAs use less memory but search more slowly. +/// +/// By default, a regex's DFA type parameter is set to +/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the +/// most convenient type that gives the best search performance. +/// +/// # Sparse DFAs +/// +/// Since a `Regex` is generic over the `DFA` trait, it can be used with any +/// kind of DFA. While this crate constructs dense DFAs by default, it is easy +/// enough to build corresponding sparse DFAs, and then build a regex from +/// them: +/// +/// ``` +/// use regex_automata::Regex; +/// +/// # fn example() -> Result<(), regex_automata::Error> { +/// // First, build a regex that uses dense DFAs. +/// let dense_re = Regex::new("foo[0-9]+")?; +/// +/// // Second, build sparse DFAs from the forward and reverse dense DFAs. +/// let fwd = dense_re.forward().to_sparse()?; +/// let rev = dense_re.reverse().to_sparse()?; +/// +/// // Third, build a new regex from the constituent sparse DFAs. +/// let sparse_re = Regex::from_dfas(fwd, rev); +/// +/// // A regex that uses sparse DFAs can be used just like with dense DFAs. 
+/// assert_eq!(true, sparse_re.is_match(b"foo123")); +/// # Ok(()) }; example().unwrap() +/// ``` +#[cfg(feature = "std")] +#[derive(Clone, Debug)] +pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> { + forward: D, + reverse: D, +} + +/// A regular expression that uses deterministic finite automata for fast +/// searching. +/// +/// A regular expression is comprised of two DFAs, a "forward" DFA and a +/// "reverse" DFA. The forward DFA is responsible for detecting the end of a +/// match while the reverse DFA is responsible for detecting the start of a +/// match. Thus, in order to find the bounds of any given match, a forward +/// search must first be run followed by a reverse search. A match found by +/// the forward DFA guarantees that the reverse DFA will also find a match. +/// +/// The type of the DFA used by a `Regex` corresponds to the `D` type +/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, +/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a +/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but +/// search faster, while sparse DFAs use less memory but search more slowly. +/// +/// When using this crate without the standard library, the `Regex` type has +/// no default type parameter. +/// +/// # Sparse DFAs +/// +/// Since a `Regex` is generic over the `DFA` trait, it can be used with any +/// kind of DFA. While this crate constructs dense DFAs by default, it is easy +/// enough to build corresponding sparse DFAs, and then build a regex from +/// them: +/// +/// ``` +/// use regex_automata::Regex; +/// +/// # fn example() -> Result<(), regex_automata::Error> { +/// // First, build a regex that uses dense DFAs. +/// let dense_re = Regex::new("foo[0-9]+")?; +/// +/// // Second, build sparse DFAs from the forward and reverse dense DFAs. 
+/// let fwd = dense_re.forward().to_sparse()?; +/// let rev = dense_re.reverse().to_sparse()?; +/// +/// // Third, build a new regex from the constituent sparse DFAs. +/// let sparse_re = Regex::from_dfas(fwd, rev); +/// +/// // A regex that uses sparse DFAs can be used just like with dense DFAs. +/// assert_eq!(true, sparse_re.is_match(b"foo123")); +/// # Ok(()) }; example().unwrap() +/// ``` +#[cfg(not(feature = "std"))] +#[derive(Clone, Debug)] +pub struct Regex<D> { + forward: D, + reverse: D, +} + +#[cfg(feature = "std")] +impl Regex { + /// Parse the given regular expression using a default configuration and + /// return the corresponding regex. + /// + /// The default configuration uses `usize` for state IDs, premultiplies + /// them and reduces the alphabet size by splitting bytes into equivalence + /// classes. The underlying DFAs are *not* minimized. + /// + /// If you want a non-default configuration, then use the + /// [`RegexBuilder`](struct.RegexBuilder.html) + /// to set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn new(pattern: &str) -> Result<Regex> { + RegexBuilder::new().build(pattern) + } +} + +#[cfg(feature = "std")] +impl Regex<SparseDFA<Vec<u8>, usize>> { + /// Parse the given regular expression using a default configuration and + /// return the corresponding regex using sparse DFAs. + /// + /// The default configuration uses `usize` for state IDs, reduces the + /// alphabet size by splitting bytes into equivalence classes. The + /// underlying DFAs are *not* minimized. + /// + /// If you want a non-default configuration, then use the + /// [`RegexBuilder`](struct.RegexBuilder.html) + /// to set your own configuration. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let re = Regex::new_sparse("foo[0-9]+bar")?; + /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn new_sparse( + pattern: &str, + ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> { + RegexBuilder::new().build_sparse(pattern) + } +} + +impl<D: DFA> Regex<D> { + /// Returns true if and only if the given bytes match. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!(true, re.is_match(b"foo12345bar")); + /// assert_eq!(false, re.is_match(b"foobar")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn is_match(&self, input: &[u8]) -> bool { + self.is_match_at(input, 0) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!(Some(4), re.shortest_match(b"foo12345")); + /// + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the shortest match semantics detect a match earlier. 
+ /// let re = Regex::new("abc|a")?; + /// assert_eq!(Some(1), re.shortest_match(b"abc")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn shortest_match(&self, input: &[u8]) -> Option<usize> { + self.shortest_match_at(input, 0) + } + + /// Returns the start and end offset of the leftmost first match. If no + /// match exists, then `None` is returned. + /// + /// The "leftmost first" match corresponds to the match with the smallest + /// starting offset, but where the end offset is determined by preferring + /// earlier branches in the original regular expression. For example, + /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will + /// match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. + /// + /// # Example + /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. + /// let re = Regex::new("abc|a")?; + /// assert_eq!(Some((0, 3)), re.find(b"abc")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> { + self.find_at(input, 0) + } + + /// Returns the same as `is_match`, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. 
For example, if the DFA is anchored, then + /// a match can only occur when `start == 0`. + pub fn is_match_at(&self, input: &[u8], start: usize) -> bool { + self.forward().is_match_at(input, start) + } + + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, if the DFA is anchored, then + /// a match can only occur when `start == 0`. + pub fn shortest_match_at( + &self, + input: &[u8], + start: usize, + ) -> Option<usize> { + self.forward().shortest_match_at(input, start) + } + + /// Returns the same as `find`, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, if the DFA is anchored, then + /// a match can only occur when `start == 0`. + pub fn find_at( + &self, + input: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let end = match self.forward().find_at(input, start) { + None => return None, + Some(end) => end, + }; + let start = self + .reverse() + .rfind(&input[start..end]) + .map(|i| start + i) + .expect("reverse search must match if forward search does"); + Some((start, end)) + } + + /// Returns an iterator over all non-overlapping leftmost first matches + /// in the given bytes. If no match exists, then the iterator yields no + /// elements. + /// + /// Note that if the regex can match the empty string, then it is + /// possible for the iterator to yield a zero-width match at a location + /// that is not a valid UTF-8 boundary (for example, between the code units + /// of a UTF-8 encoded codepoint). This can happen regardless of whether + /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8) + /// was enabled or not. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let re = Regex::new("foo[0-9]+")?; + /// let text = b"foo1 foo12 foo123"; + /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); + /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> { + Matches::new(self, input) + } + + /// Build a new regex from its constituent forward and reverse DFAs. + /// + /// This is useful when deserializing a regex from some arbitrary + /// memory region. This is also useful for building regexes from other + /// types of DFAs. + /// + /// # Example + /// + /// This example is a bit a contrived. The usual use of these methods + /// would involve serializing `initial_re` somewhere and then deserializing + /// it later to build a regex. + /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); + /// let re = Regex::from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok(()) }; example().unwrap() + /// ``` + /// + /// This example shows how you might build smaller DFAs, and then use those + /// smaller DFAs to build a new regex. 
+ /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let fwd = initial_re.forward().to_u16()?; + /// let rev = initial_re.reverse().to_u16()?; + /// let re = Regex::from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok(()) }; example().unwrap() + /// ``` + /// + /// This example shows how to build a `Regex` that uses sparse DFAs instead + /// of dense DFAs: + /// + /// ``` + /// use regex_automata::Regex; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let fwd = initial_re.forward().to_sparse()?; + /// let rev = initial_re.reverse().to_sparse()?; + /// let re = Regex::from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn from_dfas(forward: D, reverse: D) -> Regex<D> { + Regex { forward, reverse } + } + + /// Return the underlying DFA responsible for forward matching. + pub fn forward(&self) -> &D { + &self.forward + } + + /// Return the underlying DFA responsible for reverse matching. + pub fn reverse(&self) -> &D { + &self.reverse + } +} + +/// An iterator over all non-overlapping matches for a particular search. +/// +/// The iterator yields a `(usize, usize)` value until no more matches could be +/// found. The first `usize` is the start of the match (inclusive) while the +/// second `usize` is the end of the match (exclusive). +/// +/// `S` is the type used to represent state identifiers in the underlying +/// regex. The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression value itself. +/// * `'t` is the lifetime of the text being searched. 
+#[derive(Clone, Debug)] +pub struct Matches<'r, 't, D: DFA + 'r> { + re: &'r Regex<D>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 't, D: DFA> Matches<'r, 't, D> { + fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> { + Matches { re, text, last_end: 0, last_match: None } + } +} + +impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> { + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + if self.last_end > self.text.len() { + return None; + } + let (s, e) = match self.re.find_at(self.text, self.last_end) { + None => return None, + Some((s, e)) => (s, e), + }; + if s == e { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = e + 1; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(e) == self.last_match { + return self.next(); + } + } else { + self.last_end = e; + } + self.last_match = Some(e); + Some((s, e)) + } +} + +/// A builder for a regex based on deterministic finite automatons. +/// +/// This builder permits configuring several aspects of the construction +/// process such as case insensitivity, Unicode support and various options +/// that impact the size of the underlying DFAs. In some cases, options (like +/// performing DFA minimization) can come with a substantial additional cost. +/// +/// This builder generally constructs two DFAs, where one is responsible for +/// finding the end of a match and the other is responsible for finding the +/// start of a match. If you only need to detect whether something matched, +/// or only the end of a match, then you should use a +/// [`dense::Builder`](dense/struct.Builder.html) +/// to construct a single DFA, which is cheaper than building two DFAs. 
+#[cfg(feature = "std")] +#[derive(Clone, Debug)] +pub struct RegexBuilder { + dfa: dense::Builder, +} + +#[cfg(feature = "std")] +impl RegexBuilder { + /// Create a new regex builder with the default configuration. + pub fn new() -> RegexBuilder { + RegexBuilder { dfa: dense::Builder::new() } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result<Regex> { + self.build_with_size::<usize>(pattern) + } + + /// Build a regex from the given pattern using sparse DFAs. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build_sparse( + &self, + pattern: &str, + ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> { + self.build_with_size_sparse::<usize>(pattern) + } + + /// Build a regex from the given pattern using a specific representation + /// for the underlying DFA state IDs. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + /// + /// The representation of state IDs is determined by the `S` type + /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64` + /// or `usize`, where `usize` is the default used for `build`. The purpose + /// of specifying a representation for state IDs is to reduce the memory + /// footprint of the underlying DFAs. + /// + /// When using this routine, the chosen state ID representation will be + /// used throughout determinization and minimization, if minimization was + /// requested. Even if the minimized DFAs can fit into the chosen state ID + /// representation but the initial determinized DFA cannot, then this will + /// still return an error. 
To get a minimized DFA with a smaller state ID + /// representation, first build it with a bigger state ID representation, + /// and then shrink the sizes of the DFAs using one of its conversion + /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). + /// Finally, reconstitute the regex via + /// [`Regex::from_dfa`](struct.Regex.html#method.from_dfa). + pub fn build_with_size<S: StateID>( + &self, + pattern: &str, + ) -> Result<Regex<DenseDFA<Vec<S>, S>>> { + let forward = self.dfa.build_with_size(pattern)?; + let reverse = self + .dfa + .clone() + .anchored(true) + .reverse(true) + .longest_match(true) + .build_with_size(pattern)?; + Ok(Regex::from_dfas(forward, reverse)) + } + + /// Build a regex from the given pattern using a specific representation + /// for the underlying DFA state IDs using sparse DFAs. + pub fn build_with_size_sparse<S: StateID>( + &self, + pattern: &str, + ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> { + let re = self.build_with_size(pattern)?; + let fwd = re.forward().to_sparse()?; + let rev = re.reverse().to_sparse()?; + Ok(Regex::from_dfas(fwd, rev)) + } + + /// Set whether matching must be anchored at the beginning of the input. + /// + /// When enabled, a match must begin at the start of the input. When + /// disabled, the regex will act as if the pattern started with a `.*?`, + /// which enables a match to appear anywhere. + /// + /// By default this is disabled. + pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.anchored(yes); + self + } + + /// Enable or disable the case insensitive flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.case_insensitive(yes); + self + } + + /// Enable verbose mode in the regular expression. 
+ /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.ignore_whitespace(yes); + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.dot_matches_new_line(yes); + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.swap_greed(yes); + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.unicode(yes); + self + } + + /// When enabled, the builder will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the builder is guaranteed to produce a + /// regex that will only ever match valid UTF-8 (otherwise, the builder + /// will return an error). 
+ pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.allow_invalid_utf8(yes); + self + } + + /// Set the nesting limit used for the regular expression parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow when building a finite automaton from a regular expression's + /// abstract syntax tree. In particular, construction currently uses + /// recursion. In the future, the implementation may stop using recursion + /// and this option will no longer be necessary. + /// + /// This limit is not checked until the entire AST is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since the parser will + /// limit itself to heap space proportional to the length of the pattern + /// string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation AST item, which results + /// in a nest depth of `1`. In general, a nest limit is not something that + /// manifests in an obvious way in the concrete syntax, therefore, it + /// should not be used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { + self.dfa.nest_limit(limit); + self + } + + /// Minimize the underlying DFAs. + /// + /// When enabled, the DFAs powering the resulting regex will be minimized + /// such that it is as small as possible. 
+ /// + /// Whether one enables minimization or not depends on the types of costs + /// you're willing to pay and how much you care about its benefits. In + /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)` + /// space, where `n` is the number of DFA states and `k` is the alphabet + /// size. In practice, minimization can be quite costly in terms of both + /// space and time, so it should only be done if you're willing to wait + /// longer to produce a DFA. In general, you might want a minimal DFA in + /// the following circumstances: + /// + /// 1. You would like to optimize for the size of the automaton. This can + /// manifest in one of two ways. Firstly, if you're converting the + /// DFA into Rust code (or a table embedded in the code), then a minimal + /// DFA will translate into a corresponding reduction in code size, and + /// thus, also the final compiled binary size. Secondly, if you are + /// building many DFAs and putting them on the heap, you'll be able to + /// fit more if they are smaller. Note though that building a minimal + /// DFA itself requires additional space; you only realize the space + /// savings once the minimal DFA is constructed (at which point, the + /// space used for minimization is freed). + /// 2. You've observed that a smaller DFA results in faster match + /// performance. Naively, this isn't guaranteed since there is no + /// inherent difference between matching with a bigger-than-minimal + /// DFA and a minimal DFA. However, a smaller DFA may make use of your + /// CPU's cache more efficiently. + /// 3. You are trying to establish an equivalence between regular + /// languages. The standard method for this is to build a minimal DFA + /// for each language and then compare them. If the DFAs are equivalent + /// (up to state renaming), then the languages are equivalent. + /// + /// This option is disabled by default. 
+ pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.minimize(yes); + self + } + + /// Premultiply state identifiers in the underlying DFA transition tables. + /// + /// When enabled, state identifiers are premultiplied to point to their + /// corresponding row in the DFA's transition table. That is, given the + /// `i`th state, its corresponding premultiplied identifier is `i * k` + /// where `k` is the alphabet size of the DFA. (The alphabet size is at + /// most 256, but is in practice smaller if byte classes are enabled.) + /// + /// When state identifiers are not premultiplied, then the identifier of + /// the `i`th state is `i`. + /// + /// The advantage of premultiplying state identifiers is that it saves + /// a multiplication instruction per byte when searching with the DFA. + /// This has been observed to lead to a 20% performance benefit in + /// micro-benchmarks. + /// + /// The primary disadvantage of premultiplying state identifiers is + /// that they require a larger integer size to represent. For example, + /// if your DFA has 200 states, then its premultiplied form requires + /// 16 bits to represent every possible state identifier, whereas its + /// non-premultiplied form only requires 8 bits. + /// + /// This option is enabled by default. + pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.premultiply(yes); + self + } + + /// Shrink the size of the underlying DFA alphabet by mapping bytes to + /// their equivalence classes. + /// + /// When enabled, each DFA will use a map from all possible bytes to their + /// corresponding equivalence class. Each equivalence class represents a + /// set of bytes that does not discriminate between a match and a non-match + /// in the DFA. For example, the pattern `[ab]+` has at least two + /// equivalence classes: a set containing `a` and `b` and a set containing + /// every byte except for `a` and `b`. 
`a` and `b` are in the same + /// equivalence classes because they never discriminate between a match + /// and a non-match. + /// + /// The advantage of this map is that the size of the transition table can + /// be reduced drastically from `#states * 256 * sizeof(id)` to + /// `#states * k * sizeof(id)` where `k` is the number of equivalence + /// classes. As a result, total space usage can decrease substantially. + /// Moreover, since a smaller alphabet is used, compilation becomes faster + /// as well. + /// + /// The disadvantage of this map is that every byte searched must be + /// passed through this map before it can be used to determine the next + /// transition. This has a small match time performance cost. + /// + /// This option is enabled by default. + pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.byte_classes(yes); + self + } + + /// Apply best effort heuristics to shrink the NFA at the expense of more + /// time/memory. + /// + /// This may be exposed in the future, but for now is exported for use in + /// the `regex-automata-debug` tool. 
+ #[doc(hidden)] + pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder { + self.dfa.shrink(yes); + self + } +} + +#[cfg(feature = "std")] +impl Default for RegexBuilder { + fn default() -> RegexBuilder { + RegexBuilder::new() + } +} diff --git a/vendor/regex-automata/src/sparse.rs b/vendor/regex-automata/src/sparse.rs new file mode 100644 index 000000000..d18024b34 --- /dev/null +++ b/vendor/regex-automata/src/sparse.rs @@ -0,0 +1,1256 @@ +#[cfg(feature = "std")] +use core::fmt; +#[cfg(feature = "std")] +use core::iter; +use core::marker::PhantomData; +use core::mem::size_of; +#[cfg(feature = "std")] +use std::collections::HashMap; + +#[cfg(feature = "std")] +use byteorder::{BigEndian, LittleEndian}; +use byteorder::{ByteOrder, NativeEndian}; + +use classes::ByteClasses; +use dense; +use dfa::DFA; +#[cfg(feature = "std")] +use error::{Error, Result}; +#[cfg(feature = "std")] +use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID}; +#[cfg(not(feature = "std"))] +use state_id::{dead_id, StateID}; + +/// A sparse table-based deterministic finite automaton (DFA). +/// +/// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a +/// more space efficient representation for its transition table. Consequently, +/// sparse DFAs can use much less memory than dense DFAs, but this comes at a +/// price. In particular, reading the more space efficient transitions takes +/// more work, and consequently, searching using a sparse DFA is typically +/// slower than a dense DFA. +/// +/// A sparse DFA can be built using the default configuration via the +/// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise, +/// one can configure various aspects of a dense DFA via +/// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense +/// DFA to a sparse DFA using +/// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse). +/// +/// In general, a sparse DFA supports all the same operations as a dense DFA. 
+/// +/// Making the choice between a dense and sparse DFA depends on your specific +/// work load. If you can sacrifice a bit of search time performance, then a +/// sparse DFA might be the best choice. In particular, while sparse DFAs are +/// probably always slower than dense DFAs, you may find that they are easily +/// fast enough for your purposes! +/// +/// # State size +/// +/// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to +/// the type of the DFA's transition table while `S` corresponds to the +/// representation used for the DFA's state identifiers as described by the +/// [`StateID`](trait.StateID.html) trait. This type parameter is typically +/// `usize`, but other valid choices provided by this crate include `u8`, +/// `u16`, `u32` and `u64`. The primary reason for choosing a different state +/// identifier representation than the default is to reduce the amount of +/// memory used by a DFA. Note though, that if the chosen representation cannot +/// accommodate the size of your DFA, then building the DFA will fail and +/// return an error. +/// +/// While the reduction in heap memory used by a DFA is one reason for choosing +/// a smaller state identifier representation, another possible reason is for +/// decreasing the serialization size of a DFA, as returned by +/// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian), +/// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian) +/// or +/// [`to_bytes_native_endian`](enum.SparseDFA.html#method.to_bytes_native_endian). +/// +/// The type of the transition table is typically either `Vec<u8>` or `&[u8]`, +/// depending on where the transition table is stored. Note that this is +/// different than a dense DFA, whose transition table is typically +/// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads +/// its transition table from raw bytes because the table is compactly packed. 
+/// +/// # Variants +/// +/// This DFA is defined as a non-exhaustive enumeration of different types of +/// sparse DFAs. All of the variants use the same internal representation +/// for the transition table, but they vary in how the transition table is +/// read. A DFA's specific variant depends on the configuration options set via +/// [`dense::Builder`](dense/struct.Builder.html). The default variant is +/// `ByteClass`. +/// +/// # The `DFA` trait +/// +/// This type implements the [`DFA`](trait.DFA.html) trait, which means it +/// can be used for searching. For example: +/// +/// ``` +/// use regex_automata::{DFA, SparseDFA}; +/// +/// # fn example() -> Result<(), regex_automata::Error> { +/// let dfa = SparseDFA::new("foo[0-9]+")?; +/// assert_eq!(Some(8), dfa.find(b"foo12345")); +/// # Ok(()) }; example().unwrap() +/// ``` +/// +/// The `DFA` trait also provides an assortment of other lower level methods +/// for DFAs, such as `start_state` and `next_state`. While these are correctly +/// implemented, it is an anti-pattern to use them in performance sensitive +/// code on the `SparseDFA` type directly. Namely, each implementation requires +/// a branch to determine which type of sparse DFA is being used. Instead, +/// this branch should be pushed up a layer in the code since walking the +/// transitions of a DFA is usually a hot path. If you do need to use these +/// lower level methods in performance critical code, then you should match on +/// the variants of this DFA and use each variant's implementation of the `DFA` +/// trait directly. +#[derive(Clone, Debug)] +pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> { + /// A standard DFA that does not use byte classes. + Standard(Standard<T, S>), + /// A DFA that shrinks its alphabet to a set of equivalence classes instead + /// of using all possible byte values. 
Any two bytes belong to the same + /// equivalence class if and only if they can be used interchangeably + /// anywhere in the DFA while never discriminating between a match and a + /// non-match. + /// + /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much + /// from using byte classes. In some cases, using byte classes can even + /// marginally increase the size of a sparse DFA's transition table. The + /// reason for this is that a sparse DFA already compacts each state's + /// transitions separate from whether byte classes are used. + ByteClass(ByteClass<T, S>), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +#[cfg(feature = "std")] +impl SparseDFA<Vec<u8>, usize> { + /// Parse the given regular expression using a default configuration and + /// return the corresponding sparse DFA. + /// + /// The default configuration uses `usize` for state IDs and reduces the + /// alphabet size by splitting bytes into equivalence classes. The + /// resulting DFA is *not* minimized. + /// + /// If you want a non-default configuration, then use the + /// [`dense::Builder`](dense/struct.Builder.html) + /// to set your own configuration, and then call + /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse) + /// to create a sparse DFA. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{DFA, SparseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa = SparseDFA::new("foo[0-9]+bar")?; + /// assert_eq!(Some(11), dfa.find(b"foo12345bar")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> { + dense::Builder::new() + .build(pattern) + .and_then(|dense| dense.to_sparse()) + } +} + +#[cfg(feature = "std")] +impl<S: StateID> SparseDFA<Vec<u8>, S> { + /// Create a new empty sparse DFA that never matches any input. + /// + /// # Example + /// + /// In order to build an empty DFA, callers must provide a type hint + /// indicating their choice of state identifier representation. + /// + /// ``` + /// use regex_automata::{DFA, SparseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty(); + /// assert_eq!(None, dfa.find(b"")); + /// assert_eq!(None, dfa.find(b"foo")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn empty() -> SparseDFA<Vec<u8>, S> { + dense::DenseDFA::empty().to_sparse().unwrap() + } + + pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>( + dfa: &dense::Repr<T, S>, + ) -> Result<SparseDFA<Vec<u8>, A>> { + Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa()) + } +} + +impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { + /// Cheaply return a borrowed version of this sparse DFA. Specifically, the + /// DFA returned always uses `&[u8]` for its transition table while keeping + /// the same state identifier representation. + pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> { + match *self { + SparseDFA::Standard(Standard(ref r)) => { + SparseDFA::Standard(Standard(r.as_ref())) + } + SparseDFA::ByteClass(ByteClass(ref r)) => { + SparseDFA::ByteClass(ByteClass(r.as_ref())) + } + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + /// Return an owned version of this sparse DFA. 
Specifically, the DFA + /// returned always uses `Vec<u8>` for its transition table while keeping + /// the same state identifier representation. + /// + /// Effectively, this returns a sparse DFA whose transition table lives + /// on the heap. + #[cfg(feature = "std")] + pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> { + match *self { + SparseDFA::Standard(Standard(ref r)) => { + SparseDFA::Standard(Standard(r.to_owned())) + } + SparseDFA::ByteClass(ByteClass(ref r)) => { + SparseDFA::ByteClass(ByteClass(r.to_owned())) + } + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA's transition table. This typically corresponds to + /// heap memory usage. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::<SparseDFA>()`. + pub fn memory_usage(&self) -> usize { + self.repr().memory_usage() + } + + fn repr(&self) -> &Repr<T, S> { + match *self { + SparseDFA::Standard(ref r) => &r.0, + SparseDFA::ByteClass(ref r) => &r.0, + SparseDFA::__Nonexhaustive => unreachable!(), + } + } +} + +/// Routines for converting a sparse DFA to other representations, such as +/// smaller state identifiers or raw bytes suitable for persistent storage. +#[cfg(feature = "std")] +impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `u8` for the representation of state + /// identifiers. If `u8` is insufficient to represent all state identifiers + /// in this DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u8>()`. 
+ pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> { + self.to_sized() + } + + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `u16` for the representation of state + /// identifiers. If `u16` is insufficient to represent all state + /// identifiers in this DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u16>()`. + pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> { + self.to_sized() + } + + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `u32` for the representation of state + /// identifiers. If `u32` is insufficient to represent all state + /// identifiers in this DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u32>()`. + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> { + self.to_sized() + } + + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `u64` for the representation of state + /// identifiers. If `u64` is insufficient to represent all state + /// identifiers in this DFA, then this returns an error. + /// + /// This is a convenience routine for `to_sized::<u64>()`. + #[cfg(target_pointer_width = "64")] + pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> { + self.to_sized() + } + + /// Create a new sparse DFA whose match semantics are equivalent to + /// this DFA, but attempt to use `A` for the representation of state + /// identifiers. If `A` is insufficient to represent all state identifiers + /// in this DFA, then this returns an error. + /// + /// An alternative way to construct such a DFA is to use + /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized). 
+ /// In general, picking the appropriate size upon initial construction of + /// a sparse DFA is preferred, since it will do the conversion in one + /// step instead of two. + pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> { + self.repr().to_sized().map(|r| r.into_sparse_dfa()) + } + + /// Serialize a sparse DFA to raw bytes in little endian format. + /// + /// If the state identifier representation of this DFA has a size different + /// than 1, 2, 4 or 8 bytes, then this returns an error. All + /// implementations of `StateID` provided by this crate satisfy this + /// requirement. + pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> { + self.repr().to_bytes::<LittleEndian>() + } + + /// Serialize a sparse DFA to raw bytes in big endian format. + /// + /// If the state identifier representation of this DFA has a size different + /// than 1, 2, 4 or 8 bytes, then this returns an error. All + /// implementations of `StateID` provided by this crate satisfy this + /// requirement. + pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> { + self.repr().to_bytes::<BigEndian>() + } + + /// Serialize a sparse DFA to raw bytes in native endian format. + /// Generally, it is better to pick an explicit endianness using either + /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is + /// useful in tests where the DFA is serialized and deserialized on the + /// same platform. + /// + /// If the state identifier representation of this DFA has a size different + /// than 1, 2, 4 or 8 bytes, then this returns an error. All + /// implementations of `StateID` provided by this crate satisfy this + /// requirement. + pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> { + self.repr().to_bytes::<NativeEndian>() + } +} + +impl<'a, S: StateID> SparseDFA<&'a [u8], S> { + /// Deserialize a sparse DFA with a specific state identifier + /// representation. + /// + /// Deserializing a DFA using this routine will never allocate heap memory. 
+ /// This is also guaranteed to be a constant time operation that does not + /// vary with the size of the DFA. + /// + /// The bytes given should be generated by the serialization of a DFA with + /// either the + /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian) + /// method or the + /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian) + /// method, depending on the endianness of the machine you are + /// deserializing this DFA from. + /// + /// If the state identifier representation is `usize`, then deserialization + /// is dependent on the pointer size. For this reason, it is best to + /// serialize DFAs using a fixed size representation for your state + /// identifiers, such as `u8`, `u16`, `u32` or `u64`. + /// + /// # Panics + /// + /// The bytes given should be *trusted*. In particular, if the bytes + /// are not a valid serialization of a DFA, or if the endianness of the + /// serialized bytes is different than the endianness of the machine that + /// is deserializing the DFA, then this routine will panic. Moreover, it + /// is possible for this deserialization routine to succeed even if the + /// given bytes do not represent a valid serialized sparse DFA. + /// + /// # Safety + /// + /// This routine is unsafe because it permits callers to provide an + /// arbitrary transition table with possibly incorrect transitions. While + /// the various serialization routines will never return an incorrect + /// transition table, there is no guarantee that the bytes provided here + /// are correct. While deserialization does many checks (as documented + /// above in the panic conditions), this routine does not check that the + /// transition table is correct. Given an incorrect transition table, it is + /// possible for the search routines to access out-of-bounds memory because + /// of explicit bounds check elision. 
+ /// + /// # Example + /// + /// This example shows how to serialize a DFA to raw bytes, deserialize it + /// and then use it for searching. Note that we first convert the DFA to + /// using `u16` for its state identifier representation before serializing + /// it. While this isn't strictly necessary, it's good practice in order to + /// decrease the size of the DFA and to avoid platform specific pitfalls + /// such as differing pointer sizes. + /// + /// ``` + /// use regex_automata::{DFA, DenseDFA, SparseDFA}; + /// + /// # fn example() -> Result<(), regex_automata::Error> { + /// let sparse = SparseDFA::new("foo[0-9]+")?; + /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?; + /// + /// let dfa: SparseDFA<&[u8], u16> = unsafe { + /// SparseDFA::from_bytes(&bytes) + /// }; + /// + /// assert_eq!(Some(8), dfa.find(b"foo12345")); + /// # Ok(()) }; example().unwrap() + /// ``` + pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> { + Repr::from_bytes(buf).into_sparse_dfa() + } +} + +impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.repr().start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.repr().is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.repr().is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + match *self { + SparseDFA::Standard(ref r) => r.next_state(current, input), + SparseDFA::ByteClass(ref r) => r.next_state(current, input), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + self.next_state(current, input) + } + + // We specialize the following methods because it lets us 
lift the + // case analysis between the different types of sparse DFAs. Instead of + // doing the case analysis for every transition, we do it once before + // searching. For sparse DFAs, this doesn't seem to benefit performance as + // much as it does for the dense DFAs, but it's easy to do so we might as + // well do it. + + #[inline] + fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { + match *self { + SparseDFA::Standard(ref r) => r.is_match_at(bytes, start), + SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + #[inline] + fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + match *self { + SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start), + SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + #[inline] + fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + match *self { + SparseDFA::Standard(ref r) => r.find_at(bytes, start), + SparseDFA::ByteClass(ref r) => r.find_at(bytes, start), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } + + #[inline] + fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { + match *self { + SparseDFA::Standard(ref r) => r.rfind_at(bytes, start), + SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start), + SparseDFA::__Nonexhaustive => unreachable!(), + } + } +} + +/// A standard sparse DFA that does not use premultiplication or byte classes. +/// +/// Generally, it isn't necessary to use this type directly, since a +/// `SparseDFA` can be used for searching directly. One possible reason why +/// one might want to use this type directly is if you are implementing your +/// own search routines by walking a DFA's transitions directly. In that case, +/// you'll want to use this type (or any of the other DFA variant types) +/// directly, since they implement `next_state` more efficiently. 
+#[derive(Clone, Debug)] +pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); + +impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.0.start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.0.is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.0.is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.0.is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.0.is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + self.0.state(current).next(input) + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + self.next_state(current, input) + } +} + +/// A sparse DFA that shrinks its alphabet. +/// +/// Alphabet shrinking is achieved by using a set of equivalence classes +/// instead of using all possible byte values. Any two bytes belong to the same +/// equivalence class if and only if they can be used interchangeably anywhere +/// in the DFA while never discriminating between a match and a non-match. +/// +/// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from +/// using byte classes. In some cases, using byte classes can even marginally +/// increase the size of a sparse DFA's transition table. The reason for this +/// is that a sparse DFA already compacts each state's transitions separate +/// from whether byte classes are used. +/// +/// Generally, it isn't necessary to use this type directly, since a +/// `SparseDFA` can be used for searching directly. One possible reason why +/// one might want to use this type directly is if you are implementing your +/// own search routines by walking a DFA's transitions directly. 
In that case, +/// you'll want to use this type (or any of the other DFA variant types) +/// directly, since they implement `next_state` more efficiently. +#[derive(Clone, Debug)] +pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); + +impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> { + type ID = S; + + #[inline] + fn start_state(&self) -> S { + self.0.start_state() + } + + #[inline] + fn is_match_state(&self, id: S) -> bool { + self.0.is_match_state(id) + } + + #[inline] + fn is_dead_state(&self, id: S) -> bool { + self.0.is_dead_state(id) + } + + #[inline] + fn is_match_or_dead_state(&self, id: S) -> bool { + self.0.is_match_or_dead_state(id) + } + + #[inline] + fn is_anchored(&self) -> bool { + self.0.is_anchored() + } + + #[inline] + fn next_state(&self, current: S, input: u8) -> S { + let input = self.0.byte_classes.get(input); + self.0.state(current).next(input) + } + + #[inline] + unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { + self.next_state(current, input) + } +} + +/// The underlying representation of a sparse DFA. This is shared by all of +/// the different variants of a sparse DFA. 
#[derive(Clone)]
#[cfg_attr(not(feature = "std"), derive(Debug))]
struct Repr<T: AsRef<[u8]>, S: StateID = usize> {
    /// Whether the DFA is anchored, as reported by `is_anchored`.
    anchored: bool,
    /// The identifier of the start state.
    start: S,
    /// The number of states in this DFA.
    state_count: usize,
    /// The largest identifier treated as "match or dead": any identifier
    /// less than or equal to this one is either the dead state or a match
    /// state.
    max_match: S,
    /// The byte equivalence classes associated with this DFA.
    byte_classes: ByteClasses,
    /// The encoded transition table. A state identifier is a byte offset
    /// into this table; each state is laid out as a native endian `u16`
    /// transition count, followed by that many `(start, end)` byte pairs,
    /// followed by that many native endian state identifiers.
    trans: T,
}

impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> {
    /// Wrap this representation in the `SparseDFA` variant that matches its
    /// byte class configuration: `Standard` when the byte classes are a
    /// singleton mapping, and `ByteClass` otherwise.
    fn into_sparse_dfa(self) -> SparseDFA<T, S> {
        if !self.byte_classes.is_singleton() {
            return SparseDFA::ByteClass(ByteClass(self));
        }
        SparseDFA::Standard(Standard(self))
    }

    /// Return a copy of this representation that borrows the transition
    /// table instead of owning it.
    fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> {
        let trans: &'a [u8] = self.trans();
        let byte_classes = self.byte_classes.clone();
        Repr {
            anchored: self.anchored,
            start: self.start,
            state_count: self.state_count,
            max_match: self.max_match,
            byte_classes,
            trans,
        }
    }

    /// Return a copy of this representation whose transition table is
    /// copied into a freshly allocated `Vec<u8>`.
    #[cfg(feature = "std")]
    fn to_owned(&self) -> Repr<Vec<u8>, S> {
        let trans: Vec<u8> = self.trans().to_vec();
        let byte_classes = self.byte_classes.clone();
        Repr {
            anchored: self.anchored,
            start: self.start,
            state_count: self.state_count,
            max_match: self.max_match,
            byte_classes,
            trans,
        }
    }

    /// Decode the state stored at the byte offset `id` into a borrowed view.
    ///
    /// This stays `#[inline]` because it was observed not to be inlined
    /// otherwise, which costs a fairly significant amount of performance
    /// (~25%).
    #[inline]
    fn state<'a>(&'a self, id: S) -> State<'a, S> {
        let table = self.trans();
        let header = id.to_usize();
        // Two byte transition count, in native endian.
        let ntrans = NativeEndian::read_u16(&table[header..]) as usize;
        // The input ranges (two bytes per transition) follow the count...
        let ranges_start = header + 2;
        let input_ranges = &table[ranges_start..ranges_start + (ntrans * 2)];
        // ...and the next state identifiers follow the ranges.
        let next_start = ranges_start + 2 * ntrans;
        let next = &table[next_start..next_start + (ntrans * size_of::<S>())];
        State { _state_id_repr: PhantomData, ntrans, input_ranges, next }
    }

    /// Return an iterator over all of the states in this DFA.
    ///
    /// The iterator returned yields tuples, where the first element is the
    /// state ID and the second element is the state itself.
+ #[cfg(feature = "std")] + fn states<'a>(&'a self) -> StateIter<'a, T, S> { + StateIter { dfa: self, id: dead_id() } + } + + fn memory_usage(&self) -> usize { + self.trans().len() + } + + fn start_state(&self) -> S { + self.start + } + + fn is_match_state(&self, id: S) -> bool { + self.is_match_or_dead_state(id) && !self.is_dead_state(id) + } + + fn is_dead_state(&self, id: S) -> bool { + id == dead_id() + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + id <= self.max_match + } + + fn is_anchored(&self) -> bool { + self.anchored + } + + fn trans(&self) -> &[u8] { + self.trans.as_ref() + } + + /// Create a new sparse DFA whose match semantics are equivalent to this + /// DFA, but attempt to use `A` for the representation of state + /// identifiers. If `A` is insufficient to represent all state identifiers + /// in this DFA, then this returns an error. + #[cfg(feature = "std")] + fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> { + // To build the new DFA, we proceed much like the initial construction + // of the sparse DFA. Namely, since the state ID size is changing, + // we don't actually know all of our state IDs until we've allocated + // all necessary space. So we do one pass that allocates all of the + // storage we need, and then another pass to fill in the transitions. 
+ + let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count); + let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count); + for (old_id, state) in self.states() { + let pos = trans.len(); + map.insert(old_id, usize_to_state_id(pos)?); + + let n = state.ntrans; + let zeros = 2 + (n * 2) + (n * size_of::<A>()); + trans.extend(iter::repeat(0).take(zeros)); + + NativeEndian::write_u16(&mut trans[pos..], n as u16); + let (s, e) = (pos + 2, pos + 2 + (n * 2)); + trans[s..e].copy_from_slice(state.input_ranges); + } + + let mut new = Repr { + anchored: self.anchored, + start: map[&self.start], + state_count: self.state_count, + max_match: map[&self.max_match], + byte_classes: self.byte_classes.clone(), + trans, + }; + for (&old_id, &new_id) in map.iter() { + let old_state = self.state(old_id); + let mut new_state = new.state_mut(new_id); + for i in 0..new_state.ntrans { + let next = map[&old_state.next_at(i)]; + new_state.set_next_at(i, usize_to_state_id(next.to_usize())?); + } + } + new.start = map[&self.start]; + new.max_match = map[&self.max_match]; + Ok(new) + } + + /// Serialize a sparse DFA to raw bytes using the provided endianness. + /// + /// If the state identifier representation of this DFA has a size different + /// than 1, 2, 4 or 8 bytes, then this returns an error. All + /// implementations of `StateID` provided by this crate satisfy this + /// requirement. + /// + /// Unlike dense DFAs, the result is not necessarily aligned since a + /// sparse DFA's transition table is always read as a sequence of bytes. + #[cfg(feature = "std")] + fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> { + let label = b"rust-regex-automata-sparse-dfa\x00"; + let size = + // For human readable label. + label.len() + // endiannes check, must be equal to 0xFEFF for native endian + + 2 + // For version number. + + 2 + // Size of state ID representation, in bytes. + // Must be 1, 2, 4 or 8. + + 2 + // For DFA misc options. (Currently unused.) 
+ + 2 + // For start state. + + 8 + // For state count. + + 8 + // For max match state. + + 8 + // For byte class map. + + 256 + // For transition table. + + self.trans().len(); + + let mut i = 0; + let mut buf = vec![0; size]; + + // write label + for &b in label { + buf[i] = b; + i += 1; + } + // endianness check + A::write_u16(&mut buf[i..], 0xFEFF); + i += 2; + // version number + A::write_u16(&mut buf[i..], 1); + i += 2; + // size of state ID + let state_size = size_of::<S>(); + if ![1, 2, 4, 8].contains(&state_size) { + return Err(Error::serialize(&format!( + "state size of {} not supported, must be 1, 2, 4 or 8", + state_size + ))); + } + A::write_u16(&mut buf[i..], state_size as u16); + i += 2; + // DFA misc options + let mut options = 0u16; + if self.anchored { + options |= dense::MASK_ANCHORED; + } + A::write_u16(&mut buf[i..], options); + i += 2; + // start state + A::write_u64(&mut buf[i..], self.start.to_usize() as u64); + i += 8; + // state count + A::write_u64(&mut buf[i..], self.state_count as u64); + i += 8; + // max match state + A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64); + i += 8; + // byte class map + for b in (0..256).map(|b| b as u8) { + buf[i] = self.byte_classes.get(b); + i += 1; + } + // transition table + for (_, state) in self.states() { + A::write_u16(&mut buf[i..], state.ntrans as u16); + i += 2; + buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges); + i += state.ntrans * 2; + for j in 0..state.ntrans { + write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j)); + i += size_of::<S>(); + } + } + + assert_eq!(size, i, "expected to consume entire buffer"); + + Ok(buf) + } +} + +impl<'a, S: StateID> Repr<&'a [u8], S> { + /// The implementation for deserializing a sparse DFA from raw bytes. 
+ unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> { + // skip over label + match buf.iter().position(|&b| b == b'\x00') { + None => panic!("could not find label"), + Some(i) => buf = &buf[i + 1..], + } + + // check that current endianness is same as endianness of DFA + let endian_check = NativeEndian::read_u16(buf); + buf = &buf[2..]; + if endian_check != 0xFEFF { + panic!( + "endianness mismatch, expected 0xFEFF but got 0x{:X}. \ + are you trying to load a SparseDFA serialized with a \ + different endianness?", + endian_check, + ); + } + + // check that the version number is supported + let version = NativeEndian::read_u16(buf); + buf = &buf[2..]; + if version != 1 { + panic!( + "expected version 1, but found unsupported version {}", + version, + ); + } + + // read size of state + let state_size = NativeEndian::read_u16(buf) as usize; + if state_size != size_of::<S>() { + panic!( + "state size of SparseDFA ({}) does not match \ + requested state size ({})", + state_size, + size_of::<S>(), + ); + } + buf = &buf[2..]; + + // read miscellaneous options + let opts = NativeEndian::read_u16(buf); + buf = &buf[2..]; + + // read start state + let start = S::from_usize(NativeEndian::read_u64(buf) as usize); + buf = &buf[8..]; + + // read state count + let state_count = NativeEndian::read_u64(buf) as usize; + buf = &buf[8..]; + + // read max match state + let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize); + buf = &buf[8..]; + + // read byte classes + let byte_classes = ByteClasses::from_slice(&buf[..256]); + buf = &buf[256..]; + + Repr { + anchored: opts & dense::MASK_ANCHORED > 0, + start, + state_count, + max_match, + byte_classes, + trans: buf, + } + } +} + +#[cfg(feature = "std")] +impl<S: StateID> Repr<Vec<u8>, S> { + /// The implementation for constructing a sparse DFA from a dense DFA. 
+ fn from_dense_sized<T: AsRef<[S]>, A: StateID>( + dfa: &dense::Repr<T, S>, + ) -> Result<Repr<Vec<u8>, A>> { + // In order to build the transition table, we need to be able to write + // state identifiers for each of the "next" transitions in each state. + // Our state identifiers correspond to the byte offset in the + // transition table at which the state is encoded. Therefore, we do not + // actually know what the state identifiers are until we've allocated + // exactly as much space as we need for each state. Thus, construction + // of the transition table happens in two passes. + // + // In the first pass, we fill out the shell of each state, which + // includes the transition count, the input byte ranges and zero-filled + // space for the transitions. In this first pass, we also build up a + // map from the state identifier index of the dense DFA to the state + // identifier in this sparse DFA. + // + // In the second pass, we fill in the transitions based on the map + // built in the first pass. 
+ + let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count()); + let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()]; + for (old_id, state) in dfa.states() { + let pos = trans.len(); + + remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?; + // zero-filled space for the transition count + trans.push(0); + trans.push(0); + + let mut trans_count = 0; + for (b1, b2, _) in state.sparse_transitions() { + trans_count += 1; + trans.push(b1); + trans.push(b2); + } + // fill in the transition count + NativeEndian::write_u16(&mut trans[pos..], trans_count); + + // zero-fill the actual transitions + let zeros = trans_count as usize * size_of::<A>(); + trans.extend(iter::repeat(0).take(zeros)); + } + + let mut new = Repr { + anchored: dfa.is_anchored(), + start: remap[dfa.state_id_to_index(dfa.start_state())], + state_count: dfa.state_count(), + max_match: remap[dfa.state_id_to_index(dfa.max_match_state())], + byte_classes: dfa.byte_classes().clone(), + trans, + }; + for (old_id, old_state) in dfa.states() { + let new_id = remap[dfa.state_id_to_index(old_id)]; + let mut new_state = new.state_mut(new_id); + let sparse = old_state.sparse_transitions(); + for (i, (_, _, next)) in sparse.enumerate() { + let next = remap[dfa.state_id_to_index(next)]; + new_state.set_next_at(i, next); + } + } + Ok(new) + } + + /// Return a convenient mutable representation of the given state. 
+ fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> { + let mut pos = id.to_usize(); + let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize; + pos += 2; + + let size = (ntrans * 2) + (ntrans * size_of::<S>()); + let ranges_and_next = &mut self.trans[pos..pos + size]; + let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2); + StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next } + } +} + +#[cfg(feature = "std")] +impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn state_status<T: AsRef<[u8]>, S: StateID>( + dfa: &Repr<T, S>, + id: S, + ) -> &'static str { + if id == dead_id() { + if dfa.is_match_state(id) { + "D*" + } else { + "D " + } + } else if id == dfa.start_state() { + if dfa.is_match_state(id) { + ">*" + } else { + "> " + } + } else { + if dfa.is_match_state(id) { + " *" + } else { + " " + } + } + } + + writeln!(f, "SparseDFA(")?; + for (id, state) in self.states() { + let status = state_status(self, id); + writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?; + } + writeln!(f, ")")?; + Ok(()) + } +} + +/// An iterator over all states in a sparse DFA. +/// +/// This iterator yields tuples, where the first element is the state ID and +/// the second element is the state itself. +#[cfg(feature = "std")] +#[derive(Debug)] +struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> { + dfa: &'a Repr<T, S>, + id: S, +} + +#[cfg(feature = "std")] +impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> { + type Item = (S, State<'a, S>); + + fn next(&mut self) -> Option<(S, State<'a, S>)> { + if self.id.to_usize() >= self.dfa.trans().len() { + return None; + } + let id = self.id; + let state = self.dfa.state(id); + self.id = S::from_usize(self.id.to_usize() + state.bytes()); + Some((id, state)) + } +} + +/// A representation of a sparse DFA state that can be cheaply materialized +/// from a state identifier. 
+#[derive(Clone)] +struct State<'a, S: StateID = usize> { + /// The state identifier representation used by the DFA from which this + /// state was extracted. Since our transition table is compacted in a + /// &[u8], we don't actually use the state ID type parameter explicitly + /// anywhere, so we fake it. This prevents callers from using an incorrect + /// state ID representation to read from this state. + _state_id_repr: PhantomData<S>, + /// The number of transitions in this state. + ntrans: usize, + /// Pairs of input ranges, where there is one pair for each transition. + /// Each pair specifies an inclusive start and end byte range for the + /// corresponding transition. + input_ranges: &'a [u8], + /// Transitions to the next state. This slice contains native endian + /// encoded state identifiers, with `S` as the representation. Thus, there + /// are `ntrans * size_of::<S>()` bytes in this slice. + next: &'a [u8], +} + +impl<'a, S: StateID> State<'a, S> { + /// Searches for the next transition given an input byte. If no such + /// transition could be found, then a dead state is returned. + fn next(&self, input: u8) -> S { + // This straight linear search was observed to be much better than + // binary search on ASCII haystacks, likely because a binary search + // visits the ASCII case last but a linear search sees it first. A + // binary search does do a little better on non-ASCII haystacks, but + // not by much. There might be a better trade off lurking here. + for i in 0..self.ntrans { + let (start, end) = self.range(i); + if start <= input && input <= end { + return self.next_at(i); + } + // We could bail early with an extra branch: if input < b1, then + // we know we'll never find a matching transition. Interestingly, + // this extra branch seems to not help performance, or will even + // hurt it. It's likely very dependent on the DFA itself and what + // is being searched. 
+ } + dead_id() + } + + /// Returns the inclusive input byte range for the ith transition in this + /// state. + fn range(&self, i: usize) -> (u8, u8) { + (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) + } + + /// Returns the next state for the ith transition in this state. + fn next_at(&self, i: usize) -> S { + S::read_bytes(&self.next[i * size_of::<S>()..]) + } + + /// Return the total number of bytes that this state consumes in its + /// encoded form. + #[cfg(feature = "std")] + fn bytes(&self) -> usize { + 2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>()) + } +} + +#[cfg(feature = "std")] +impl<'a, S: StateID> fmt::Debug for State<'a, S> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut transitions = vec![]; + for i in 0..self.ntrans { + let next = self.next_at(i); + if next == dead_id() { + continue; + } + + let (start, end) = self.range(i); + if start == end { + transitions.push(format!( + "{} => {}", + escape(start), + next.to_usize() + )); + } else { + transitions.push(format!( + "{}-{} => {}", + escape(start), + escape(end), + next.to_usize(), + )); + } + } + write!(f, "{}", transitions.join(", ")) + } +} + +/// A representation of a mutable sparse DFA state that can be cheaply +/// materialized from a state identifier. +#[cfg(feature = "std")] +struct StateMut<'a, S: StateID = usize> { + /// The state identifier representation used by the DFA from which this + /// state was extracted. Since our transition table is compacted in a + /// &[u8], we don't actually use the state ID type parameter explicitly + /// anywhere, so we fake it. This prevents callers from using an incorrect + /// state ID representation to read from this state. + _state_id_repr: PhantomData<S>, + /// The number of transitions in this state. + ntrans: usize, + /// Pairs of input ranges, where there is one pair for each transition. + /// Each pair specifies an inclusive start and end byte range for the + /// corresponding transition. 
+ input_ranges: &'a mut [u8], + /// Transitions to the next state. This slice contains native endian + /// encoded state identifiers, with `S` as the representation. Thus, there + /// are `ntrans * size_of::<S>()` bytes in this slice. + next: &'a mut [u8], +} + +#[cfg(feature = "std")] +impl<'a, S: StateID> StateMut<'a, S> { + /// Sets the ith transition to the given state. + fn set_next_at(&mut self, i: usize, next: S) { + next.write_bytes(&mut self.next[i * size_of::<S>()..]); + } +} + +#[cfg(feature = "std")] +impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let state = State { + _state_id_repr: self._state_id_repr, + ntrans: self.ntrans, + input_ranges: self.input_ranges, + next: self.next, + }; + fmt::Debug::fmt(&state, f) + } +} + +/// Return the given byte as its escaped string form. +#[cfg(feature = "std")] +fn escape(b: u8) -> String { + use std::ascii; + + String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap() +} + +/// A binary search routine specialized specifically to a sparse DFA state's +/// transitions. Specifically, the transitions are defined as a set of pairs +/// of input bytes that delineate an inclusive range of bytes. If the input +/// byte is in the range, then the corresponding transition is a match. +/// +/// This binary search accepts a slice of these pairs and returns the position +/// of the matching pair (the ith transition), or None if no matching pair +/// could be found. +/// +/// Note that this routine is not currently used since it was observed to +/// either decrease performance when searching ASCII, or did not provide enough +/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here +/// for posterity in case we can find a way to use it. 
+/// +/// In theory, we could use the standard library's search routine if we could +/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently +/// guaranteed to be safe and is thus UB (since I don't think the in-memory +/// representation of `(u8, u8)` has been nailed down). +#[inline(always)] +#[allow(dead_code)] +fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { + debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); + debug_assert!(ranges.len() <= 512, "ranges should be short"); + + let (mut left, mut right) = (0, ranges.len() / 2); + while left < right { + let mid = (left + right) / 2; + let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); + if needle < b1 { + right = mid; + } else if needle > b2 { + left = mid + 1; + } else { + return Some(mid); + } + } + None +} diff --git a/vendor/regex-automata/src/sparse_set.rs b/vendor/regex-automata/src/sparse_set.rs new file mode 100644 index 000000000..56743b033 --- /dev/null +++ b/vendor/regex-automata/src/sparse_set.rs @@ -0,0 +1,60 @@ +use std::slice; + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. +/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse sparse sets, so the initial allocation cost is bareable. However, its +/// other properties listed above are extremely useful. +#[derive(Clone, Debug)] +pub struct SparseSet { + /// Dense contains the instruction pointers in the order in which they + /// were inserted. + dense: Vec<usize>, + /// Sparse maps instruction pointers to their location in dense. + /// + /// An instruction pointer is in the set if and only if + /// sparse[ip] < dense.len() && ip == dense[sparse[ip]]. 
+ sparse: Box<[usize]>, +} + +impl SparseSet { + pub fn new(size: usize) -> SparseSet { + SparseSet { + dense: Vec::with_capacity(size), + sparse: vec![0; size].into_boxed_slice(), + } + } + + pub fn len(&self) -> usize { + self.dense.len() + } + + pub fn insert(&mut self, value: usize) { + let i = self.len(); + assert!(i < self.dense.capacity()); + self.dense.push(value); + self.sparse[value] = i; + } + + pub fn contains(&self, value: usize) -> bool { + let i = self.sparse[value]; + self.dense.get(i) == Some(&value) + } + + pub fn clear(&mut self) { + self.dense.clear(); + } +} + +impl<'a> IntoIterator for &'a SparseSet { + type Item = &'a usize; + type IntoIter = slice::Iter<'a, usize>; + fn into_iter(self) -> Self::IntoIter { + self.dense.iter() + } +} diff --git a/vendor/regex-automata/src/state_id.rs b/vendor/regex-automata/src/state_id.rs new file mode 100644 index 000000000..c9bac1941 --- /dev/null +++ b/vendor/regex-automata/src/state_id.rs @@ -0,0 +1,291 @@ +use core::fmt::Debug; +use core::hash::Hash; +use core::mem::size_of; + +use byteorder::{ByteOrder, NativeEndian}; + +#[cfg(feature = "std")] +pub use self::std::*; + +#[cfg(feature = "std")] +mod std { + use byteorder::ByteOrder; + use core::mem::size_of; + use error::{Error, Result}; + + use super::StateID; + + /// Check that the premultiplication of the given state identifier can + /// fit into the representation indicated by `S`. If it cannot, or if it + /// overflows `usize` itself, then an error is returned. 
+ pub fn premultiply_overflow_error<S: StateID>( + last_state: S, + alphabet_len: usize, + ) -> Result<()> { + let requested = match last_state.to_usize().checked_mul(alphabet_len) { + Some(requested) => requested, + None => return Err(Error::premultiply_overflow(0, 0)), + }; + if requested > S::max_id() { + return Err(Error::premultiply_overflow(S::max_id(), requested)); + } + Ok(()) + } + + /// Allocate the next sequential identifier for a fresh state given + /// the previously constructed state identified by `current`. If the + /// next sequential identifier would overflow `usize` or the chosen + /// representation indicated by `S`, then an error is returned. + pub fn next_state_id<S: StateID>(current: S) -> Result<S> { + let next = match current.to_usize().checked_add(1) { + Some(next) => next, + None => return Err(Error::state_id_overflow(::std::usize::MAX)), + }; + if next > S::max_id() { + return Err(Error::state_id_overflow(S::max_id())); + } + Ok(S::from_usize(next)) + } + + /// Convert the given `usize` to the chosen state identifier + /// representation. If the given value cannot fit in the chosen + /// representation, then an error is returned. + pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> { + if value > S::max_id() { + Err(Error::state_id_overflow(S::max_id())) + } else { + Ok(S::from_usize(value)) + } + } + + /// Write the given identifier to the given slice of bytes using the + /// specified endianness. The given slice must have length at least + /// `size_of::<S>()`. + /// + /// The given state identifier representation must have size 1, 2, 4 or 8. 
+ pub fn write_state_id_bytes<E: ByteOrder, S: StateID>( + slice: &mut [u8], + id: S, + ) { + assert!( + 1 == size_of::<S>() + || 2 == size_of::<S>() + || 4 == size_of::<S>() + || 8 == size_of::<S>() + ); + + match size_of::<S>() { + 1 => slice[0] = id.to_usize() as u8, + 2 => E::write_u16(slice, id.to_usize() as u16), + 4 => E::write_u32(slice, id.to_usize() as u32), + 8 => E::write_u64(slice, id.to_usize() as u64), + _ => unreachable!(), + } + } +} + +/// Return the unique identifier for a DFA's dead state in the chosen +/// representation indicated by `S`. +pub fn dead_id<S: StateID>() -> S { + S::from_usize(0) +} + +/// A trait describing the representation of a DFA's state identifier. +/// +/// The purpose of this trait is to safely express both the possible state +/// identifier representations that can be used in a DFA and to convert between +/// state identifier representations and types that can be used to efficiently +/// index memory (such as `usize`). +/// +/// In general, one should not need to implement this trait explicitly. In +/// particular, this crate provides implementations for `u8`, `u16`, `u32`, +/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can +/// represent all corresponding values in a `usize`.) +/// +/// # Safety +/// +/// This trait is unsafe because the correctness of its implementations may be +/// relied upon by other unsafe code. For example, one possible way to +/// implement this trait incorrectly would be to return a maximum identifier +/// in `max_id` that is greater than the real maximum identifier. This will +/// likely result in wrap-on-overflow semantics in release mode, which can in +/// turn produce incorrect state identifiers. Those state identifiers may then +/// in turn access out-of-bounds memory in a DFA's search routine, where bounds +/// checks are explicitly elided for performance reasons. 
+pub unsafe trait StateID: + Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord +{ + /// Convert from a `usize` to this implementation's representation. + /// + /// Implementors may assume that `n <= Self::max_id`. That is, implementors + /// do not need to check whether `n` can fit inside this implementation's + /// representation. + fn from_usize(n: usize) -> Self; + + /// Convert this implementation's representation to a `usize`. + /// + /// Implementors must not return a `usize` value greater than + /// `Self::max_id` and must not permit overflow when converting between the + /// implementor's representation and `usize`. In general, the preferred + /// way for implementors to achieve this is to simply not provide + /// implementations of `StateID` that cannot fit into the target platform's + /// `usize`. + fn to_usize(self) -> usize; + + /// Return the maximum state identifier supported by this representation. + /// + /// Implementors must return a correct bound. Doing otherwise may result + /// in memory unsafety. + fn max_id() -> usize; + + /// Read a single state identifier from the given slice of bytes in native + /// endian format. + /// + /// Implementors may assume that the given slice has length at least + /// `size_of::<Self>()`. + fn read_bytes(slice: &[u8]) -> Self; + + /// Write this state identifier to the given slice of bytes in native + /// endian format. + /// + /// Implementors may assume that the given slice has length at least + /// `size_of::<Self>()`. 
+ fn write_bytes(self, slice: &mut [u8]); +} + +unsafe impl StateID for usize { + #[inline] + fn from_usize(n: usize) -> usize { + n + } + + #[inline] + fn to_usize(self) -> usize { + self + } + + #[inline] + fn max_id() -> usize { + ::core::usize::MAX + } + + #[inline] + fn read_bytes(slice: &[u8]) -> Self { + NativeEndian::read_uint(slice, size_of::<usize>()) as usize + } + + #[inline] + fn write_bytes(self, slice: &mut [u8]) { + NativeEndian::write_uint(slice, self as u64, size_of::<usize>()) + } +} + +unsafe impl StateID for u8 { + #[inline] + fn from_usize(n: usize) -> u8 { + n as u8 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::core::u8::MAX as usize + } + + #[inline] + fn read_bytes(slice: &[u8]) -> Self { + slice[0] + } + + #[inline] + fn write_bytes(self, slice: &mut [u8]) { + slice[0] = self; + } +} + +unsafe impl StateID for u16 { + #[inline] + fn from_usize(n: usize) -> u16 { + n as u16 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::core::u16::MAX as usize + } + + #[inline] + fn read_bytes(slice: &[u8]) -> Self { + NativeEndian::read_u16(slice) + } + + #[inline] + fn write_bytes(self, slice: &mut [u8]) { + NativeEndian::write_u16(slice, self) + } +} + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +unsafe impl StateID for u32 { + #[inline] + fn from_usize(n: usize) -> u32 { + n as u32 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::core::u32::MAX as usize + } + + #[inline] + fn read_bytes(slice: &[u8]) -> Self { + NativeEndian::read_u32(slice) + } + + #[inline] + fn write_bytes(self, slice: &mut [u8]) { + NativeEndian::write_u32(slice, self) + } +} + +#[cfg(target_pointer_width = "64")] +unsafe impl StateID for u64 { + #[inline] + fn from_usize(n: usize) -> u64 { + n as u64 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize 
+ } + + #[inline] + fn max_id() -> usize { + ::core::u64::MAX as usize + } + + #[inline] + fn read_bytes(slice: &[u8]) -> Self { + NativeEndian::read_u64(slice) + } + + #[inline] + fn write_bytes(self, slice: &mut [u8]) { + NativeEndian::write_u64(slice, self) + } +} diff --git a/vendor/regex-automata/src/transducer.rs b/vendor/regex-automata/src/transducer.rs new file mode 100644 index 000000000..679c75726 --- /dev/null +++ b/vendor/regex-automata/src/transducer.rs @@ -0,0 +1,107 @@ +use fst::Automaton; + +use crate::{StateID, DFA}; + +macro_rules! imp { + ($ty:ty, $id:ty) => { + impl<T: AsRef<[$id]>, S: StateID> Automaton for $ty { + type State = S; + + #[inline] + fn start(&self) -> S { + self.start_state() + } + + #[inline] + fn is_match(&self, state: &S) -> bool { + self.is_match_state(*state) + } + + #[inline] + fn accept(&self, state: &S, byte: u8) -> S { + self.next_state(*state, byte) + } + + #[inline] + fn can_match(&self, state: &S) -> bool { + !self.is_dead_state(*state) + } + } + }; +} + +imp!(crate::dense::DenseDFA<T, S>, S); +imp!(crate::dense::Standard<T, S>, S); +imp!(crate::dense::ByteClass<T, S>, S); +imp!(crate::dense::Premultiplied<T, S>, S); +imp!(crate::dense::PremultipliedByteClass<T, S>, S); +imp!(crate::sparse::SparseDFA<T, S>, u8); +imp!(crate::sparse::Standard<T, S>, u8); +imp!(crate::sparse::ByteClass<T, S>, u8); + +#[cfg(test)] +mod tests { + use bstr::BString; + use fst::{Automaton, IntoStreamer, Set, Streamer}; + + use crate::dense::{self, DenseDFA}; + use crate::sparse::SparseDFA; + + fn search<A: Automaton, D: AsRef<[u8]>>( + set: &Set<D>, + aut: A, + ) -> Vec<BString> { + let mut stream = set.search(aut).into_stream(); + + let mut results = vec![]; + while let Some(key) = stream.next() { + results.push(BString::from(key)); + } + results + } + + #[test] + fn dense_anywhere() { + let set = + Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) + .unwrap(); + let dfa = DenseDFA::new("ba.*").unwrap(); + let got = 
// Loads one TOML test file and adds its tests to the collection `$col`.
//
// `$path` is given relative to `../data/tests/` and must be a string literal,
// because it is spliced into `concat!`. `include_bytes!` embeds the file's
// contents into the test binary at compile time, so nothing is read from
// disk when the tests run. The path string passed as the first argument is
// used by `RegexTests::load` only to label a parse failure.
macro_rules! load {
    ($col:ident, $path:expr) => {
        $col.extend(RegexTests::load(
            concat!("../data/tests/", $path),
            include_bytes!(concat!("../data/tests/", $path)),
        ));
    };
}
{ + pub static ref SUITE: RegexTestCollection = { + let mut col = RegexTestCollection::new(); + load!(col, "fowler/basic.toml"); + load!(col, "fowler/nullsubexpr.toml"); + load!(col, "fowler/repetition.toml"); + load!(col, "fowler/repetition-long.toml"); + load!(col, "crazy.toml"); + load!(col, "flags.toml"); + load!(col, "iter.toml"); + load!(col, "no-unicode.toml"); + load!(col, "unicode.toml"); + col + }; +} + +#[derive(Clone, Debug)] +pub struct RegexTestCollection { + pub by_name: BTreeMap<String, RegexTest>, +} + +#[derive(Clone, Debug, Deserialize)] +pub struct RegexTests { + pub tests: Vec<RegexTest>, +} + +#[derive(Clone, Debug, Deserialize)] +pub struct RegexTest { + pub name: String, + #[serde(default)] + pub options: Vec<RegexTestOption>, + pub pattern: String, + #[serde(with = "serde_bytes")] + pub input: Vec<u8>, + #[serde(rename = "matches")] + pub matches: Vec<Match>, + #[serde(default)] + pub captures: Vec<Option<Match>>, + #[serde(default)] + pub fowler_line_number: Option<u64>, +} + +#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "kebab-case")] +pub enum RegexTestOption { + Anchored, + CaseInsensitive, + NoUnicode, + Escaped, + #[serde(rename = "invalid-utf8")] + InvalidUTF8, +} + +#[derive(Clone, Copy, Deserialize, Eq, PartialEq)] +pub struct Match { + pub start: usize, + pub end: usize, +} + +impl RegexTestCollection { + fn new() -> RegexTestCollection { + RegexTestCollection { by_name: BTreeMap::new() } + } + + fn extend(&mut self, tests: RegexTests) { + for test in tests.tests { + let name = test.name.clone(); + if self.by_name.contains_key(&name) { + panic!("found duplicate test {}", name); + } + self.by_name.insert(name, test); + } + } + + pub fn tests(&self) -> Vec<&RegexTest> { + self.by_name.values().collect() + } +} + +impl RegexTests { + fn load(path: &str, slice: &[u8]) -> RegexTests { + let mut data: RegexTests = toml::from_slice(slice) + .expect(&format!("failed to load {}", path)); + for test in 
&mut data.tests { + if test.options.contains(&RegexTestOption::Escaped) { + test.input = unescape_bytes(&test.input); + } + } + data + } +} + +#[derive(Debug)] +pub struct RegexTester { + asserted: bool, + results: RegexTestResults, + skip_expensive: bool, + whitelist: Vec<regex::Regex>, + blacklist: Vec<regex::Regex>, +} + +impl Drop for RegexTester { + fn drop(&mut self) { + // If we haven't asserted yet, then the test is probably buggy, so + // fail it. But if we're already panicking (e.g., a bug in the regex + // engine), then don't double-panic, which causes an immediate abort. + if !thread::panicking() && !self.asserted { + panic!("must call RegexTester::assert at end of test"); + } + } +} + +impl RegexTester { + pub fn new() -> RegexTester { + let mut tester = RegexTester { + asserted: false, + results: RegexTestResults::default(), + skip_expensive: false, + whitelist: vec![], + blacklist: vec![], + }; + for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") { + let x = x.trim(); + if x.is_empty() { + continue; + } + if x.starts_with("-") { + tester = tester.blacklist(&x[1..]); + } else { + tester = tester.whitelist(x); + } + } + tester + } + + pub fn skip_expensive(mut self) -> RegexTester { + self.skip_expensive = true; + self + } + + pub fn whitelist(mut self, name: &str) -> RegexTester { + self.whitelist.push(regex::Regex::new(name).unwrap()); + self + } + + pub fn blacklist(mut self, name: &str) -> RegexTester { + self.blacklist.push(regex::Regex::new(name).unwrap()); + self + } + + pub fn assert(&mut self) { + self.asserted = true; + self.results.assert(); + } + + pub fn build_regex<S: StateID>( + &self, + mut builder: RegexBuilder, + test: &RegexTest, + ) -> Option<Regex<DenseDFA<Vec<S>, S>>> { + if self.skip(test) { + return None; + } + self.apply_options(test, &mut builder); + + match builder.build_with_size::<S>(&test.pattern) { + Ok(re) => Some(re), + Err(err) => { + if let ErrorKind::Unsupported(_) = *err.kind() { + None + } else { 
+ panic!( + "failed to build {:?} with pattern '{:?}': {}", + test.name, test.pattern, err + ); + } + } + } + } + + pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I) + where + I: IntoIterator<IntoIter = T, Item = &'a RegexTest>, + T: Iterator<Item = &'a RegexTest>, + { + for test in tests { + let builder = builder.clone(); + let re: Regex = match self.build_regex(builder, test) { + None => continue, + Some(re) => re, + }; + self.test(test, &re); + } + } + + pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) { + self.test_is_match(test, re); + self.test_find(test, re); + // Some tests (namely, fowler) are designed only to detect the + // first match even if there are more subsequent matches. To that + // end, we only test match iteration when the number of matches + // expected is not 1, or if the test name has 'iter' in it. + if test.name.contains("iter") || test.matches.len() != 1 { + self.test_find_iter(test, re); + } + } + + pub fn test_is_match<'a, D: DFA>( + &mut self, + test: &RegexTest, + re: &Regex<D>, + ) { + self.asserted = false; + + let got = re.is_match(&test.input); + let expected = test.matches.len() >= 1; + if got == expected { + self.results.succeeded.push(test.clone()); + return; + } + self.results.failed.push(RegexTestFailure { + test: test.clone(), + kind: RegexTestFailureKind::IsMatch, + }); + } + + pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) { + self.asserted = false; + + let got = + re.find(&test.input).map(|(start, end)| Match { start, end }); + if got == test.matches.get(0).map(|&m| m) { + self.results.succeeded.push(test.clone()); + return; + } + self.results.failed.push(RegexTestFailure { + test: test.clone(), + kind: RegexTestFailureKind::Find { got }, + }); + } + + pub fn test_find_iter<'a, D: DFA>( + &mut self, + test: &RegexTest, + re: &Regex<D>, + ) { + self.asserted = false; + + let got: Vec<Match> = re + .find_iter(&test.input) + .map(|(start, end)| Match { start, 
/// The outcomes recorded by a `RegexTester`.
///
/// One entry is pushed per individual check (`test_is_match`, `test_find`,
/// `test_find_iter`), so the same test can appear more than once.
#[derive(Clone, Debug, Default)]
pub struct RegexTestResults {
    /// Tests that succeeded.
    pub succeeded: Vec<RegexTest>,
    /// Checks that failed, in the order they were recorded. Each entry keeps
    /// a copy of the test together with the kind of failure.
    pub failed: Vec<RegexTestFailure>,
}
/// Renders `bytes` for a failure message: verbatim when they are valid
/// UTF-8, otherwise in the escaped form produced by `escape_bytes`.
fn nice_raw_bytes(bytes: &[u8]) -> String {
    use std::str;

    str::from_utf8(bytes)
        .map(|text| text.to_owned())
        .unwrap_or_else(|_| escape_bytes(bytes))
}

/// Escapes every byte with `std::ascii::escape_default`, so the result is
/// always printable ASCII.
fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;

    let mut out = String::with_capacity(bytes.len());
    for &byte in bytes {
        // The escape sequence consists of ASCII bytes only, so each one
        // converts losslessly to a `char`.
        out.extend(ascii::escape_default(byte).map(char::from));
    }
    out
}

/// Renders every byte as `\xNN` with two upper-case hexadecimal digits.
fn hex_bytes(bytes: &[u8]) -> String {
    use std::fmt::Write;

    let mut out = String::with_capacity(4 * bytes.len());
    for byte in bytes {
        write!(out, r"\x{:02X}", byte).expect("writing to a String cannot fail");
    }
    out
}

/// Escapes every character of `s` with `char::escape_default`.
fn escape_default(s: &str) -> String {
    let mut out = String::new();
    for ch in s.chars() {
        out.extend(ch.escape_default());
    }
    out
}
/// Runs the suite against dense DFAs built with minimization,
/// premultiplication and byte classes all switched off.
#[test]
fn unminimized_standard() {
    let builder = {
        let mut b = RegexBuilder::new();
        b.minimize(false);
        b.premultiply(false);
        b.byte_classes(false);
        b
    };

    // `skip_expensive` leaves out the tests whose names start with
    // `repetition-long`.
    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
/// Runs the suite against minimized dense DFAs with premultiplication, byte
/// classes and the builder's `shrink` option all switched off.
// NOTE(review): going by the test name, `shrink` controls NFA shrinking;
// confirm against the builder's documentation.
#[test]
fn minimized_standard_no_nfa_shrink() {
    let builder = {
        let mut b = RegexBuilder::new();
        b.minimize(true);
        b.premultiply(false);
        b.byte_classes(false);
        b.shrink(false);
        b
    };

    // `skip_expensive` leaves out the tests whose names start with
    // `repetition-long`.
    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn u16() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        // `None` means the test was filtered out or its pattern is
        // unsupported, so there is nothing to convert.
        let built: Option<Regex> = tester.build_regex(builder.clone(), test);
        if let Some(re) = built {
            // Convert both directions to 16-bit state IDs and check that the
            // smaller regex still passes the same test.
            let fwd = re.forward().to_u16().unwrap();
            let rev = re.reverse().to_u16().unwrap();
            let small_re = Regex::from_dfas(fwd, rev);

            tester.test(test, &small_re);
        }
    }
    tester.assert();
}
#[test]
fn serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        // `None` means the test was filtered out or its pattern is
        // unsupported.
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };

        // Serialize both directions in this machine's byte order.
        let fwd_bytes = re.forward().to_bytes_native_endian().unwrap();
        let rev_bytes = re.reverse().to_bytes_native_endian().unwrap();
        // SAFETY: NOTE(review): the contract of `from_bytes` is not visible
        // in this chunk; confirm. The bytes were produced just above by
        // `to_bytes_native_endian` on the same machine, and are read back
        // with `usize` state IDs. The deserialized DFA uses `&[usize]`
        // storage, so it appears to borrow from `fwd_bytes`, which stays
        // alive for the rest of this iteration.
        let fwd: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&fwd_bytes) };
        // SAFETY: same reasoning as for `fwd`, with `rev_bytes`.
        let rev: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&rev_bytes) };
        // From here on `re` is the regex rebuilt from the deserialized DFAs.
        let re = Regex::from_dfas(fwd, rev);

        tester.test(test, &re);
    }
    tester.assert();
}
/// Where the unescaper is within a (possible) escape sequence.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
    /// The state after seeing a `\`.
    Escape,
    /// The state after seeing a `\x`.
    HexFirst,
    /// The state after seeing a `\x[0-9A-Fa-f]`.
    HexSecond(char),
    /// Default state.
    Literal,
}

/// Converts the escape sequences in `s` into raw bytes.
///
/// Recognised sequences are `\\`, `\n`, `\r`, `\t` and `\xNN` (two ASCII
/// hexadecimal digits, either case). Everything else is copied through as
/// UTF-8, including sequences that start like an escape but are not
/// complete or not recognised, such as `\q`, `\xZ`, or a trailing `\x4`.
pub fn unescape(s: &str) -> Vec<u8> {
    use self::State::*;

    /// Appends the UTF-8 encoding of `ch` to `out`.
    fn push_char(out: &mut Vec<u8>, ch: char) {
        let mut utf8 = [0u8; 4];
        out.extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
    }

    let mut out = Vec::new();
    let mut state = Literal;
    for ch in s.chars() {
        state = match (state, ch) {
            (Literal, '\\') => Escape,
            (Literal, _) => {
                push_char(&mut out, ch);
                Literal
            }
            (Escape, 'x') => HexFirst,
            (Escape, _) => {
                match ch {
                    '\\' => out.push(b'\\'),
                    'n' => out.push(b'\n'),
                    'r' => out.push(b'\r'),
                    't' => out.push(b'\t'),
                    _ => {
                        // Not an escape we know: keep the backslash.
                        out.push(b'\\');
                        push_char(&mut out, ch);
                    }
                }
                Literal
            }
            (HexFirst, _) if ch.to_digit(16).is_some() => HexSecond(ch),
            (HexFirst, _) => {
                out.extend_from_slice(b"\\x");
                push_char(&mut out, ch);
                Literal
            }
            (HexSecond(first), _) => {
                match (first.to_digit(16), ch.to_digit(16)) {
                    // Two hex digits always fit in a byte.
                    (Some(hi), Some(lo)) => out.push((hi * 16 + lo) as u8),
                    _ => {
                        out.extend_from_slice(b"\\x");
                        push_char(&mut out, first);
                        push_char(&mut out, ch);
                    }
                }
                Literal
            }
        };
    }

    // Emit whatever partial escape was pending when the input ended.
    match state {
        Literal => {}
        Escape => out.push(b'\\'),
        HexFirst => out.extend_from_slice(b"\\x"),
        HexSecond(first) => {
            out.extend_from_slice(b"\\x");
            push_char(&mut out, first);
        }
    }
    out
}