diff options
Diffstat (limited to '')
-rw-r--r-- | src/pcrepp/CMakeLists.txt | 16 | ||||
-rw-r--r-- | src/pcrepp/Makefile.am | 33 | ||||
-rw-r--r-- | src/pcrepp/pcre2pp.cc | 473 | ||||
-rw-r--r-- | src/pcrepp/pcre2pp.hh | 368 | ||||
-rw-r--r-- | src/pcrepp/test_pcre2pp.cc | 260 |
5 files changed, 1150 insertions, 0 deletions
diff --git a/src/pcrepp/CMakeLists.txt b/src/pcrepp/CMakeLists.txt new file mode 100644 index 0000000..1af8845 --- /dev/null +++ b/src/pcrepp/CMakeLists.txt @@ -0,0 +1,16 @@ +add_library(pcrepp STATIC + ../config.h.in + pcre2pp.hh + pcre2pp.cc) + +target_include_directories(pcrepp PUBLIC . .. ../third-party/scnlib/include + ${CMAKE_CURRENT_BINARY_DIR}/..) +target_link_libraries(pcrepp cppfmt pcre::libpcre pcre2::pcre2) + +add_executable(test_pcre2pp test_pcre2pp.cc) +target_include_directories( + test_pcre2pp + PUBLIC + ../third-party/doctest-root) +target_link_libraries(test_pcre2pp pcrepp) +add_test(NAME test_pcre2pp COMMAND test_pcre2pp) diff --git a/src/pcrepp/Makefile.am b/src/pcrepp/Makefile.am new file mode 100644 index 0000000..72e8319 --- /dev/null +++ b/src/pcrepp/Makefile.am @@ -0,0 +1,33 @@ + +include $(top_srcdir)/aminclude_static.am + +AM_CPPFLAGS = \ + $(CODE_COVERAGE_CPPFLAGS) \ + $(PCRE_CFLAGS) \ + -Wall \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/fmtlib \ + -I$(top_srcdir)/src/third-party/scnlib/include + +AM_LIBS = $(CODE_COVERAGE_LIBS) +AM_CFLAGS = $(CODE_COVERAGE_CFLAGS) +AM_CXXFLAGS = $(CODE_COVERAGE_CXXFLAGS) + +noinst_LIBRARIES = libpcrepp.a + +noinst_HEADERS = \ + pcre2pp.hh + +libpcrepp_a_SOURCES = \ + pcre2pp.cc + +test_pcre2pp_SOURCES = test_pcre2pp.cc +test_pcre2pp_LDADD = \ + libpcrepp.a \ + $(PCRE_LIBS) + +check_PROGRAMS = \ + test_pcre2pp + +TESTS = \ + test_pcre2pp diff --git a/src/pcrepp/pcre2pp.cc b/src/pcrepp/pcre2pp.cc new file mode 100644 index 0000000..c7429d1 --- /dev/null +++ b/src/pcrepp/pcre2pp.cc @@ -0,0 +1,473 @@ +/** + * Copyright (c) 2022, Timothy Stack + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of Timothy Stack nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @file pcrepp.cc + */ + +#include "pcre2pp.hh" + +#include "config.h" + +namespace lnav { +namespace pcre2pp { + +std::string +quote(const char* unquoted) +{ + std::string retval; + + for (int lpc = 0; unquoted[lpc]; lpc++) { + if (isalnum(unquoted[lpc]) || unquoted[lpc] == '_' + || unquoted[lpc] & 0x80) + { + retval.push_back(unquoted[lpc]); + } else { + retval.push_back('\\'); + retval.push_back(unquoted[lpc]); + } + } + + return retval; +} + +matcher +capture_builder::into(lnav::pcre2pp::match_data& md) && +{ + if (md.get_capacity() < this->mb_code.get_match_data_capacity()) { + md = this->mb_code.create_match_data(); + } + + return matcher{ + this->mb_code, + this->mb_input, + md, + }; +} + +match_data +code::create_match_data() const +{ + auto_mem<pcre2_match_data> md(pcre2_match_data_free); + + md = pcre2_match_data_create_from_pattern(this->p_code, nullptr); + + return match_data{std::move(md)}; +} + +Result<code, compile_error> +code::from(string_fragment sf, int options) +{ + compile_error ce; + auto_mem<pcre2_code> co(pcre2_code_free); + + options |= PCRE2_UTF; + co = pcre2_compile( + sf.udata(), sf.length(), options, &ce.ce_code, &ce.ce_offset, nullptr); + + if (co == nullptr) { + ce.ce_pattern = sf.to_string(); + return Err(ce); + } + + auto jit_rc = pcre2_jit_compile(co, PCRE2_JIT_COMPLETE); + if (jit_rc < 0) { + // log_error("failed to JIT compile pattern: %d", jit_rc); + } + + return Ok(code{std::move(co), sf.to_string()}); +} + +code::named_captures +code::get_named_captures() const +{ + named_captures retval; + + pcre2_pattern_info( + this->p_code.in(), PCRE2_INFO_NAMECOUNT, &retval.nc_count); + pcre2_pattern_info( + this->p_code.in(), PCRE2_INFO_NAMEENTRYSIZE, &retval.nc_entry_size); + pcre2_pattern_info( + this->p_code.in(), PCRE2_INFO_NAMETABLE, &retval.nc_name_table); + + return retval; +} + +size_t +code::match_partial(string_fragment in) const +{ + auto md = this->create_match_data(); + auto length = in.length(); + + do { + auto rc = pcre2_match(this->p_code.in(), + in.udata(), + length, + 0, + PCRE2_PARTIAL_HARD, + md.md_data.in(), + nullptr); + + if (rc == PCRE2_ERROR_PARTIAL) { + return md.md_ovector[1]; + } + + if (length > 0) { + length -= 1; + } + } while (length > 0); + + return 0; +} + +const char* +code::get_name_for_capture(size_t index) const +{ + for (const auto cap : this->get_named_captures()) { + if (cap.get_index() == index) { + return cap.get_name().data(); + } + } + + return nullptr; +} + +size_t +code::get_capture_count() const +{ + uint32_t retval; + + pcre2_pattern_info(this->p_code.in(), PCRE2_INFO_CAPTURECOUNT, &retval); + + return retval; +} + +std::vector<string_fragment> +code::get_captures() const +{ + bool in_class = false, in_escape = false, in_literal = false; + auto pat_frag = string_fragment::from_str(this->p_pattern); + std::vector<string_fragment> cap_in_progress; + std::vector<string_fragment> retval; + + for (int lpc = 0; this->p_pattern[lpc]; lpc++) { + if (in_escape) { + in_escape = false; + if (this->p_pattern[lpc] == 'Q') { + in_literal = true; + } + } else if (in_class) { + if (this->p_pattern[lpc] == ']') { + in_class = false; + } + if (this->p_pattern[lpc] == '\\') { + in_escape = true; + } + } else if (in_literal) { + if (this->p_pattern[lpc] == '\\' && this->p_pattern[lpc + 1] == 'E') + { + in_literal = false; + lpc += 1; + } + } else { + switch (this->p_pattern[lpc]) { + case '\\': + in_escape = true; + break; + case '[': + in_class = true; + break; + case '(': + cap_in_progress.emplace_back(pat_frag.sub_range(lpc, lpc)); + break; + case ')': { + if (!cap_in_progress.empty()) { + static const auto DEFINE_SF + = string_fragment::from_const("(?(DEFINE)"); + + auto& cap = cap_in_progress.back(); + char first = '\0', second = '\0', third = '\0'; + bool is_cap = false; + + cap.sf_end = lpc + 1; + if (cap.length() >= 2) { + first = this->p_pattern[cap.sf_begin + 1]; + } + if (cap.length() >= 3) { + second = this->p_pattern[cap.sf_begin + 2]; + } + if (cap.length() >= 4) { + third = this->p_pattern[cap.sf_begin + 3]; + } + if (cap.sf_begin >= 2) { + auto poss_define = string_fragment::from_str_range( + this->p_pattern, cap.sf_begin - 2, cap.sf_end); + if (poss_define == DEFINE_SF) { + cap_in_progress.pop_back(); + continue; + } + } + if (first == '?') { + if (second == '\'') { + is_cap = true; + } + if (second == '<' + && (isalpha(third) || third == '_')) + { + is_cap = true; + } + if (second == 'P' && third == '<') { + is_cap = true; + } + } else if (first != '*') { + is_cap = true; + } + if (is_cap) { + retval.emplace_back(cap); + } + cap_in_progress.pop_back(); + } + break; + } + } + } + } + + assert((size_t) this->get_capture_count() == retval.size()); + + return retval; +} + +std::string +code::replace(string_fragment str, const char* repl) const +{ + std::string retval; + std::string::size_type start = 0; + string_fragment remaining = str; + + auto md = this->create_match_data(); + while (remaining.is_valid()) { + auto find_res = this->capture_from(str) + .at(remaining) + .into(md) + .matches() + .ignore_error(); + if (!find_res) { + break; + } + auto all = find_res->f_all; + remaining = find_res->f_remaining; + bool in_escape = false; + + retval.append(str.data(), start, (all.sf_begin - start)); + start = all.sf_end; + for (int lpc = 0; repl[lpc]; lpc++) { + auto ch = repl[lpc]; + + if (in_escape) { + if (isdigit(ch)) { + auto capture_index = size_t(ch - '0'); + + if (capture_index < md.get_count()) { + auto cap = md[capture_index]; + if (cap) { + retval.append(cap->data(), cap->length()); + } + } else if (capture_index > this->get_capture_count()) { + retval.push_back('\\'); + retval.push_back(ch); + } + } else { + if (ch != '\\') { + retval.push_back('\\'); + } + retval.push_back(ch); + } + in_escape = false; + } else { + switch (ch) { + case '\\': + in_escape = true; + break; + default: + retval.push_back(ch); + break; + } + } + } + } + if (remaining.is_valid()) { + retval.append(str.data(), remaining.sf_begin, std::string::npos); + } + + return retval; +} + +int +code::name_index(const char* name) const +{ + return pcre2_substring_number_from_name(this->p_code.in(), + (PCRE2_SPTR) name); +} + +size_t +code::named_capture::get_index() const +{ + return (this->nc_entry[0] << 8) | (this->nc_entry[1] & 0xff); +} + +string_fragment +code::named_capture::get_name() const +{ + return string_fragment::from_bytes( + &this->nc_entry[2], strlen((const char*) &this->nc_entry[2])); +} + +code::named_capture +code::named_captures::iterator::operator*() const +{ + return code::named_capture{this->i_entry}; +} + +code::named_captures::iterator& +code::named_captures::iterator::operator++() +{ + this->i_entry += this->i_entry_size; + + return *this; +} + +bool +code::named_captures::iterator::operator==(const iterator& other) const +{ + return this->i_entry == other.i_entry + && this->i_entry_size == other.i_entry_size; +} + +bool +code::named_captures::iterator::operator!=(const iterator& other) const +{ + return this->i_entry != other.i_entry + || this->i_entry_size != other.i_entry_size; +} + +code::named_captures::iterator +code::named_captures::begin() const +{ + return iterator{this->nc_entry_size, this->nc_name_table}; +} + +code::named_captures::iterator +code::named_captures::end() const +{ + return iterator{ + this->nc_entry_size, + this->nc_name_table + (this->nc_count * this->nc_entry_size), + }; +} + +matcher::matches_result +matcher::matches(uint32_t options) +{ + this->mb_input.i_offset = this->mb_input.i_next_offset; + + if (this->mb_input.i_offset == -1) { + return not_found{}; + } + + auto rc = pcre2_match(this->mb_code.p_code.in(), + this->mb_input.i_string.udata(), + this->mb_input.i_string.length(), + this->mb_input.i_offset, + options, + this->mb_match_data.md_data.in(), + nullptr); + + if (rc > 0) { + this->mb_match_data.md_input = this->mb_input; + this->mb_match_data.md_code = &this->mb_code; + this->mb_match_data.md_capture_end = rc; + if (this->mb_match_data[0]->empty() + && this->mb_match_data[0]->sf_end >= this->mb_input.i_string.sf_end) + { + this->mb_input.i_next_offset = -1; + } else if (this->mb_match_data[0]->empty()) { + this->mb_input.i_next_offset + = this->mb_match_data.md_ovector[1] + 1; + } else { + this->mb_input.i_next_offset = this->mb_match_data.md_ovector[1]; + } + this->mb_match_data.md_input.i_next_offset + = this->mb_input.i_next_offset; + return found{ + this->mb_match_data[0].value(), + this->mb_match_data.remaining(), + }; + } + + this->mb_match_data.md_input = this->mb_input; + this->mb_match_data.md_ovector[0] = this->mb_input.i_offset; + this->mb_match_data.md_ovector[1] = this->mb_input.i_offset; + this->mb_match_data.md_capture_end = 1; + if (rc == PCRE2_ERROR_NOMATCH) { + return not_found{}; + } + + return error{&this->mb_code, rc}; +} + +void +matcher::matches_result::handle_error(matcher::error err) +{ + unsigned char buffer[1024]; + + pcre2_get_error_message(err.e_error_code, buffer, sizeof(buffer)); + // log_error("pcre2_match failure: %s", buffer); +} + +std::string +compile_error::get_message() const +{ + unsigned char buffer[1024]; + + pcre2_get_error_message(this->ce_code, buffer, sizeof(buffer)); + + return {(const char*) buffer}; +} + +std::string +matcher::error::get_message() +{ + unsigned char buffer[1024]; + + pcre2_get_error_message(this->e_error_code, buffer, sizeof(buffer)); + + return {(const char*) buffer}; +} + +} // namespace pcre2pp +} // namespace lnav diff --git a/src/pcrepp/pcre2pp.hh b/src/pcrepp/pcre2pp.hh new file mode 100644 index 0000000..59a2cf1 --- /dev/null +++ b/src/pcrepp/pcre2pp.hh @@ -0,0 +1,368 @@ +/** + * Copyright (c) 2022, Timothy Stack + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of Timothy Stack nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef lnav_pcre2pp_hh +#define lnav_pcre2pp_hh + +#define PCRE2_CODE_UNIT_WIDTH 8 + +#include <memory> +#include <string> +#include <vector> + +#include <pcre2.h> + +#include "base/auto_mem.hh" +#include "base/intern_string.hh" +#include "base/result.h" +#include "mapbox/variant.hpp" + +namespace lnav { +namespace pcre2pp { + +std::string quote(const char* unquoted); + +inline std::string +quote(const std::string& unquoted) +{ + return quote(unquoted.c_str()); +} + +class code; +struct capture_builder; +class matcher; + +struct input { + string_fragment i_string; + int i_offset{0}; + int i_next_offset{0}; +}; + +class match_data { +public: + static match_data unitialized() { return match_data{}; } + + string_fragment leading() const + { + return this->md_input.i_string.sub_range(this->md_input.i_offset, + this->md_ovector[0]); + } + + string_fragment remaining() const + { + if (this->md_capture_end == 0 || this->md_input.i_next_offset == -1) { + return string_fragment::invalid(); + } + + return string_fragment::from_byte_range( + this->md_input.i_string.sf_string, + this->md_input.i_string.sf_begin + this->md_input.i_next_offset, + this->md_input.i_string.sf_end); + } + + nonstd::optional<string_fragment> operator[](size_t index) const + { + if (index >= this->md_capture_end) { + return nonstd::nullopt; + } + + auto start = this->md_ovector[(index * 2)]; + auto stop = this->md_ovector[(index * 2) + 1]; + if (start == PCRE2_UNSET || stop == PCRE2_UNSET) { + return nonstd::nullopt; + } + + return this->md_input.i_string.sub_range(start, stop); + } + + template<typename T, std::size_t N> + nonstd::optional<string_fragment> operator[](const T (&name)[N]) const; + + size_t get_count() const { return this->md_capture_end; } + + uint32_t get_capacity() const { return this->md_ovector_count; } + +private: + friend matcher; + friend code; + + match_data() = default; + + explicit match_data(auto_mem<pcre2_match_data> dat) + : md_data(std::move(dat)), + md_ovector(pcre2_get_ovector_pointer(this->md_data.in())), + md_ovector_count(pcre2_get_ovector_count(this->md_data.in())) + { + } + + auto_mem<pcre2_match_data> md_data; + const code* md_code{nullptr}; + input md_input; + PCRE2_SIZE* md_ovector{nullptr}; + uint32_t md_ovector_count{0}; + size_t md_capture_end{0}; +}; + +class matcher { +public: + struct found { + string_fragment f_all; + string_fragment f_remaining; + }; + struct not_found {}; + struct error { + const code* e_code{nullptr}; + int e_error_code{0}; + std::string get_message(); + }; + + class matches_result + : public mapbox::util::variant<found, not_found, error> { + public: + using variant::variant; + + nonstd::optional<found> ignore_error() + { + return this->match( + [](found fo) { return nonstd::make_optional(fo); }, + [](not_found) { return nonstd::nullopt; }, + [](error err) { + handle_error(err); + return nonstd::nullopt; + }); + } + + private: + static void handle_error(error err); + }; + + matcher& reload_input(string_fragment sf, int next_offset) + { + this->mb_input = input{sf, next_offset, next_offset}; + + return *this; + } + + matches_result matches(uint32_t options = 0); + + int get_next_offset() const { return this->mb_input.i_next_offset; } + +private: + friend capture_builder; + + matcher(const code& co, input& in, match_data& md) + : mb_code(co), mb_input(in), mb_match_data(md) + { + } + + const code& mb_code; + input mb_input; + match_data& mb_match_data; +}; + +struct capture_builder { + const code& mb_code; + input mb_input; + + capture_builder at(const string_fragment& remaining) && + { + this->mb_input.i_offset = this->mb_input.i_next_offset + = remaining.sf_begin; + return *this; + } + + matcher into(match_data& md) &&; + + template<uint32_t Options = 0, typename F> + Result<string_fragment, matcher::error> for_each(F func) &&; +}; + +struct compile_error { + std::string ce_pattern; + int ce_code{0}; + size_t ce_offset{0}; + + std::string get_message() const; +}; + +class code { +public: + class named_capture { + public: + size_t get_index() const; + string_fragment get_name() const; + + PCRE2_SPTR nc_entry; + }; + + class named_captures { + public: + struct iterator { + named_capture operator*() const; + iterator& operator++(); + bool operator==(const iterator& other) const; + bool operator!=(const iterator& other) const; + + uint32_t i_entry_size; + PCRE2_SPTR i_entry; + }; + + iterator begin() const; + iterator end() const; + bool empty() const { return this->nc_count == 0; } + size_t size() const { return this->nc_count; } + + private: + friend code; + + named_captures() = default; + + uint32_t nc_count{0}; + uint32_t nc_entry_size{0}; + PCRE2_SPTR nc_name_table{nullptr}; + }; + + static Result<code, compile_error> from(string_fragment sf, + int options = 0); + + template<typename T, std::size_t N> + static code from_const(const T (&str)[N], int options = 0) + { + return from(string_fragment::from_const(str), options).unwrap(); + } + + const std::string& get_pattern() const { return this->p_pattern; } + + named_captures get_named_captures() const; + + const char* get_name_for_capture(size_t index) const; + + size_t get_capture_count() const; + + int name_index(const char* name) const; + + std::vector<string_fragment> get_captures() const; + + uint32_t get_match_data_capacity() const + { + return this->p_match_proto.md_ovector_count; + } + + match_data create_match_data() const; + + capture_builder capture_from(string_fragment in) const + { + return capture_builder{ + *this, + input{in}, + }; + } + + matcher::matches_result find_in(string_fragment in, + uint32_t options = 0) const + { + static thread_local match_data md = this->create_match_data(); + + if (md.md_ovector_count < this->p_match_proto.md_ovector_count) { + md = this->create_match_data(); + } + + return this->capture_from(in).into(md).matches(options); + } + + size_t match_partial(string_fragment in) const; + + std::string replace(string_fragment str, const char* repl) const; + + std::shared_ptr<code> to_shared() && + { + return std::make_shared<code>(std::move(this->p_code), + std::move(this->p_pattern)); + } + + code(auto_mem<pcre2_code> code, std::string pattern) + : p_code(std::move(code)), p_pattern(std::move(pattern)), + p_match_proto(this->create_match_data()) + { + } + +private: + friend matcher; + friend match_data; + + auto_mem<pcre2_code> p_code; + std::string p_pattern; + match_data p_match_proto; +}; + +template<typename T, std::size_t N> +nonstd::optional<string_fragment> +match_data::operator[](const T (&name)[N]) const +{ + auto index = pcre2_substring_number_from_name( + this->md_code->p_code.in(), + reinterpret_cast<const unsigned char*>(name)); + + return this->operator[](index); +} + +template<uint32_t Options, typename F> +Result<string_fragment, matcher::error> +capture_builder::for_each(F func) && +{ + auto md = this->mb_code.create_match_data(); + auto mat = matcher{this->mb_code, this->mb_input, md}; + + bool done = false; + matcher::error eret; + + while (!done) { + auto match_res = mat.matches(Options); + done = match_res.match( + [mat, &func](matcher::found) { + func(mat.mb_match_data); + return false; + }, + [](matcher::not_found) { return true; }, + [&eret](matcher::error err) { + eret = err; + return true; + }); + } + + if (eret.e_error_code == 0) { + return Ok(md.remaining()); + } + return Err(eret); +} + +} // namespace pcre2pp +} // namespace lnav + +#endif diff --git a/src/pcrepp/test_pcre2pp.cc b/src/pcrepp/test_pcre2pp.cc new file mode 100644 index 0000000..ce5b0c5 --- /dev/null +++ b/src/pcrepp/test_pcre2pp.cc @@ -0,0 +1,260 @@ +/** + * Copyright (c) 2022, Timothy Stack + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of Timothy Stack nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include "doctest/doctest.h" +#include "pcre2pp.hh" + +TEST_CASE("bad pattern") +{ + auto compile_res + = lnav::pcre2pp::code::from(string_fragment::from_const("[abc")); + + CHECK(compile_res.isErr()); + auto ce = compile_res.unwrapErr(); + CHECK(ce.ce_offset == 4); +} + +TEST_CASE("named captures") +{ + auto compile_res = lnav::pcre2pp::code::from( + string_fragment::from_const("(?<abc>a)(b)(?<def>c)")); + + CHECK(compile_res.isOk()); + + const std::vector<std::pair<size_t, string_fragment>> expected_caps = { + {1, string_fragment::from_const("abc")}, + {3, string_fragment::from_const("def")}, + }; + + int caps_index = 0; + auto co = compile_res.unwrap(); + for (const auto cap : co.get_named_captures()) { + const auto& expected_cap = expected_caps[caps_index]; + + CHECK(expected_cap.first == cap.get_index()); + CHECK(expected_cap.second == cap.get_name()); + caps_index += 1; + } +} + +TEST_CASE("match") +{ + static const char INPUT[] = "key1=1234;key2=5678;"; + + auto co + = lnav::pcre2pp::code::from_const(R"((?<key>\w+)=(?<value>[^;]+);)"); + + co.capture_from(string_fragment::from_const(INPUT)) + .for_each([](lnav::pcre2pp::match_data& md) { + printf("got '%s' %s = %s\n", + md[0]->to_string().c_str(), + md[1]->to_string().c_str(), + md[2]->to_string().c_str()); + }); +} + +TEST_CASE("partial") +{ + static const char INPUT[] = "key1=1234"; + + auto co = lnav::pcre2pp::code::from_const(R"([a-z]+=.*)"); + auto matched = co.match_partial(string_fragment::from_const(INPUT)); + CHECK(matched == 3); +} + +TEST_CASE("capture_name") +{ + auto co = lnav::pcre2pp::code::from_const("(?<abc>def)(ghi)"); + + CHECK(co.get_capture_count() == 2); + CHECK(string_fragment::from_c_str(co.get_name_for_capture(1)) == "abc"); + CHECK(co.get_name_for_capture(2) == nullptr); +} + +TEST_CASE("get_capture_count") +{ + auto co = lnav::pcre2pp::code::from_const("(DEFINE)"); + + CHECK(co.get_capture_count() == 1); +} + +TEST_CASE("get_captures") +{ + auto co = lnav::pcre2pp::code::from_const(R"((?<abc>\w+)-(def)-)"); + + CHECK(co.get_capture_count() == 2); + const auto& caps = co.get_captures(); + CHECK(caps.size() == 2); + CHECK(caps[0].to_string() == R"((?<abc>\w+))"); + CHECK(caps[1].to_string() == R"((def))"); +} + +TEST_CASE("replace") +{ + static const char INPUT[] = "test 1 2 3"; + + auto co = lnav::pcre2pp::code::from_const(R"(\w*)"); + auto in = string_fragment::from_const(INPUT); + + auto res = co.replace(in, R"({\0})"); + CHECK(res == "{test}{} {1}{} {2}{} {3}{}"); +} + +TEST_CASE("replace-empty") +{ + static const char INPUT[] = ""; + + auto co = lnav::pcre2pp::code::from_const(R"(\w*)"); + auto in = string_fragment::from_const(INPUT); + + auto res = co.replace(in, R"({\0})"); + CHECK(res == "{}"); +} + +TEST_CASE("for_each-all") +{ + static const char INPUT[] = "Hello, World!\n"; + + auto co = lnav::pcre2pp::code::from_const(R"(.*)"); + auto in = string_fragment::from_const(INPUT); + + co.capture_from(in).for_each([](lnav::pcre2pp::match_data& md) { + printf("range %d:%d\n", md[0]->sf_begin, md[0]->sf_end); + }); +} + +TEST_CASE("capture_count") +{ + auto co = lnav::pcre2pp::code::from_const(R"(^(\w+)=([^;]+);)"); + + CHECK(co.get_capture_count() == 2); +} + +TEST_CASE("no-caps") +{ + const static std::string empty_cap_regexes[] = { + "foo (?:bar)", + "foo [(]", + "foo \\Q(bar)\\E", + "(?i)", + }; + + for (auto re : empty_cap_regexes) { + auto co = lnav::pcre2pp::code::from(re).unwrap(); + + CHECK(co.get_captures().empty()); + } +} + +TEST_CASE("ipmatcher") +{ + auto co = lnav::pcre2pp::code::from_const( + R"((?(DEFINE)(?<byte>2[0-4]\d|25[0-5]|1\d\d|[1-9]?\d))\b(?&byte)(\.(?&byte)){3}\b)"); + auto inp = string_fragment::from_const("192.168.1.1"); + + auto find_res = co.find_in(inp).ignore_error(); + CHECK(find_res.has_value()); + CHECK(find_res->f_all.sf_begin == 0); +} + +TEST_CASE("get_captures-nested") +{ + auto re = lnav::pcre2pp::code::from_const("foo (bar (?:baz)?)"); + + CHECK(re.get_captures().size() == 1); + CHECK(re.get_captures()[0].sf_begin == 4); + CHECK(re.get_captures()[0].sf_end == 18); + CHECK(re.get_captures()[0].length() == 14); +} + +TEST_CASE("get_captures-basic") +{ + auto re = lnav::pcre2pp::code::from_const("(a)(b)(c)"); + + assert(re.get_captures().size() == 3); + assert(re.get_captures()[0].sf_begin == 0); + assert(re.get_captures()[0].sf_end == 3); + assert(re.get_captures()[1].sf_begin == 3); + assert(re.get_captures()[1].sf_end == 6); + assert(re.get_captures()[2].sf_begin == 6); + assert(re.get_captures()[2].sf_end == 9); +} + +TEST_CASE("get_captures-escape") +{ + auto re = lnav::pcre2pp::code::from_const("\\(a\\)(b)"); + + assert(re.get_captures().size() == 1); + assert(re.get_captures()[0].sf_begin == 5); + assert(re.get_captures()[0].sf_end == 8); +} + +TEST_CASE("get_captures-named") +{ + auto re = lnav::pcre2pp::code::from_const("(?<named>b)"); + + assert(re.get_captures().size() == 1); + assert(re.get_captures()[0].sf_begin == 0); + assert(re.get_captures()[0].sf_end == 11); +} + +TEST_CASE("get_captures-namedP") +{ + auto re = lnav::pcre2pp::code::from_const("(?P<named>b)"); + + assert(re.get_captures().size() == 1); + assert(re.get_captures()[0].sf_begin == 0); + assert(re.get_captures()[0].sf_end == 12); +} + +TEST_CASE("get_captures-namedq") +{ + auto re = lnav::pcre2pp::code::from_const("(?'named'b)"); + + CHECK(re.get_captures().size() == 1); + CHECK(re.get_captures()[0].sf_begin == 0); + CHECK(re.get_captures()[0].sf_end == 11); +} + +TEST_CASE("anchored") +{ + auto re = lnav::pcre2pp::code::from_const( + "abc", PCRE2_ANCHORED | PCRE2_ENDANCHORED); + + const auto sub1 = string_fragment::from_const("abc"); + const auto sub2 = string_fragment::from_const("abcd"); + const auto sub3 = string_fragment::from_const("0abc"); + + CHECK(re.find_in(sub1).ignore_error().has_value()); + CHECK_FALSE(re.find_in(sub2).ignore_error().has_value()); + CHECK_FALSE(re.find_in(sub3).ignore_error().has_value()); +} |