summary | refs | log | tree | commit | diff | stats
path: root/src/pcrepp
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/pcrepp/CMakeLists.txt 16
-rw-r--r-- src/pcrepp/Makefile.am 33
-rw-r--r-- src/pcrepp/pcre2pp.cc 473
-rw-r--r-- src/pcrepp/pcre2pp.hh 368
-rw-r--r-- src/pcrepp/test_pcre2pp.cc 260
5 files changed, 1150 insertions, 0 deletions
diff --git a/src/pcrepp/CMakeLists.txt b/src/pcrepp/CMakeLists.txt
new file mode 100644
index 0000000..1af8845
--- /dev/null
+++ b/src/pcrepp/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Static library built from the pcre2pp wrapper sources.
+add_library(pcrepp STATIC
+ ../config.h.in
+ pcre2pp.hh
+ pcre2pp.cc)
+
+# Headers are resolved relative to this directory, the parent source
+# directory, the bundled scnlib headers, and the parent of the binary
+# directory.
+target_include_directories(pcrepp PUBLIC . .. ../third-party/scnlib/include
+ ${CMAKE_CURRENT_BINARY_DIR}/..)
+# NOTE(review): both pcre::libpcre and pcre2::pcre2 are linked; confirm the
+# PCRE1 target is still required by this library.
+target_link_libraries(pcrepp cppfmt pcre::libpcre pcre2::pcre2)
+
+# Test executable for the wrapper; uses the doctest headers and is registered
+# through add_test().
+add_executable(test_pcre2pp test_pcre2pp.cc)
+target_include_directories(
+ test_pcre2pp
+ PUBLIC
+ ../third-party/doctest-root)
+target_link_libraries(test_pcre2pp pcrepp)
+add_test(NAME test_pcre2pp COMMAND test_pcre2pp)
diff --git a/src/pcrepp/Makefile.am b/src/pcrepp/Makefile.am
new file mode 100644
index 0000000..72e8319
--- /dev/null
+++ b/src/pcrepp/Makefile.am
@@ -0,0 +1,33 @@
+
+# Shared automake fragment from the top of the source tree.
+include $(top_srcdir)/aminclude_static.am
+
+# Preprocessor flags: coverage flags, PCRE flags, warnings, and the include
+# paths for the project sources, fmtlib, and scnlib.
+# NOTE(review): the flags come from PCRE_CFLAGS/PCRE_LIBS although the
+# sources include <pcre2.h>; confirm these variables point at PCRE2.
+AM_CPPFLAGS = \
+ $(CODE_COVERAGE_CPPFLAGS) \
+ $(PCRE_CFLAGS) \
+ -Wall \
+ -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/fmtlib \
+ -I$(top_srcdir)/src/third-party/scnlib/include
+
+AM_LIBS = $(CODE_COVERAGE_LIBS)
+AM_CFLAGS = $(CODE_COVERAGE_CFLAGS)
+AM_CXXFLAGS = $(CODE_COVERAGE_CXXFLAGS)
+
+# The wrapper is built as a non-installed static library.
+noinst_LIBRARIES = libpcrepp.a
+
+noinst_HEADERS = \
+ pcre2pp.hh
+
+libpcrepp_a_SOURCES = \
+ pcre2pp.cc
+
+# Test program; listed in check_PROGRAMS and TESTS so it is built and run by
+# "make check".
+test_pcre2pp_SOURCES = test_pcre2pp.cc
+test_pcre2pp_LDADD = \
+ libpcrepp.a \
+ $(PCRE_LIBS)
+
+check_PROGRAMS = \
+ test_pcre2pp
+
+TESTS = \
+ test_pcre2pp
diff --git a/src/pcrepp/pcre2pp.cc b/src/pcrepp/pcre2pp.cc
new file mode 100644
index 0000000..c7429d1
--- /dev/null
+++ b/src/pcrepp/pcre2pp.cc
@@ -0,0 +1,473 @@
+/**
+ * Copyright (c) 2022, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @file pcre2pp.cc
+ */
+
+#include "pcre2pp.hh"
+
+#include "config.h"
+
+namespace lnav {
+namespace pcre2pp {
+
+/**
+ * Escape a string so that it can be embedded in a regular expression and
+ * matched literally.
+ *
+ * Bytes that isalnum() accepts, underscores, and bytes with the high bit set
+ * are copied unchanged; every other byte is prefixed with a backslash.
+ *
+ * @param unquoted The NUL-terminated string to escape.
+ * @return The escaped copy of the input.
+ */
+std::string
+quote(const char* unquoted)
+{
+    std::string retval;
+
+    for (const char* cursor = unquoted; *cursor != '\0'; ++cursor) {
+        // The <cctype> classifiers have undefined behavior for negative
+        // values, so the byte is viewed as unsigned and the high-bit test is
+        // done first to keep non-ASCII bytes away from isalnum() entirely.
+        const auto byte = static_cast<unsigned char>(*cursor);
+        const bool is_plain
+            = (byte & 0x80) != 0 || byte == '_' || isalnum(byte);
+
+        if (!is_plain) {
+            retval.push_back('\\');
+        }
+        retval.push_back(*cursor);
+    }
+
+    return retval;
+}
+
+/**
+ * Bind this capture request to a caller-supplied match_data and return the
+ * matcher that will write its results there.
+ *
+ * The match_data is replaced with a freshly created one when its capacity is
+ * smaller than what the pattern reports it needs.
+ *
+ * @param md The match_data that will receive the results.
+ * @return A matcher over this builder's code and input that refers to md.
+ */
+matcher
+capture_builder::into(lnav::pcre2pp::match_data& md) &&
+{
+    const auto required = this->mb_code.get_match_data_capacity();
+
+    if (md.get_capacity() < required) {
+        md = this->mb_code.create_match_data();
+    }
+
+    return matcher{this->mb_code, this->mb_input, md};
+}
+
+/**
+ * Allocate a match_data block sized from this compiled pattern.
+ *
+ * @return A match_data that owns the newly created PCRE2 block.
+ */
+match_data
+code::create_match_data() const
+{
+    auto_mem<pcre2_match_data> block(pcre2_match_data_free);
+
+    block = pcre2_match_data_create_from_pattern(this->p_code, nullptr);
+    return match_data{std::move(block)};
+}
+
+/**
+ * Compile a pattern into a code object.
+ *
+ * PCRE2_UTF is always added to the supplied options.  After a successful
+ * compile, JIT compilation is attempted and its result is ignored.
+ *
+ * @param sf The pattern text.
+ * @param options PCRE2 compile options to combine with PCRE2_UTF.
+ * @return The compiled code, or a compile_error holding the PCRE2 error
+ *   code, the error offset, and a copy of the pattern.
+ */
+Result<code, compile_error>
+code::from(string_fragment sf, int options)
+{
+    compile_error failure;
+    auto_mem<pcre2_code> compiled(pcre2_code_free);
+
+    compiled = pcre2_compile(sf.udata(),
+                             sf.length(),
+                             options | PCRE2_UTF,
+                             &failure.ce_code,
+                             &failure.ce_offset,
+                             nullptr);
+    if (compiled == nullptr) {
+        failure.ce_pattern = sf.to_string();
+        return Err(failure);
+    }
+
+    // A JIT failure is not reported to the caller.
+    const auto jit_rc = pcre2_jit_compile(compiled, PCRE2_JIT_COMPLETE);
+    if (jit_rc < 0) {
+        // log_error("failed to JIT compile pattern: %d", jit_rc);
+    }
+
+    return Ok(code{std::move(compiled), sf.to_string()});
+}
+
+/**
+ * Query the compiled pattern for its name table.
+ *
+ * @return A named_captures view filled in with the entry count, the size of
+ *   each entry, and the pointer to the table.
+ */
+code::named_captures
+code::get_named_captures() const
+{
+    const auto compiled = this->p_code.in();
+    named_captures table;
+
+    pcre2_pattern_info(compiled, PCRE2_INFO_NAMECOUNT, &table.nc_count);
+    pcre2_pattern_info(
+        compiled, PCRE2_INFO_NAMEENTRYSIZE, &table.nc_entry_size);
+    pcre2_pattern_info(compiled, PCRE2_INFO_NAMETABLE, &table.nc_name_table);
+
+    return table;
+}
+
+/**
+ * Find a prefix of the input that is a partial match for this pattern.
+ *
+ * The pattern is run with PCRE2_PARTIAL_HARD against the input, and then
+ * against successively shorter prefixes (one unit shorter each time), until
+ * PCRE2 reports a partial match.
+ *
+ * @param in The text to test.
+ * @return The end offset of the partial match taken from the ovector, or
+ *   zero if no prefix produced a partial match.
+ */
+size_t
+code::match_partial(string_fragment in) const
+{
+    auto md = this->create_match_data();
+    auto prefix_len = in.length();
+
+    while (true) {
+        const auto rc = pcre2_match(this->p_code.in(),
+                                    in.udata(),
+                                    prefix_len,
+                                    0,
+                                    PCRE2_PARTIAL_HARD,
+                                    md.md_data.in(),
+                                    nullptr);
+
+        if (rc == PCRE2_ERROR_PARTIAL) {
+            return md.md_ovector[1];
+        }
+
+        // Shrink the prefix and stop once nothing is left to try.
+        if (prefix_len > 0) {
+            prefix_len -= 1;
+        }
+        if (!(prefix_len > 0)) {
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Look up the name of a capture group by its number.
+ *
+ * @param index The capture number to search for in the name table.
+ * @return A pointer to the name of the first table entry with that number,
+ *   or nullptr if no entry has it.
+ */
+const char*
+code::get_name_for_capture(size_t index) const
+{
+    const auto named = this->get_named_captures();
+
+    for (auto iter = named.begin(); iter != named.end(); ++iter) {
+        const auto entry = *iter;
+
+        if (entry.get_index() != index) {
+            continue;
+        }
+        return entry.get_name().data();
+    }
+
+    return nullptr;
+}
+
+/**
+ * Ask PCRE2 how many capture groups the compiled pattern contains.
+ *
+ * @return The value reported for PCRE2_INFO_CAPTURECOUNT, or zero if the
+ *   query fails.
+ */
+size_t
+code::get_capture_count() const
+{
+    // Start from zero so that a failed query cannot hand back an
+    // indeterminate value.
+    uint32_t retval = 0;
+
+    if (pcre2_pattern_info(
+            this->p_code.in(), PCRE2_INFO_CAPTURECOUNT, &retval)
+        != 0)
+    {
+        return 0;
+    }
+
+    return retval;
+}
+
+/**
+ * Scan the pattern text and collect the source span of each capturing
+ * group.
+ *
+ * The scan tracks three states: after a backslash, inside a character
+ * class, and inside a \Q...\E literal.  Parentheses seen in any of those
+ * states are ignored.  The result is asserted to have as many entries as
+ * get_capture_count() reports.
+ *
+ * @return The pattern fragments, from '(' through ')', of the groups this
+ *   scan classifies as capturing, in the order their closing parentheses
+ *   appear.
+ */
+std::vector<string_fragment>
+code::get_captures() const
+{
+ bool in_class = false, in_escape = false, in_literal = false;
+ auto pat_frag = string_fragment::from_str(this->p_pattern);
+ // Stack of groups that have been opened but not yet closed.
+ std::vector<string_fragment> cap_in_progress;
+ std::vector<string_fragment> retval;
+
+ for (int lpc = 0; this->p_pattern[lpc]; lpc++) {
+ if (in_escape) {
+ // The character after a backslash is consumed here; only \Q changes
+ // the scanner state.
+ in_escape = false;
+ if (this->p_pattern[lpc] == 'Q') {
+ in_literal = true;
+ }
+ } else if (in_class) {
+ if (this->p_pattern[lpc] == ']') {
+ in_class = false;
+ }
+ if (this->p_pattern[lpc] == '\\') {
+ in_escape = true;
+ }
+ } else if (in_literal) {
+ // Everything up to the closing \E is literal text.
+ if (this->p_pattern[lpc] == '\\' && this->p_pattern[lpc + 1] == 'E')
+ {
+ in_literal = false;
+ lpc += 1;
+ }
+ } else {
+ switch (this->p_pattern[lpc]) {
+ case '\\':
+ in_escape = true;
+ break;
+ case '[':
+ in_class = true;
+ break;
+ case '(':
+ // Remember where the group starts; its end is filled in when the
+ // matching ')' is reached.
+ cap_in_progress.emplace_back(pat_frag.sub_range(lpc, lpc));
+ break;
+ case ')': {
+ if (!cap_in_progress.empty()) {
+ static const auto DEFINE_SF
+ = string_fragment::from_const("(?(DEFINE)");
+
+ auto& cap = cap_in_progress.back();
+ // Up to three characters following the opening '(' decide what
+ // kind of group this is.
+ char first = '\0', second = '\0', third = '\0';
+ bool is_cap = false;
+
+ cap.sf_end = lpc + 1;
+ if (cap.length() >= 2) {
+ first = this->p_pattern[cap.sf_begin + 1];
+ }
+ if (cap.length() >= 3) {
+ second = this->p_pattern[cap.sf_begin + 2];
+ }
+ if (cap.length() >= 4) {
+ third = this->p_pattern[cap.sf_begin + 3];
+ }
+ // A "(DEFINE)" preceded by "(?" is the condition of a
+ // "(?(DEFINE)...)" group and is not counted.
+ if (cap.sf_begin >= 2) {
+ auto poss_define = string_fragment::from_str_range(
+ this->p_pattern, cap.sf_begin - 2, cap.sf_end);
+ if (poss_define == DEFINE_SF) {
+ cap_in_progress.pop_back();
+ // Skips the rest of this loop iteration, not just the switch.
+ continue;
+ }
+ }
+ if (first == '?') {
+ // Only the named forms (?'name', (?<name and (?P<name count as
+ // captures among the "(?" groups.
+ if (second == '\'') {
+ is_cap = true;
+ }
+ if (second == '<'
+ && (isalpha(third) || third == '_'))
+ {
+ is_cap = true;
+ }
+ if (second == 'P' && third == '<') {
+ is_cap = true;
+ }
+ } else if (first != '*') {
+ // Groups that start with "(*" are not counted.
+ is_cap = true;
+ }
+ if (is_cap) {
+ retval.emplace_back(cap);
+ }
+ cap_in_progress.pop_back();
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ assert((size_t) this->get_capture_count() == retval.size());
+
+ return retval;
+}
+
+/**
+ * Replace every match of this pattern in the input with a replacement
+ * template.
+ *
+ * In the template, a backslash followed by a single digit is replaced by
+ * the text of that capture, "\\" produces one backslash, and a backslash
+ * followed by any other character is copied through unchanged.
+ *
+ * NOTE(review): match offsets are used directly as offsets from
+ * str.data(), and str.data() is handed to std::string::append() overloads
+ * that take a position.  This presumably requires the fragment to start at
+ * offset zero of a NUL-terminated buffer; confirm against string_fragment.
+ *
+ * @param str The text to search.
+ * @param repl The NUL-terminated replacement template.
+ * @return The text with all matches replaced.
+ */
+std::string
+code::replace(string_fragment str, const char* repl) const
+{
+ std::string retval;
+ // Offset of the first input byte that has not been copied to retval yet.
+ std::string::size_type start = 0;
+ string_fragment remaining = str;
+
+ auto md = this->create_match_data();
+ while (remaining.is_valid()) {
+ auto find_res = this->capture_from(str)
+ .at(remaining)
+ .into(md)
+ .matches()
+ .ignore_error();
+ if (!find_res) {
+ break;
+ }
+ auto all = find_res->f_all;
+ remaining = find_res->f_remaining;
+ bool in_escape = false;
+
+ // Copy the text between the previous match and this one.
+ retval.append(str.data(), start, (all.sf_begin - start));
+ start = all.sf_end;
+ for (int lpc = 0; repl[lpc]; lpc++) {
+ auto ch = repl[lpc];
+
+ if (in_escape) {
+ if (isdigit(ch)) {
+ auto capture_index = size_t(ch - '0');
+
+ if (capture_index < md.get_count()) {
+ auto cap = md[capture_index];
+ if (cap) {
+ retval.append(cap->data(), cap->length());
+ }
+ } else if (capture_index > this->get_capture_count()) {
+ // The pattern has no such group, so the escape is emitted
+ // verbatim.
+ retval.push_back('\\');
+ retval.push_back(ch);
+ }
+ } else {
+ if (ch != '\\') {
+ retval.push_back('\\');
+ }
+ retval.push_back(ch);
+ }
+ in_escape = false;
+ } else {
+ switch (ch) {
+ case '\\':
+ in_escape = true;
+ break;
+ default:
+ retval.push_back(ch);
+ break;
+ }
+ }
+ }
+ }
+ // Copy whatever follows the last match.
+ if (remaining.is_valid()) {
+ retval.append(str.data(), remaining.sf_begin, std::string::npos);
+ }
+
+ return retval;
+}
+
+/**
+ * Translate a capture name into its group number.
+ *
+ * @param name The NUL-terminated capture name.
+ * @return The value returned by pcre2_substring_number_from_name().
+ */
+int
+code::name_index(const char* name) const
+{
+    const auto* raw_name = reinterpret_cast<PCRE2_SPTR>(name);
+
+    return pcre2_substring_number_from_name(this->p_code.in(), raw_name);
+}
+
+/**
+ * Decode the capture number stored in the first two bytes of the entry,
+ * most significant byte first.
+ *
+ * @return The capture number for this name-table entry.
+ */
+size_t
+code::named_capture::get_index() const
+{
+    const size_t high = this->nc_entry[0];
+    const size_t low = this->nc_entry[1];
+
+    return (high << 8) | (low & 0xff);
+}
+
+/**
+ * Get the name stored in this entry, which starts at the third byte and is
+ * measured with strlen().
+ *
+ * @return A fragment that refers to the name bytes inside the entry.
+ */
+string_fragment
+code::named_capture::get_name() const
+{
+    const auto* name_start = &this->nc_entry[2];
+    const auto name_len = strlen(reinterpret_cast<const char*>(name_start));
+
+    return string_fragment::from_bytes(name_start, name_len);
+}
+
+/** @return A named_capture that refers to the entry under the iterator. */
+code::named_capture
+code::named_captures::iterator::operator*() const
+{
+    code::named_capture entry{this->i_entry};
+
+    return entry;
+}
+
+/** Step forward by one entry size. @return This iterator. */
+code::named_captures::iterator&
+code::named_captures::iterator::operator++()
+{
+    this->i_entry = this->i_entry + this->i_entry_size;
+    return *this;
+}
+
+/** @return True if both the entry pointer and the entry size are equal. */
+bool
+code::named_captures::iterator::operator==(const iterator& other) const
+{
+    if (this->i_entry != other.i_entry) {
+        return false;
+    }
+
+    return this->i_entry_size == other.i_entry_size;
+}
+
+/** @return True if the entry pointer or the entry size differs. */
+bool
+code::named_captures::iterator::operator!=(const iterator& other) const
+{
+    const bool same_entry = this->i_entry == other.i_entry;
+    const bool same_size = this->i_entry_size == other.i_entry_size;
+
+    return !(same_entry && same_size);
+}
+
+/** @return An iterator positioned at the start of the name table. */
+code::named_captures::iterator
+code::named_captures::begin() const
+{
+    iterator first{this->nc_entry_size, this->nc_name_table};
+
+    return first;
+}
+
+/** @return An iterator positioned just past the last name-table entry. */
+code::named_captures::iterator
+code::named_captures::end() const
+{
+    const auto table_bytes = this->nc_count * this->nc_entry_size;
+
+    return iterator{this->nc_entry_size, this->nc_name_table + table_bytes};
+}
+
+/**
+ * Run the pattern once, starting at the input's next offset, and record
+ * the outcome in the bound match_data.
+ *
+ * On success the next offset is moved to the end of the match so that a
+ * following call continues from there.
+ *
+ * @param options Options passed through to pcre2_match().
+ * @return found with the whole match and the remaining input, not_found
+ *   if there was no match or the input is exhausted, or error for any
+ *   other pcre2_match() result.
+ */
+matcher::matches_result
+matcher::matches(uint32_t options)
+{
+ this->mb_input.i_offset = this->mb_input.i_next_offset;
+
+ // -1 is the marker stored below after an empty match at the end of the
+ // input.
+ if (this->mb_input.i_offset == -1) {
+ return not_found{};
+ }
+
+ auto rc = pcre2_match(this->mb_code.p_code.in(),
+ this->mb_input.i_string.udata(),
+ this->mb_input.i_string.length(),
+ this->mb_input.i_offset,
+ options,
+ this->mb_match_data.md_data.in(),
+ nullptr);
+
+ if (rc > 0) {
+ this->mb_match_data.md_input = this->mb_input;
+ this->mb_match_data.md_code = &this->mb_code;
+ this->mb_match_data.md_capture_end = rc;
+ if (this->mb_match_data[0]->empty()
+ && this->mb_match_data[0]->sf_end >= this->mb_input.i_string.sf_end)
+ {
+ // Empty match at the end of the input: nothing more to search.
+ this->mb_input.i_next_offset = -1;
+ } else if (this->mb_match_data[0]->empty()) {
+ // Step past an empty match so the next call makes progress.
+ // NOTE(review): this advances by one code unit; since code::from()
+ // always sets PCRE2_UTF, confirm this cannot land inside a multi-byte
+ // character.
+ this->mb_input.i_next_offset
+ = this->mb_match_data.md_ovector[1] + 1;
+ } else {
+ this->mb_input.i_next_offset = this->mb_match_data.md_ovector[1];
+ }
+ this->mb_match_data.md_input.i_next_offset
+ = this->mb_input.i_next_offset;
+ return found{
+ this->mb_match_data[0].value(),
+ this->mb_match_data.remaining(),
+ };
+ }
+
+ // No match or an error: record an empty range at the starting offset as
+ // capture zero.
+ this->mb_match_data.md_input = this->mb_input;
+ this->mb_match_data.md_ovector[0] = this->mb_input.i_offset;
+ this->mb_match_data.md_ovector[1] = this->mb_input.i_offset;
+ this->mb_match_data.md_capture_end = 1;
+ if (rc == PCRE2_ERROR_NOMATCH) {
+ return not_found{};
+ }
+
+ return error{&this->mb_code, rc};
+}
+
+/**
+ * Hook called by ignore_error() when a match fails with an error.
+ *
+ * The error text is fetched from PCRE2, but the statement that would log it
+ * is commented out, so nothing is reported.
+ *
+ * @param err The error produced by matcher::matches().
+ */
+void
+matcher::matches_result::handle_error(matcher::error err)
+{
+    unsigned char message[1024];
+
+    pcre2_get_error_message(err.e_error_code, message, sizeof(message));
+    // log_error("pcre2_match failure: %s", message);
+}
+
+/**
+ * Get the PCRE2 text for this compile error.
+ *
+ * @return The message for ce_code, or a fallback that names the code when
+ *   PCRE2 does not recognize it.
+ */
+std::string
+compile_error::get_message() const
+{
+    // Zero-filled so the buffer is a valid C string even if PCRE2 does not
+    // write to it.
+    unsigned char buffer[1024] = {0};
+
+    const auto rc
+        = pcre2_get_error_message(this->ce_code, buffer, sizeof(buffer));
+    if (rc == PCRE2_ERROR_BADDATA) {
+        // The code is not a known PCRE2 error (for example, the default
+        // value of zero), so there is no message to return.
+        return "unknown PCRE2 error code " + std::to_string(this->ce_code);
+    }
+
+    return {reinterpret_cast<const char*>(buffer)};
+}
+
+/**
+ * Get the PCRE2 text for this match error.
+ *
+ * @return The message for e_error_code, or a fallback that names the code
+ *   when PCRE2 does not recognize it.
+ */
+std::string
+matcher::error::get_message()
+{
+    // Zero-filled so the buffer is a valid C string even if PCRE2 does not
+    // write to it.
+    unsigned char buffer[1024] = {0};
+
+    const auto rc = pcre2_get_error_message(
+        this->e_error_code, buffer, sizeof(buffer));
+    if (rc == PCRE2_ERROR_BADDATA) {
+        // The code is not a known PCRE2 error (for example, the default
+        // value of zero), so there is no message to return.
+        return "unknown PCRE2 error code "
+            + std::to_string(this->e_error_code);
+    }
+
+    return {reinterpret_cast<const char*>(buffer)};
+}
+
+} // namespace pcre2pp
+} // namespace lnav
diff --git a/src/pcrepp/pcre2pp.hh b/src/pcrepp/pcre2pp.hh
new file mode 100644
index 0000000..59a2cf1
--- /dev/null
+++ b/src/pcrepp/pcre2pp.hh
@@ -0,0 +1,368 @@
+/**
+ * Copyright (c) 2022, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef lnav_pcre2pp_hh
+#define lnav_pcre2pp_hh
+
+#define PCRE2_CODE_UNIT_WIDTH 8
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <pcre2.h>
+
+#include "base/auto_mem.hh"
+#include "base/intern_string.hh"
+#include "base/result.h"
+#include "mapbox/variant.hpp"
+
+namespace lnav {
+namespace pcre2pp {
+
+std::string quote(const char* unquoted);
+
+/**
+ * Convenience overload that escapes the C-string view of a std::string.
+ *
+ * @param unquoted The string to escape.
+ * @return The result of quote() on unquoted.c_str().
+ */
+inline std::string
+quote(const std::string& unquoted)
+{
+    const char* raw = unquoted.c_str();
+
+    return quote(raw);
+}
+
+class code;
+struct capture_builder;
+class matcher;
+
+/**
+ * The text being searched along with the search position.
+ */
+struct input {
+ // The text handed to pcre2_match().
+ string_fragment i_string;
+ // Offset at which the most recent match attempt started.
+ int i_offset{0};
+ // Offset at which the next attempt will start; matcher::matches() stores
+ // -1 here after an empty match at the end of the input.
+ int i_next_offset{0};
+};
+
+/**
+ * Holds a PCRE2 match-data block and gives access to the captures that the
+ * last match stored in it.
+ */
+class match_data {
+public:
+ // Creates a match_data with no PCRE2 block attached (capacity zero).
+ // NOTE(review): the name is misspelled ("unitialized") but is part of the
+ // public interface.
+ static match_data unitialized() { return match_data{}; }
+
+ // The input between the offset where the search started and the start of
+ // the match.
+ string_fragment leading() const
+ {
+ return this->md_input.i_string.sub_range(this->md_input.i_offset,
+ this->md_ovector[0]);
+ }
+
+ // The input from the next search offset to the end, or an invalid
+ // fragment if nothing has been recorded or the input is exhausted.
+ string_fragment remaining() const
+ {
+ if (this->md_capture_end == 0 || this->md_input.i_next_offset == -1) {
+ return string_fragment::invalid();
+ }
+
+ return string_fragment::from_byte_range(
+ this->md_input.i_string.sf_string,
+ this->md_input.i_string.sf_begin + this->md_input.i_next_offset,
+ this->md_input.i_string.sf_end);
+ }
+
+ // The capture with the given number, or nullopt if the number is out of
+ // range or the group is unset.
+ nonstd::optional<string_fragment> operator[](size_t index) const
+ {
+ if (index >= this->md_capture_end) {
+ return nonstd::nullopt;
+ }
+
+ // Each capture occupies a start/stop pair in the ovector.
+ auto start = this->md_ovector[(index * 2)];
+ auto stop = this->md_ovector[(index * 2) + 1];
+ if (start == PCRE2_UNSET || stop == PCRE2_UNSET) {
+ return nonstd::nullopt;
+ }
+
+ return this->md_input.i_string.sub_range(start, stop);
+ }
+
+ // Looks up a capture by name given as a string literal.
+ template<typename T, std::size_t N>
+ nonstd::optional<string_fragment> operator[](const T (&name)[N]) const;
+
+ // Number of captures recorded by the last match attempt.
+ size_t get_count() const { return this->md_capture_end; }
+
+ // Number of ovector pairs in the attached PCRE2 block.
+ uint32_t get_capacity() const { return this->md_ovector_count; }
+
+private:
+ friend matcher;
+ friend code;
+
+ match_data() = default;
+
+ // Takes ownership of the block and caches its ovector pointer and count.
+ // md_data is declared before the cached members, so it is initialized
+ // before they read from it.
+ explicit match_data(auto_mem<pcre2_match_data> dat)
+ : md_data(std::move(dat)),
+ md_ovector(pcre2_get_ovector_pointer(this->md_data.in())),
+ md_ovector_count(pcre2_get_ovector_count(this->md_data.in()))
+ {
+ }
+
+ auto_mem<pcre2_match_data> md_data;
+ // The pattern that produced the current results; set by
+ // matcher::matches() on a successful match.
+ const code* md_code{nullptr};
+ input md_input;
+ PCRE2_SIZE* md_ovector{nullptr};
+ uint32_t md_ovector_count{0};
+ size_t md_capture_end{0};
+};
+
+/**
+ * Runs a compiled pattern over an input and stores the results in a
+ * match_data that it holds by reference.
+ */
+class matcher {
+public:
+ // A successful match: the whole matched text and the input after it.
+ struct found {
+ string_fragment f_all;
+ string_fragment f_remaining;
+ };
+ // The pattern did not match.
+ struct not_found {};
+ // pcre2_match() returned something other than a match or "no match".
+ struct error {
+ const code* e_code{nullptr};
+ int e_error_code{0};
+ std::string get_message();
+ };
+
+ // The outcome of matches(): exactly one of found, not_found, or error.
+ class matches_result
+ : public mapbox::util::variant<found, not_found, error> {
+ public:
+ using variant::variant;
+
+ // Collapses the result to an optional: errors are passed to
+ // handle_error() and then treated like "not found".
+ nonstd::optional<found> ignore_error()
+ {
+ return this->match(
+ [](found fo) { return nonstd::make_optional(fo); },
+ [](not_found) { return nonstd::nullopt; },
+ [](error err) {
+ handle_error(err);
+ return nonstd::nullopt;
+ });
+ }
+
+ private:
+ static void handle_error(error err);
+ };
+
+ // Replaces the input and sets both the current and next offsets to
+ // next_offset.
+ matcher& reload_input(string_fragment sf, int next_offset)
+ {
+ this->mb_input = input{sf, next_offset, next_offset};
+
+ return *this;
+ }
+
+ matches_result matches(uint32_t options = 0);
+
+ int get_next_offset() const { return this->mb_input.i_next_offset; }
+
+private:
+ friend capture_builder;
+
+ // Only capture_builder creates matchers.  The input is copied; the code
+ // and match_data are kept as references and must outlive the matcher.
+ matcher(const code& co, input& in, match_data& md)
+ : mb_code(co), mb_input(in), mb_match_data(md)
+ {
+ }
+
+ const code& mb_code;
+ input mb_input;
+ match_data& mb_match_data;
+};
+
+/**
+ * Intermediate object returned by code::capture_from() that collects the
+ * input and start position before a matcher is created.
+ */
+struct capture_builder {
+ const code& mb_code;
+ input mb_input;
+
+ // Start the search at the beginning offset of the given fragment.
+ capture_builder at(const string_fragment& remaining) &&
+ {
+ this->mb_input.i_offset = this->mb_input.i_next_offset
+ = remaining.sf_begin;
+ return *this;
+ }
+
+ // Create a matcher that stores its results in md.
+ matcher into(match_data& md) &&;
+
+ // Call func with the match_data for every match in the input.
+ template<uint32_t Options = 0, typename F>
+ Result<string_fragment, matcher::error> for_each(F func) &&;
+};
+
+/**
+ * Describes why a pattern failed to compile.
+ */
+struct compile_error {
+ // Copy of the pattern that was being compiled.
+ std::string ce_pattern;
+ // Error code written by pcre2_compile().
+ int ce_code{0};
+ // Error offset written by pcre2_compile().
+ size_t ce_offset{0};
+
+ // The PCRE2 message text for ce_code.
+ std::string get_message() const;
+};
+
+/**
+ * A compiled PCRE2 pattern together with the text it was compiled from.
+ */
+class code {
+public:
+ // A single entry of the pattern's name table.
+ class named_capture {
+ public:
+ size_t get_index() const;
+ string_fragment get_name() const;
+
+ PCRE2_SPTR nc_entry;
+ };
+
+ // An iterable view over the pattern's name table.
+ class named_captures {
+ public:
+ struct iterator {
+ named_capture operator*() const;
+ iterator& operator++();
+ bool operator==(const iterator& other) const;
+ bool operator!=(const iterator& other) const;
+
+ uint32_t i_entry_size;
+ PCRE2_SPTR i_entry;
+ };
+
+ iterator begin() const;
+ iterator end() const;
+ bool empty() const { return this->nc_count == 0; }
+ size_t size() const { return this->nc_count; }
+
+ private:
+ friend code;
+
+ named_captures() = default;
+
+ uint32_t nc_count{0};
+ uint32_t nc_entry_size{0};
+ PCRE2_SPTR nc_name_table{nullptr};
+ };
+
+ // Compile a pattern, returning either the code or a compile_error.
+ static Result<code, compile_error> from(string_fragment sf,
+ int options = 0);
+
+ // Compile a pattern given as a string literal; the result is unwrapped
+ // without checking for a compile error.
+ template<typename T, std::size_t N>
+ static code from_const(const T (&str)[N], int options = 0)
+ {
+ return from(string_fragment::from_const(str), options).unwrap();
+ }
+
+ const std::string& get_pattern() const { return this->p_pattern; }
+
+ named_captures get_named_captures() const;
+
+ const char* get_name_for_capture(size_t index) const;
+
+ size_t get_capture_count() const;
+
+ int name_index(const char* name) const;
+
+ std::vector<string_fragment> get_captures() const;
+
+ // Ovector size of the match_data created for this pattern at
+ // construction time.
+ uint32_t get_match_data_capacity() const
+ {
+ return this->p_match_proto.md_ovector_count;
+ }
+
+ match_data create_match_data() const;
+
+ // Begin a search over the given input.
+ capture_builder capture_from(string_fragment in) const
+ {
+ return capture_builder{
+ *this,
+ input{in},
+ };
+ }
+
+ // Run a single match against the input.
+ matcher::matches_result find_in(string_fragment in,
+ uint32_t options = 0) const
+ {
+ // One match_data per thread, shared by every code object that calls
+ // this function; it is recreated when this pattern needs more capacity.
+ static thread_local match_data md = this->create_match_data();
+
+ if (md.md_ovector_count < this->p_match_proto.md_ovector_count) {
+ md = this->create_match_data();
+ }
+
+ return this->capture_from(in).into(md).matches(options);
+ }
+
+ size_t match_partial(string_fragment in) const;
+
+ std::string replace(string_fragment str, const char* repl) const;
+
+ // Move the compiled pattern and its text into a shared_ptr-owned code.
+ std::shared_ptr<code> to_shared() &&
+ {
+ return std::make_shared<code>(std::move(this->p_code),
+ std::move(this->p_pattern));
+ }
+
+ // p_match_proto is declared last, so p_code is already set when
+ // create_match_data() runs here.
+ code(auto_mem<pcre2_code> code, std::string pattern)
+ : p_code(std::move(code)), p_pattern(std::move(pattern)),
+ p_match_proto(this->create_match_data())
+ {
+ }
+
+private:
+ friend matcher;
+ friend match_data;
+
+ auto_mem<pcre2_code> p_code;
+ std::string p_pattern;
+ match_data p_match_proto;
+};
+
+/**
+ * Look up a capture by the name of its group.
+ *
+ * @param name The capture name as a string literal.
+ * @return The captured text, or nullopt if no pattern has been associated
+ *   with this match_data yet, the name is not known to the pattern, or the
+ *   group is unset or out of range.
+ */
+template<typename T, std::size_t N>
+nonstd::optional<string_fragment>
+match_data::operator[](const T (&name)[N]) const
+{
+    // md_code is only assigned once a match has succeeded.
+    if (this->md_code == nullptr) {
+        return nonstd::nullopt;
+    }
+
+    const auto index = pcre2_substring_number_from_name(
+        this->md_code->p_code.in(),
+        reinterpret_cast<const unsigned char*>(name));
+    // A negative value is a PCRE2 error code, not a capture number.
+    if (index < 0) {
+        return nonstd::nullopt;
+    }
+
+    return this->operator[](static_cast<size_t>(index));
+}
+
+/**
+ * Call a function for every match of the pattern in the input.
+ *
+ * Matching repeats until the pattern is not found or an error occurs.
+ *
+ * @tparam Options Options passed to each matcher::matches() call.
+ * @param func Callable invoked with the match_data after each match.
+ * @return The input remaining after the last attempt, or the error that
+ *   stopped the iteration.
+ */
+template<uint32_t Options, typename F>
+Result<string_fragment, matcher::error>
+capture_builder::for_each(F func) &&
+{
+    auto md = this->mb_code.create_match_data();
+    auto mat = matcher{this->mb_code, this->mb_input, md};
+
+    bool done = false;
+    matcher::error eret;
+
+    while (!done) {
+        auto match_res = mat.matches(Options);
+        done = match_res.match(
+            // The matcher is captured by reference so it is not copied on
+            // every iteration.
+            [&mat, &func](matcher::found) {
+                func(mat.mb_match_data);
+                return false;
+            },
+            [](matcher::not_found) { return true; },
+            [&eret](matcher::error err) {
+                eret = err;
+                return true;
+            });
+    }
+
+    if (eret.e_error_code == 0) {
+        return Ok(md.remaining());
+    }
+    return Err(eret);
+}
+
+} // namespace pcre2pp
+} // namespace lnav
+
+#endif
diff --git a/src/pcrepp/test_pcre2pp.cc b/src/pcrepp/test_pcre2pp.cc
new file mode 100644
index 0000000..ce5b0c5
--- /dev/null
+++ b/src/pcrepp/test_pcre2pp.cc
@@ -0,0 +1,260 @@
+/**
+ * Copyright (c) 2022, Timothy Stack
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * * Neither the name of Timothy Stack nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include "doctest/doctest.h"
+#include "pcre2pp.hh"
+
+// Compiling an unterminated character class must fail and report the offset
+// of the error.
+TEST_CASE("bad pattern")
+{
+    auto compile_res
+        = lnav::pcre2pp::code::from(string_fragment::from_const("[abc"));
+
+    // REQUIRE stops the test case on failure, so unwrapErr() is never
+    // called on a successful result.
+    REQUIRE(compile_res.isErr());
+    auto ce = compile_res.unwrapErr();
+    CHECK(ce.ce_offset == 4);
+}
+
+// The name table must list each named group with its capture number.
+TEST_CASE("named captures")
+{
+    auto compile_res = lnav::pcre2pp::code::from(
+        string_fragment::from_const("(?<abc>a)(b)(?<def>c)"));
+
+    // REQUIRE stops the test case on failure, so unwrap() is never called
+    // on an error result.
+    REQUIRE(compile_res.isOk());
+
+    const std::vector<std::pair<size_t, string_fragment>> expected_caps = {
+        {1, string_fragment::from_const("abc")},
+        {3, string_fragment::from_const("def")},
+    };
+
+    size_t caps_index = 0;
+    auto co = compile_res.unwrap();
+    for (const auto cap : co.get_named_captures()) {
+        // Guard the lookup below against an unexpectedly long name table.
+        REQUIRE(caps_index < expected_caps.size());
+        const auto& expected_cap = expected_caps[caps_index];
+
+        CHECK(expected_cap.first == cap.get_index());
+        CHECK(expected_cap.second == cap.get_name());
+        caps_index += 1;
+    }
+    // Every expected capture must have been visited.
+    CHECK(caps_index == expected_caps.size());
+}
+
+// Iterates over every key=value; pair in the input and prints the whole
+// match and both captures.
+// NOTE(review): this case only prints; it contains no assertions.
+TEST_CASE("match")
+{
+ static const char INPUT[] = "key1=1234;key2=5678;";
+
+ auto co
+ = lnav::pcre2pp::code::from_const(R"((?<key>\w+)=(?<value>[^;]+);)");
+
+ co.capture_from(string_fragment::from_const(INPUT))
+ .for_each([](lnav::pcre2pp::match_data& md) {
+ printf("got '%s' %s = %s\n",
+ md[0]->to_string().c_str(),
+ md[1]->to_string().c_str(),
+ md[2]->to_string().c_str());
+ });
+}
+
+// match_partial() is expected to report 3 for this input, which is the
+// length of the leading "key".
+TEST_CASE("partial")
+{
+ static const char INPUT[] = "key1=1234";
+
+ auto co = lnav::pcre2pp::code::from_const(R"([a-z]+=.*)");
+ auto matched = co.match_partial(string_fragment::from_const(INPUT));
+ CHECK(matched == 3);
+}
+
+// A named group reports its name; an unnamed group reports nullptr.
+TEST_CASE("capture_name")
+{
+ auto co = lnav::pcre2pp::code::from_const("(?<abc>def)(ghi)");
+
+ CHECK(co.get_capture_count() == 2);
+ CHECK(string_fragment::from_c_str(co.get_name_for_capture(1)) == "abc");
+ CHECK(co.get_name_for_capture(2) == nullptr);
+}
+
+// A plain "(DEFINE)" group, without the leading "(?", counts as one
+// capture.
+TEST_CASE("get_capture_count")
+{
+ auto co = lnav::pcre2pp::code::from_const("(DEFINE)");
+
+ CHECK(co.get_capture_count() == 1);
+}
+
+// get_captures() returns the pattern text of each capturing group.
+TEST_CASE("get_captures")
+{
+ auto co = lnav::pcre2pp::code::from_const(R"((?<abc>\w+)-(def)-)");
+
+ CHECK(co.get_capture_count() == 2);
+ const auto& caps = co.get_captures();
+ CHECK(caps.size() == 2);
+ CHECK(caps[0].to_string() == R"((?<abc>\w+))");
+ CHECK(caps[1].to_string() == R"((def))");
+}
+
+// Replaces every match, including the empty ones, with the whole match
+// (\0) wrapped in braces.
+TEST_CASE("replace")
+{
+ static const char INPUT[] = "test 1 2 3";
+
+ auto co = lnav::pcre2pp::code::from_const(R"(\w*)");
+ auto in = string_fragment::from_const(INPUT);
+
+ auto res = co.replace(in, R"({\0})");
+ CHECK(res == "{test}{} {1}{} {2}{} {3}{}");
+}
+
+// An empty input still yields one (empty) match to replace.
+TEST_CASE("replace-empty")
+{
+ static const char INPUT[] = "";
+
+ auto co = lnav::pcre2pp::code::from_const(R"(\w*)");
+ auto in = string_fragment::from_const(INPUT);
+
+ auto res = co.replace(in, R"({\0})");
+ CHECK(res == "{}");
+}
+
+// Prints the range of every match of ".*" over an input that ends with a
+// newline.
+// NOTE(review): this case only prints; it contains no assertions.
+TEST_CASE("for_each-all")
+{
+ static const char INPUT[] = "Hello, World!\n";
+
+ auto co = lnav::pcre2pp::code::from_const(R"(.*)");
+ auto in = string_fragment::from_const(INPUT);
+
+ co.capture_from(in).for_each([](lnav::pcre2pp::match_data& md) {
+ printf("range %d:%d\n", md[0]->sf_begin, md[0]->sf_end);
+ });
+}
+
+// Two unnamed groups are reported as two captures.
+TEST_CASE("capture_count")
+{
+ auto co = lnav::pcre2pp::code::from_const(R"(^(\w+)=([^;]+);)");
+
+ CHECK(co.get_capture_count() == 2);
+}
+
+// Parentheses that do not form a capturing group (non-capturing groups,
+// character classes, \Q...\E literals, option settings) must not be
+// reported by get_captures().
+TEST_CASE("no-caps")
+{
+    const static std::string empty_cap_regexes[] = {
+        "foo (?:bar)",
+        "foo [(]",
+        "foo \\Q(bar)\\E",
+        "(?i)",
+    };
+
+    // Iterate by reference; the patterns do not need to be copied.
+    for (const auto& re : empty_cap_regexes) {
+        auto co = lnav::pcre2pp::code::from(re).unwrap();
+
+        CHECK(co.get_captures().empty());
+    }
+}
+
+// A pattern that uses a (?(DEFINE)...) block must compile and match a
+// dotted-quad address from the start of the input.
+TEST_CASE("ipmatcher")
+{
+    auto co = lnav::pcre2pp::code::from_const(
+        R"((?(DEFINE)(?<byte>2[0-4]\d|25[0-5]|1\d\d|[1-9]?\d))\b(?&byte)(\.(?&byte)){3}\b)");
+    auto inp = string_fragment::from_const("192.168.1.1");
+
+    auto find_res = co.find_in(inp).ignore_error();
+    // REQUIRE stops the test case on failure, so the optional is never
+    // dereferenced while empty.
+    REQUIRE(find_res.has_value());
+    CHECK(find_res->f_all.sf_begin == 0);
+}
+
+// A non-capturing group nested inside a capturing group is not reported;
+// the outer group spans the nested one.
+TEST_CASE("get_captures-nested")
+{
+ auto re = lnav::pcre2pp::code::from_const("foo (bar (?:baz)?)");
+
+ CHECK(re.get_captures().size() == 1);
+ CHECK(re.get_captures()[0].sf_begin == 4);
+ CHECK(re.get_captures()[0].sf_end == 18);
+ CHECK(re.get_captures()[0].length() == 14);
+}
+
+// Three adjacent groups are reported in order with their pattern offsets.
+TEST_CASE("get_captures-basic")
+{
+    auto re = lnav::pcre2pp::code::from_const("(a)(b)(c)");
+    const auto caps = re.get_captures();
+
+    // doctest macros are used instead of assert() so that failures are
+    // reported by the test runner and are not compiled out by NDEBUG.
+    REQUIRE(caps.size() == 3);
+    CHECK(caps[0].sf_begin == 0);
+    CHECK(caps[0].sf_end == 3);
+    CHECK(caps[1].sf_begin == 3);
+    CHECK(caps[1].sf_end == 6);
+    CHECK(caps[2].sf_begin == 6);
+    CHECK(caps[2].sf_end == 9);
+}
+
+// Escaped parentheses are literal characters and are not reported as a
+// group.
+TEST_CASE("get_captures-escape")
+{
+    auto re = lnav::pcre2pp::code::from_const("\\(a\\)(b)");
+    const auto caps = re.get_captures();
+
+    // doctest macros are used instead of assert() so that failures are
+    // reported by the test runner and are not compiled out by NDEBUG.
+    REQUIRE(caps.size() == 1);
+    CHECK(caps[0].sf_begin == 5);
+    CHECK(caps[0].sf_end == 8);
+}
+
+// The (?<name>...) form is reported as a capture.
+TEST_CASE("get_captures-named")
+{
+    auto re = lnav::pcre2pp::code::from_const("(?<named>b)");
+    const auto caps = re.get_captures();
+
+    // doctest macros are used instead of assert() so that failures are
+    // reported by the test runner and are not compiled out by NDEBUG.
+    REQUIRE(caps.size() == 1);
+    CHECK(caps[0].sf_begin == 0);
+    CHECK(caps[0].sf_end == 11);
+}
+
+// The (?P<name>...) form is reported as a capture.
+TEST_CASE("get_captures-namedP")
+{
+    auto re = lnav::pcre2pp::code::from_const("(?P<named>b)");
+    const auto caps = re.get_captures();
+
+    // doctest macros are used instead of assert() so that failures are
+    // reported by the test runner and are not compiled out by NDEBUG.
+    REQUIRE(caps.size() == 1);
+    CHECK(caps[0].sf_begin == 0);
+    CHECK(caps[0].sf_end == 12);
+}
+
+// The (?'name'...) form is reported as a capture.
+TEST_CASE("get_captures-namedq")
+{
+ auto re = lnav::pcre2pp::code::from_const("(?'named'b)");
+
+ CHECK(re.get_captures().size() == 1);
+ CHECK(re.get_captures()[0].sf_begin == 0);
+ CHECK(re.get_captures()[0].sf_end == 11);
+}
+
+// With both anchoring options, only an input that is exactly "abc" matches;
+// extra text before or after it prevents a match.
+TEST_CASE("anchored")
+{
+ auto re = lnav::pcre2pp::code::from_const(
+ "abc", PCRE2_ANCHORED | PCRE2_ENDANCHORED);
+
+ const auto sub1 = string_fragment::from_const("abc");
+ const auto sub2 = string_fragment::from_const("abcd");
+ const auto sub3 = string_fragment::from_const("0abc");
+
+ CHECK(re.find_in(sub1).ignore_error().has_value());
+ CHECK_FALSE(re.find_in(sub2).ignore_error().has_value());
+ CHECK_FALSE(re.find_in(sub3).ignore_error().has_value());
+}