From 5068d34c08f951a7ea6257d305a1627b09a95817 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 19:44:55 +0200 Subject: Adding upstream version 0.11.1. Signed-off-by: Daniel Baumann --- src/pcrepp/pcre2pp.cc | 473 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 473 insertions(+) create mode 100644 src/pcrepp/pcre2pp.cc (limited to 'src/pcrepp/pcre2pp.cc') diff --git a/src/pcrepp/pcre2pp.cc b/src/pcrepp/pcre2pp.cc new file mode 100644 index 0000000..9e5c6bf --- /dev/null +++ b/src/pcrepp/pcre2pp.cc @@ -0,0 +1,473 @@ +/** + * Copyright (c) 2022, Timothy Stack + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of Timothy Stack nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @file pcrepp.cc + */ + +#include "pcre2pp.hh" + +#include "config.h" + +namespace lnav { +namespace pcre2pp { + +std::string +quote(const char* unquoted) +{ + std::string retval; + + for (int lpc = 0; unquoted[lpc]; lpc++) { + if (isalnum(unquoted[lpc]) || unquoted[lpc] == '_' + || unquoted[lpc] & 0x80) + { + retval.push_back(unquoted[lpc]); + } else { + retval.push_back('\\'); + retval.push_back(unquoted[lpc]); + } + } + + return retval; +} + +matcher +capture_builder::into(lnav::pcre2pp::match_data& md) && +{ + if (md.get_capacity() < this->mb_code.get_match_data_capacity()) { + md = this->mb_code.create_match_data(); + } + + return matcher{ + this->mb_code, + this->mb_input, + md, + }; +} + +match_data +code::create_match_data() const +{ + auto_mem md(pcre2_match_data_free); + + md = pcre2_match_data_create_from_pattern(this->p_code, nullptr); + + return match_data{std::move(md)}; +} + +Result +code::from(string_fragment sf, int options) +{ + compile_error ce; + auto_mem co(pcre2_code_free); + + options |= PCRE2_UTF; + co = pcre2_compile( + sf.udata(), sf.length(), options, &ce.ce_code, &ce.ce_offset, nullptr); + + if (co == nullptr) { + ce.ce_pattern = sf.to_string(); + return Err(ce); + } + + auto jit_rc = pcre2_jit_compile(co, PCRE2_JIT_COMPLETE); + if (jit_rc < 0) { + // log_error("failed to JIT compile pattern: %d", jit_rc); + } + + return Ok(code{std::move(co), sf.to_string()}); +} + +code::named_captures +code::get_named_captures() const +{ + named_captures retval; + + pcre2_pattern_info( + this->p_code.in(), PCRE2_INFO_NAMECOUNT, &retval.nc_count); + pcre2_pattern_info( + this->p_code.in(), PCRE2_INFO_NAMEENTRYSIZE, &retval.nc_entry_size); + pcre2_pattern_info( + this->p_code.in(), PCRE2_INFO_NAMETABLE, &retval.nc_name_table); + + return retval; +} + +size_t +code::match_partial(string_fragment in) const +{ + auto md = this->create_match_data(); + auto length = in.length(); + + do { + auto rc = pcre2_match(this->p_code.in(), + in.udata(), + length, + 0, + PCRE2_PARTIAL_HARD, + md.md_data.in(), + nullptr); + + if (rc == PCRE2_ERROR_PARTIAL) { + return md.md_ovector[1]; + } + + if (length > 0) { + length -= 1; + } + } while (length > 0); + + return 0; +} + +const char* +code::get_name_for_capture(size_t index) const +{ + for (const auto cap : this->get_named_captures()) { + if (cap.get_index() == index) { + return cap.get_name().data(); + } + } + + return nullptr; +} + +size_t +code::get_capture_count() const +{ + uint32_t retval; + + pcre2_pattern_info(this->p_code.in(), PCRE2_INFO_CAPTURECOUNT, &retval); + + return retval; +} + +std::vector +code::get_captures() const +{ + bool in_class = false, in_escape = false, in_literal = false; + auto pat_frag = string_fragment::from_str(this->p_pattern); + std::vector cap_in_progress; + std::vector retval; + + for (int lpc = 0; this->p_pattern[lpc]; lpc++) { + if (in_escape) { + in_escape = false; + if (this->p_pattern[lpc] == 'Q') { + in_literal = true; + } + } else if (in_class) { + if (this->p_pattern[lpc] == ']') { + in_class = false; + } + if (this->p_pattern[lpc] == '\\') { + in_escape = true; + } + } else if (in_literal) { + if (this->p_pattern[lpc] == '\\' && this->p_pattern[lpc + 1] == 'E') + { + in_literal = false; + lpc += 1; + } + } else { + switch (this->p_pattern[lpc]) { + case '\\': + in_escape = true; + break; + case '[': + in_class = true; + break; + case '(': + cap_in_progress.emplace_back(pat_frag.sub_range(lpc, lpc)); + break; + case ')': { + if (!cap_in_progress.empty()) { + static const auto DEFINE_SF + = string_fragment::from_const("(?(DEFINE)"); + + auto& cap = cap_in_progress.back(); + char first = '\0', second = '\0', third = '\0'; + bool is_cap = false; + + cap.sf_end = lpc + 1; + if (cap.length() >= 2) { + first = this->p_pattern[cap.sf_begin + 1]; + } + if (cap.length() >= 3) { + second = this->p_pattern[cap.sf_begin + 2]; + } + if (cap.length() >= 4) { + third = this->p_pattern[cap.sf_begin + 3]; + } + if (cap.sf_begin >= 2) { + auto poss_define = string_fragment::from_str_range( + this->p_pattern, cap.sf_begin - 2, cap.sf_end); + if (poss_define == DEFINE_SF) { + cap_in_progress.pop_back(); + continue; + } + } + if (first == '?') { + if (second == '\'') { + is_cap = true; + } + if (second == '<' + && (isalpha(third) || third == '_')) + { + is_cap = true; + } + if (second == 'P' && third == '<') { + is_cap = true; + } + } else if (first != '*') { + is_cap = true; + } + if (is_cap) { + retval.emplace_back(cap); + } + cap_in_progress.pop_back(); + } + break; + } + } + } + } + + assert((size_t) this->get_capture_count() == retval.size()); + + return retval; +} + +std::string +code::replace(string_fragment str, const char* repl) const +{ + std::string retval; + std::string::size_type start = 0; + string_fragment remaining = str; + + auto md = this->create_match_data(); + while (remaining.is_valid()) { + auto find_res = this->capture_from(str) + .at(remaining) + .into(md) + .matches() + .ignore_error(); + if (!find_res) { + break; + } + auto all = find_res->f_all; + remaining = find_res->f_remaining; + bool in_escape = false; + + retval.append(str.data(), start, (all.sf_begin - start)); + start = all.sf_end; + for (int lpc = 0; repl[lpc]; lpc++) { + auto ch = repl[lpc]; + + if (in_escape) { + if (isdigit(ch)) { + auto capture_index = (ch - '0'); + + if (capture_index < md.get_count()) { + auto cap = md[capture_index]; + if (cap) { + retval.append(cap->data(), cap->length()); + } + } else if (capture_index > this->get_capture_count()) { + retval.push_back('\\'); + retval.push_back(ch); + } + } else { + if (ch != '\\') { + retval.push_back('\\'); + } + retval.push_back(ch); + } + in_escape = false; + } else { + switch (ch) { + case '\\': + in_escape = true; + break; + default: + retval.push_back(ch); + break; + } + } + } + } + if (remaining.is_valid()) { + retval.append(str.data(), remaining.sf_begin, std::string::npos); + } + + return retval; +} + +int +code::name_index(const char* name) const +{ + return pcre2_substring_number_from_name(this->p_code.in(), + (PCRE2_SPTR) name); +} + +size_t +code::named_capture::get_index() const +{ + return (this->nc_entry[0] << 8) | (this->nc_entry[1] & 0xff); +} + +string_fragment +code::named_capture::get_name() const +{ + return string_fragment::from_bytes( + &this->nc_entry[2], strlen((const char*) &this->nc_entry[2])); +} + +code::named_capture +code::named_captures::iterator::operator*() const +{ + return code::named_capture{this->i_entry}; +} + +code::named_captures::iterator& +code::named_captures::iterator::operator++() +{ + this->i_entry += this->i_entry_size; + + return *this; +} + +bool +code::named_captures::iterator::operator==(const iterator& other) const +{ + return this->i_entry == other.i_entry + && this->i_entry_size == other.i_entry_size; +} + +bool +code::named_captures::iterator::operator!=(const iterator& other) const +{ + return this->i_entry != other.i_entry + || this->i_entry_size != other.i_entry_size; +} + +code::named_captures::iterator +code::named_captures::begin() const +{ + return iterator{this->nc_entry_size, this->nc_name_table}; +} + +code::named_captures::iterator +code::named_captures::end() const +{ + return iterator{ + this->nc_entry_size, + this->nc_name_table + (this->nc_count * this->nc_entry_size), + }; +} + +matcher::matches_result +matcher::matches(uint32_t options) +{ + this->mb_input.i_offset = this->mb_input.i_next_offset; + + if (this->mb_input.i_offset == -1) { + return not_found{}; + } + + auto rc = pcre2_match(this->mb_code.p_code.in(), + this->mb_input.i_string.udata(), + this->mb_input.i_string.length(), + this->mb_input.i_offset, + options, + this->mb_match_data.md_data.in(), + nullptr); + + if (rc > 0) { + this->mb_match_data.md_input = this->mb_input; + this->mb_match_data.md_code = &this->mb_code; + this->mb_match_data.md_capture_end = rc; + if (this->mb_match_data[0]->empty() + && this->mb_match_data[0]->sf_end >= this->mb_input.i_string.sf_end) + { + this->mb_input.i_next_offset = -1; + } else if (this->mb_match_data[0]->empty()) { + this->mb_input.i_next_offset + = this->mb_match_data.md_ovector[1] + 1; + } else { + this->mb_input.i_next_offset = this->mb_match_data.md_ovector[1]; + } + this->mb_match_data.md_input.i_next_offset + = this->mb_input.i_next_offset; + return found{ + this->mb_match_data[0].value(), + this->mb_match_data.remaining(), + }; + } + + this->mb_match_data.md_input = this->mb_input; + this->mb_match_data.md_ovector[0] = this->mb_input.i_offset; + this->mb_match_data.md_ovector[1] = this->mb_input.i_offset; + this->mb_match_data.md_capture_end = 1; + if (rc == PCRE2_ERROR_NOMATCH) { + return not_found{}; + } + + return error{&this->mb_code, rc}; +} + +void +matcher::matches_result::handle_error(matcher::error err) +{ + unsigned char buffer[1024]; + + pcre2_get_error_message(err.e_error_code, buffer, sizeof(buffer)); + // log_error("pcre2_match failure: %s", buffer); +} + +std::string +compile_error::get_message() const +{ + unsigned char buffer[1024]; + + pcre2_get_error_message(this->ce_code, buffer, sizeof(buffer)); + + return {(const char*) buffer}; +} + +std::string +matcher::error::get_message() +{ + unsigned char buffer[1024]; + + pcre2_get_error_message(this->e_error_code, buffer, sizeof(buffer)); + + return {(const char*) buffer}; +} + +} // namespace pcre2pp +} // namespace lnav -- cgit v1.2.3