diff options
Diffstat (limited to 'wsutil/regex.c')
-rw-r--r-- | wsutil/regex.c | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/wsutil/regex.c b/wsutil/regex.c new file mode 100644 index 00000000..bb27189b --- /dev/null +++ b/wsutil/regex.c @@ -0,0 +1,203 @@ +/* + * Wireshark - Network traffic analyzer + * By Gerald Combs <gerald@wireshark.org> + * Copyright 1998 Gerald Combs + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "config.h" + +#include "regex.h" + +#include <wsutil/str_util.h> +#include <pcre2.h> + + +struct _ws_regex { + pcre2_code *code; + char *pattern; +}; + +#define ERROR_MAXLEN_IN_CODE_UNITS 128 + +static char * +get_error_msg(int errorcode) +{ + char *buffer; + + /* + * We have to provide a buffer and we don't know how long the + * error message is or even the maximum size. From pcre2api(3): + * "None of the messages are very long; a + * buffer size of 120 code units is ample." + */ + /* Code unit = one byte */ + buffer = g_malloc(ERROR_MAXLEN_IN_CODE_UNITS); + /* Message is returned with a trailing zero. */ + pcre2_get_error_message(errorcode, buffer, ERROR_MAXLEN_IN_CODE_UNITS); + /* One more at the end for good luck. */ + buffer[ERROR_MAXLEN_IN_CODE_UNITS-1] = '\0'; + return buffer; +} + + +static pcre2_code * +compile_pcre2(const char *patt, ssize_t size, char **errmsg, unsigned flags) +{ + pcre2_code *code; + int errorcode; + PCRE2_SIZE length; + PCRE2_SIZE erroroffset; + uint32_t options = 0; + + if (size < 0) + length = PCRE2_ZERO_TERMINATED; + else + length = (PCRE2_SIZE)size; + + if (flags & WS_REGEX_NEVER_UTF) + options |= PCRE2_NEVER_UTF; + if (flags & WS_REGEX_CASELESS) + options |= PCRE2_CASELESS; + + /* By default UTF-8 is off. */ + code = pcre2_compile_8((PCRE2_SPTR)patt, + length, + options, + &errorcode, + &erroroffset, + NULL); + + if (code == NULL) { + *errmsg = get_error_msg(errorcode); + return NULL; + } + + return code; +} + + +ws_regex_t * +ws_regex_compile_ex(const char *patt, ssize_t size, char **errmsg, unsigned flags) +{ + ws_return_val_if(!patt, NULL); + + pcre2_code *code = compile_pcre2(patt, size, errmsg, flags); + if (code == NULL) + return NULL; + + ws_regex_t *re = g_new(ws_regex_t, 1); + re->code = code; + re->pattern = ws_escape_string_len(NULL, patt, size, false); + return re; +} + + +ws_regex_t * +ws_regex_compile(const char *patt, char **errmsg) +{ + return ws_regex_compile_ex(patt, -1, errmsg, 0); +} + + +static bool +match_pcre2(pcre2_code *code, const char *subject, ssize_t subj_length, + pcre2_match_data *match_data) +{ + PCRE2_SIZE length; + int rc; + + if (subj_length < 0) + length = PCRE2_ZERO_TERMINATED; + else + length = (PCRE2_SIZE)subj_length; + + rc = pcre2_match(code, + subject, + length, + 0, /* start at offset zero of the subject */ + 0, /* default options */ + match_data, + NULL); + + if (rc < 0) { + /* No match */ + if (rc != PCRE2_ERROR_NOMATCH) { + /* Error. Should not happen with UTF-8 disabled. Some huge + * subject strings could hit some internal limit. */ + char *msg = get_error_msg(rc); + ws_debug("Unexpected pcre2_match() error: %s.", msg); + g_free(msg); + } + return false; + } + + /* Matched */ + return true; +} + + +bool +ws_regex_matches(const ws_regex_t *re, const char *subj) +{ + return ws_regex_matches_length(re, subj, -1); +} + + +bool +ws_regex_matches_length(const ws_regex_t *re, + const char *subj, ssize_t subj_length) +{ + bool matched; + pcre2_match_data *match_data; + + ws_return_val_if(!re, false); + ws_return_val_if(!subj, false); + + /* We don't use the matched substring but pcre2_match requires + * at least one pair of offsets. */ + match_data = pcre2_match_data_create(1, NULL); + matched = match_pcre2(re->code, subj, subj_length, match_data); + pcre2_match_data_free(match_data); + return matched; +} + + +bool +ws_regex_matches_pos(const ws_regex_t *re, + const char *subj, ssize_t subj_length, + size_t pos_vect[2]) +{ + bool matched; + pcre2_match_data *match_data; + + ws_return_val_if(!re, false); + ws_return_val_if(!subj, false); + + match_data = pcre2_match_data_create(1, NULL); + matched = match_pcre2(re->code, subj, subj_length, match_data); + if (matched && pos_vect) { + PCRE2_SIZE *ovect = pcre2_get_ovector_pointer(match_data); + pos_vect[0] = ovect[0]; + pos_vect[1] = ovect[1]; + } + pcre2_match_data_free(match_data); + return matched; +} + + +void +ws_regex_free(ws_regex_t *re) +{ + pcre2_code_free(re->code); + g_free(re->pattern); + g_free(re); +} + + +const char * +ws_regex_pattern(const ws_regex_t *re) +{ + return re->pattern; +} |