1 files changed, 203 insertions, 0 deletions
diff --git a/wsutil/regex.c b/wsutil/regex.c
new file mode 100644
index 00000000..bb27189b
--- /dev/null
+++ b/wsutil/regex.c
@@ -0,0 +1,203 @@
+/*
+ * Wireshark - Network traffic analyzer
+ * By Gerald Combs <gerald@wireshark.org>
+ * Copyright 1998 Gerald Combs
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "config.h"
+
+#include "regex.h"
+
+#include <wsutil/str_util.h>
+#include <pcre2.h>
+
+
+struct _ws_regex {
+    pcre2_code *code;   /* compiled PCRE2 code; released with pcre2_code_free() */
+    char *pattern;      /* ws_escape_string_len() result set by ws_regex_compile_ex(); g_free()d */
+};
+
+#define ERROR_MAXLEN_IN_CODE_UNITS   128
+
+/*
+ * Return a newly allocated, NUL-terminated message for a PCRE2 error code;
+ * the caller releases it with g_free(). Code units are one byte here, and
+ * pcre2api(3) says of the messages: "None of the messages are very long; a
+ * buffer size of 120 code units is ample."
+ */
+static char *
+get_error_msg(int errorcode)
+{
+    /* Zero-filled, so the result is a valid string whatever PCRE2 writes. */
+    char *buffer = g_malloc0(ERROR_MAXLEN_IN_CODE_UNITS);
+    int rc = pcre2_get_error_message(errorcode, (PCRE2_UCHAR *)buffer,
+                                     ERROR_MAXLEN_IN_CODE_UNITS);
+
+    /* An unknown code yields PCRE2_ERROR_BADDATA and no message text. */
+    if (rc == PCRE2_ERROR_BADDATA)
+        g_snprintf(buffer, ERROR_MAXLEN_IN_CODE_UNITS, "Unknown PCRE2 error %d", errorcode);
+    return buffer;
+}
+
+
+/*
+ * Compile a pattern with PCRE2.
+ *
+ * patt   - pattern text; size is its length in bytes, or negative if patt
+ *          is NUL-terminated.
+ * errmsg - if not NULL, receives a newly allocated message when compilation
+ *          fails (the caller g_free()s it); it is not touched on success.
+ * flags  - bitmask of WS_REGEX_NEVER_UTF and WS_REGEX_CASELESS.
+ *
+ * Returns the compiled code, or NULL on failure.
+ */
+static pcre2_code *
+compile_pcre2(const char *patt, ssize_t size, char **errmsg, unsigned flags)
+{
+    pcre2_code *code;
+    int errorcode;
+    PCRE2_SIZE erroroffset;     /* PCRE2 needs somewhere to store it; otherwise unused */
+    PCRE2_SIZE length = (size < 0) ? PCRE2_ZERO_TERMINATED : (PCRE2_SIZE)size;
+    uint32_t options = 0;
+
+    if (flags & WS_REGEX_NEVER_UTF)
+        options |= PCRE2_NEVER_UTF;
+    if (flags & WS_REGEX_CASELESS)
+        options |= PCRE2_CASELESS;
+
+    /* By default UTF-8 is off. */
+    code = pcre2_compile_8((PCRE2_SPTR)patt, length, options,
+                           &errorcode, &erroroffset, NULL);
+
+    /* Callers that do not want the message may pass errmsg == NULL. */
+    if (code == NULL && errmsg != NULL)
+        *errmsg = get_error_msg(errorcode);
+    return code;
+}
+
+
+/* Compile patt into a new ws_regex_t; NULL if patt is NULL or does not compile. */
+ws_regex_t *
+ws_regex_compile_ex(const char *patt, ssize_t size, char **errmsg, unsigned flags)
+{
+    pcre2_code *compiled;
+    ws_regex_t *regex;
+
+    ws_return_val_if(!patt, NULL);
+    if ((compiled = compile_pcre2(patt, size, errmsg, flags)) == NULL)
+        return NULL;
+    regex = g_new(ws_regex_t, 1);
+    regex->pattern = ws_escape_string_len(NULL, patt, size, false);
+    regex->code = compiled;
+    return regex;
+}
+
+ws_regex_t *
+ws_regex_compile(const char *patt, char **errmsg)
+{
+    const ssize_t whole_string = -1;    /* negative size: patt is NUL-terminated */
+    return ws_regex_compile_ex(patt, whole_string, errmsg, 0 /* no flags */);
+}
+
+/*
+ * Match subject against code, starting at offset zero. subj_length is the
+ * subject length in bytes, or negative if subject is NUL-terminated.
+ * Returns true on a match; false on no match or on a PCRE2 error (errors
+ * other than PCRE2_ERROR_NOMATCH are reported with ws_debug()).
+ */
+static bool
+match_pcre2(pcre2_code *code, const char *subject, ssize_t subj_length,
+                pcre2_match_data *match_data)
+{
+    PCRE2_SIZE length = (subj_length < 0) ? PCRE2_ZERO_TERMINATED
+                                          : (PCRE2_SIZE)subj_length;
+
+    /* PCRE2 takes unsigned code units; cast the same way compile_pcre2() does. */
+    int rc = pcre2_match(code,
+                    (PCRE2_SPTR)subject,
+                    length,
+                    0,          /* start at offset zero of the subject */
+                    0,          /* default options */
+                    match_data,
+                    NULL);
+
+    /* Zero (offset vector too small for every capture) is still a match. */
+    if (rc >= 0)
+        return true;
+
+    if (rc != PCRE2_ERROR_NOMATCH) {
+        /* Error. Should not happen with UTF-8 disabled. Some huge
+         * subject strings could hit some internal limit. */
+        char *msg = get_error_msg(rc);
+        ws_debug("Unexpected pcre2_match() error: %s.", msg);
+        g_free(msg);
+    }
+    return false;
+}
+
+
+bool
+ws_regex_matches(const ws_regex_t *re, const char *subj)
+{
+    const ssize_t whole_string = -1;    /* negative length: subj is NUL-terminated */
+    return ws_regex_matches_length(re, subj, whole_string);
+}
+
+/* Test whether subj matches re. subj_length is the subject length in
+ * bytes, or negative if subj is NUL-terminated. */
+bool
+ws_regex_matches_length(const ws_regex_t *re,
+                        const char *subj, ssize_t subj_length)
+{
+    ws_return_val_if(!re, false);
+    ws_return_val_if(!subj, false);
+
+    /* The matched substring is not used, but pcre2_match() still
+     * requires room for at least one pair of offsets. */
+    pcre2_match_data *md = pcre2_match_data_create(1, NULL);
+    const bool found = match_pcre2(re->code, subj, subj_length, md);
+
+    pcre2_match_data_free(md);
+    return found;
+}
+
+
+/* Test whether subj matches re; on a match, copy the first offset pair into
+ * pos_vect when it is not NULL. pos_vect is left untouched otherwise. */
+bool
+ws_regex_matches_pos(const ws_regex_t *re,
+                        const char *subj, ssize_t subj_length,
+                        size_t pos_vect[2])
+{
+    ws_return_val_if(!re, false);
+    ws_return_val_if(!subj, false);
+
+    pcre2_match_data *md = pcre2_match_data_create(1, NULL);
+    const bool found = match_pcre2(re->code, subj, subj_length, md);
+
+    if (found && pos_vect != NULL) {
+        const PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(md);
+        pos_vect[0] = offsets[0];
+        pos_vect[1] = offsets[1];
+    }
+    pcre2_match_data_free(md);
+    return found;
+}
+
+void
+ws_regex_free(ws_regex_t *re)
+{
+    if (re != NULL) {   /* NULL is accepted and ignored, like g_free(NULL) */
+        pcre2_code_free(re->code);  /* the compiled PCRE2 code */
+        g_free(re->pattern);        /* the string from ws_escape_string_len() */
+        g_free(re);
+    }
+}
+
+const char *
+ws_regex_pattern(const ws_regex_t *re)
+{
+    return re ? re->pattern : NULL;     /* stored pattern string, owned by re; NULL for a NULL re */
+}