Adding upstream version 2.4.47+dfsg.upstream/2.4.47+dfsg upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-06 01:23:53 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-06 01:23:53 +0000
commit: c000cad09d0b54c455c99271bfb996c2dfe13073 (patch)
tree: e47ca809ed512d7fb43ec3d555753b1b658e9819 /libraries/liblunicode/ure/ure.c
parent: Initial commit. (diff)
download: openldap-c000cad09d0b54c455c99271bfb996c2dfe13073.tar.xz
openldap-c000cad09d0b54c455c99271bfb996c2dfe13073.zip
1 files changed, 2131 insertions, 0 deletions
diff --git a/libraries/liblunicode/ure/ure.c b/libraries/liblunicode/ure/ure.c
new file mode 100644
index 0000000..d91d01a
--- /dev/null
+++ b/libraries/liblunicode/ure/ure.c
@@ -0,0 +1,2131 @@
+/* $OpenLDAP$ */
+/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
+ *
+ * Copyright 1998-2018 The OpenLDAP Foundation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * <http://www.OpenLDAP.org/license.html>.
+ */
+/* Copyright 1997, 1998, 1999 Computing Research Labs,
+ * New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+/* $Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $" */
+
+#include "portable.h"
+
+#include <ac/stdlib.h>
+#include <ac/string.h>
+#include <ac/unistd.h>
+
+#include "ure.h"
+
+/*
+ * Flags used internally in the DFA.
+ */
+#define _URE_DFA_CASEFOLD  0x01
+#define _URE_DFA_BLANKLINE 0x02
+
+static unsigned long cclass_flags[] = {
+    0,
+    _URE_NONSPACING,
+    _URE_COMBINING,
+    _URE_NUMDIGIT,
+    _URE_NUMOTHER,
+    _URE_SPACESEP,
+    _URE_LINESEP,
+    _URE_PARASEP,
+    _URE_CNTRL,
+    _URE_PUA,
+    _URE_UPPER,
+    _URE_LOWER,
+    _URE_TITLE,
+    _URE_MODIFIER,
+    _URE_OTHERLETTER,
+    _URE_DASHPUNCT,
+    _URE_OPENPUNCT,
+    _URE_CLOSEPUNCT,
+    _URE_OTHERPUNCT,
+    _URE_MATHSYM,
+    _URE_CURRENCYSYM,
+    _URE_OTHERSYM,
+    _URE_LTR,
+    _URE_RTL,
+    _URE_EURONUM,
+    _URE_EURONUMSEP,
+    _URE_EURONUMTERM,
+    _URE_ARABNUM,
+    _URE_COMMONSEP,
+    _URE_BLOCKSEP,
+    _URE_SEGMENTSEP,
+    _URE_WHITESPACE,
+    _URE_OTHERNEUT,
+};
+
+/*
+ * Symbol types for the DFA.
+ */
+#define _URE_ANY_CHAR   1
+#define _URE_CHAR       2
+#define _URE_CCLASS     3
+#define _URE_NCCLASS    4
+#define _URE_BOL_ANCHOR 5
+#define _URE_EOL_ANCHOR 6
+
+/*
+ * Op codes for converting the NFA to a DFA.
+ */
+#define _URE_SYMBOL     10
+#define _URE_PAREN      11
+#define _URE_QUEST      12
+#define _URE_STAR       13
+#define _URE_PLUS       14
+#define _URE_ONE        15
+#define _URE_AND        16
+#define _URE_OR         17
+
+#define _URE_NOOP       0xffff
+
+#define _URE_REGSTART 0x8000
+#define _URE_REGEND   0x4000
+
+/*
+ * Structure used to handle a compacted range of characters.
+ */
+typedef struct {
+    ucs4_t min_code;
+    ucs4_t max_code;
+} _ure_range_t;
+
+typedef struct {
+    _ure_range_t *ranges;
+    ucs2_t ranges_used;
+    ucs2_t ranges_size;
+} _ure_ccl_t;
+
+typedef union {
+    ucs4_t chr;
+    _ure_ccl_t ccl;
+} _ure_sym_t;
+
+/*
+ * This is a general element structure used for expressions and stack
+ * elements.
+ */
+typedef struct {
+    ucs2_t reg;
+    ucs2_t onstack;
+    ucs2_t type;
+    ucs2_t lhs;
+    ucs2_t rhs;
+} _ure_elt_t;
+
+/*
+ * This is a structure used to track a list or a stack of states.
+ */
+typedef struct {
+    ucs2_t *slist;
+    ucs2_t slist_size;
+    ucs2_t slist_used;
+} _ure_stlist_t;
+
+/*
+ * Structure to track the list of unique states for a symbol
+ * during reduction.
+ */
+typedef struct {
+    ucs2_t id;
+    ucs2_t type;
+    unsigned long mods;
+    unsigned long props;
+    _ure_sym_t sym;
+    _ure_stlist_t states;
+} _ure_symtab_t;
+
+/*
+ * Structure to hold a single state.
+ */
+typedef struct {
+    ucs2_t id;
+    ucs2_t accepting;
+    ucs2_t pad;
+    _ure_stlist_t st;
+    _ure_elt_t *trans;
+    ucs2_t trans_size;
+    ucs2_t trans_used;
+} _ure_state_t;
+
+/*
+ * Structure used for keeping lists of states.
+ */
+typedef struct {
+    _ure_state_t *states;
+    ucs2_t states_size;
+    ucs2_t states_used;
+} _ure_statetable_t;
+
+/*
+ * Structure to track pairs of DFA states when equivalent states are
+ * merged.
+ */
+typedef struct {
+    ucs2_t l;
+    ucs2_t r;
+} _ure_equiv_t;
+
+/*
+ * Structure used for constructing the NFA and reducing to a minimal DFA.
+ */
+typedef struct _ure_buffer_t {
+    int reducing;
+    int error;
+    unsigned long flags;
+
+    _ure_stlist_t stack;
+
+    /*
+     * Table of unique symbols encountered.
+     */
+    _ure_symtab_t *symtab;
+    ucs2_t symtab_size;
+    ucs2_t symtab_used;
+
+    /*
+     * Tracks the unique expressions generated for the NFA and when the NFA is
+     * reduced.
+     */
+    _ure_elt_t *expr;
+    ucs2_t expr_used;
+    ucs2_t expr_size;
+
+    /*
+     * The reduced table of unique groups of NFA states.
+     */
+    _ure_statetable_t states;
+
+    /*
+     * Tracks states when equivalent states are merged.
+     */
+    _ure_equiv_t *equiv;
+    ucs2_t equiv_used;
+    ucs2_t equiv_size;
+} _ure_buffer_t;
+
+typedef struct {
+    ucs2_t symbol;
+    ucs2_t next_state;
+} _ure_trans_t;
+
+typedef struct {
+    ucs2_t accepting;
+    ucs2_t ntrans;
+    _ure_trans_t *trans;
+} _ure_dstate_t;
+
+typedef struct _ure_dfa_t {
+    unsigned long flags;
+
+    _ure_symtab_t *syms;
+    ucs2_t nsyms;
+
+    _ure_dstate_t *states;
+    ucs2_t nstates;
+
+    _ure_trans_t *trans;
+    ucs2_t ntrans;
+} _ure_dfa_t;
+
+/*************************************************************************
+ *
+ * Functions.
+ *
+ *************************************************************************/
+
+static void
+_ure_memmove(char *dest, char *src, unsigned long bytes)
+{
+    long i, j;
+
+    i = (long) bytes;
+    j = i & 7;
+    i = (i + 7) >> 3;
+
+    /*
+     * Do a memmove using Ye Olde Duff's Device for efficiency.
+     */
+    if (src < dest) {
+        src += bytes;
+        dest += bytes;
+
+        switch (j) {
+          case 0: do {
+              *--dest = *--src;
+            case 7: *--dest = *--src;
+            case 6: *--dest = *--src;
+            case 5: *--dest = *--src;
+            case 4: *--dest = *--src;
+            case 3: *--dest = *--src;
+            case 2: *--dest = *--src;
+            case 1: *--dest = *--src;
+          } while (--i > 0);
+        }
+    } else if (src > dest) {
+        switch (j) {
+          case 0: do {
+              *dest++ = *src++;
+            case 7: *dest++ = *src++;
+            case 6: *dest++ = *src++;
+            case 5: *dest++ = *src++;
+            case 4: *dest++ = *src++;
+            case 3: *dest++ = *src++;
+            case 2: *dest++ = *src++;
+            case 1: *dest++ = *src++;
+          } while (--i > 0);
+        }
+    }
+}
+
+static void
+_ure_push(ucs2_t v, _ure_buffer_t *b)
+{
+    _ure_stlist_t *s;
+
+    if (b == 0)
+      return;
+
+    /*
+     * If the `reducing' parameter is non-zero, check to see if the value
+     * passed is already on the stack.
+     */
+    if (b->reducing != 0 && b->expr[v].onstack != 0)
+      return;
+
+    s = &b->stack;
+    if (s->slist_used == s->slist_size) {
+        if (s->slist_size == 0)
+          s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
+        else
+          s->slist = (ucs2_t *) realloc((char *) s->slist,
+                                        sizeof(ucs2_t) * (s->slist_size + 8));
+        s->slist_size += 8;
+    }
+    s->slist[s->slist_used++] = v;
+
+    /*
+     * If the `reducing' parameter is non-zero, flag the element as being on
+     * the stack.
+     */
+    if (b->reducing != 0)
+      b->expr[v].onstack = 1;
+}
+
+static ucs2_t
+_ure_peek(_ure_buffer_t *b)
+{
+    if (b == 0 || b->stack.slist_used == 0)
+      return _URE_NOOP;
+
+    return b->stack.slist[b->stack.slist_used - 1];
+}
+
+static ucs2_t
+_ure_pop(_ure_buffer_t *b)
+{
+    ucs2_t v;
+
+    if (b == 0 || b->stack.slist_used == 0)
+      return _URE_NOOP;
+
+    v = b->stack.slist[--b->stack.slist_used];
+    if (b->reducing)
+      b->expr[v].onstack = 0;
+
+    return v;
+}
+
+/*************************************************************************
+ *
+ * Start symbol parse functions.
+ *
+ *************************************************************************/
+
+/*
+ * Parse a comma-separated list of integers that represent character
+ * properties.  Combine them into a mask that is returned in the `mask'
+ * variable, and return the number of characters consumed.
+ */
+static unsigned long
+_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
+               _ure_buffer_t *b)
+{
+    unsigned long n, m;
+    ucs2_t *sp, *ep;
+
+    sp = pp;
+    ep = sp + limit;
+
+    for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
+        if (*sp == ',') {
+            /*
+             * Encountered a comma, so select the next character property flag
+             * and reset the number.
+             */
+            m |= cclass_flags[n];
+            n = 0;
+        } else if (*sp >= '0' && *sp <= '9')
+          /*
+           * Encountered a digit, so start or continue building the cardinal
+           * that represents the character property flag.
+           */
+          n = (n * 10) + (*sp - '0');
+        else
+          /*
+           * Encountered something that is not part of the property list.
+           * Indicate that we are done.
+           */
+          break;
+
+        /*
+         * If a property number greater than 32 occurs, then there is a
+         * problem.  Most likely a missing comma separator.
+         */
+        if (n > 32)
+          b->error = _URE_INVALID_PROPERTY;
+    }
+
+    if (b->error == _URE_OK && n != 0)
+      m |= cclass_flags[n];
+
+    /*
+     * Set the mask that represents the group of character properties.
+     */
+    *mask = m;
+
+    /*
+     * Return the number of characters consumed.
+     */
+    return sp - pp;
+}
+
+/*
+ * Collect a hex number with 1 to 4 digits and return the number
+ * of characters used.
+ */
+static unsigned long
+_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
+{
+    ucs2_t i;
+    ucs2_t *sp, *ep;
+    ucs4_t nn;
+
+    sp = np;
+    ep = sp + limit;
+
+    for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
+        if (*sp >= '0' && *sp <= '9')
+          nn = (nn << 4) + (*sp - '0');
+        else if (*sp >= 'A' && *sp <= 'F')
+          nn = (nn << 4) + ((*sp - 'A') + 10);
+        else if (*sp >= 'a' && *sp <= 'f')
+          nn = (nn << 4) + ((*sp - 'a') + 10);
+        else
+          /*
+           * Encountered something that is not a hex digit.
+           */
+          break;
+    }
+
+    /*
+     * Assign the character code collected and return the number of
+     * characters used.
+     */
+    *n = nn;
+
+    return sp - np;
+}
+
+/*
+ * Insert a range into a character class, removing duplicates and ordering
+ * them in increasing range-start order.
+ */
+static void
+_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
+{
+    ucs2_t i;
+    ucs4_t tmp;
+    _ure_range_t *rp;
+
+    /*
+     * If the `casefold' flag is set, then make sure both endpoints of the
+     * range are converted to lower case.
+     */
+    if (b->flags & _URE_DFA_CASEFOLD) {
+        r->min_code = _ure_tolower(r->min_code);
+        r->max_code = _ure_tolower(r->max_code);
+    }
+
+    /*
+     * Swap the range endpoints if they are not in increasing order.
+     */
+    if (r->min_code > r->max_code) {
+        tmp = r->min_code;
+        r->min_code = r->max_code;
+        r->max_code = tmp;
+    }
+
+    for (i = 0, rp = ccl->ranges;
+         i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
+
+    /*
+     * Check for a duplicate.
+     */
+    if (i < ccl->ranges_used &&
+        r->min_code == rp->min_code && r->max_code == rp->max_code)
+      return;
+
+    if (ccl->ranges_used == ccl->ranges_size) {
+        if (ccl->ranges_size == 0)
+          ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
+        else
+          ccl->ranges = (_ure_range_t *)
+              realloc((char *) ccl->ranges,
+                      sizeof(_ure_range_t) * (ccl->ranges_size + 8));
+        ccl->ranges_size += 8;
+    }
+
+    rp = ccl->ranges + ccl->ranges_used;
+
+    if (i < ccl->ranges_used)
+      _ure_memmove((char *) (rp + 1), (char *) rp,
+                   sizeof(_ure_range_t) * (ccl->ranges_used - i));
+
+    ccl->ranges_used++;
+    rp->min_code = r->min_code;
+    rp->max_code = r->max_code;
+}
+
+#define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
+_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
+#define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
+#define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
+_URE_OTHERPUNCT)
+#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
+_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
+#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
+#define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
+
+typedef void (*_ure_cclsetup_t)(
+    _ure_symtab_t *sym,
+    unsigned long mask,
+    _ure_buffer_t *b
+);
+
+typedef struct {
+    ucs2_t key;
+    unsigned long len;
+    unsigned long next;
+    _ure_cclsetup_t func;
+    unsigned long mask;
+} _ure_trie_t;
+
+static void
+_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
+{
+    sym->props |= mask;
+}
+
+static void
+_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
+{
+    _ure_range_t range;
+
+    sym->props |= mask;
+
+    /*
+     * Add the additional characters needed for handling isspace().
+     */
+    range.min_code = range.max_code = '\t';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = range.max_code = '\r';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = range.max_code = '\n';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = range.max_code = '\f';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = range.max_code = 0xfeff;
+    _ure_add_range(&sym->sym.ccl, &range, b);
+}
+
+static void
+_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
+{
+    _ure_range_t range;
+
+    /*
+     * Add the additional characters needed for handling isxdigit().
+     */
+    range.min_code = '0';
+    range.max_code = '9';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = 'A';
+    range.max_code = 'F';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+    range.min_code = 'a';
+    range.max_code = 'f';
+    _ure_add_range(&sym->sym.ccl, &range, b);
+}
+
+static _ure_trie_t cclass_trie[] = {
+    {0x003a, 1, 1, 0, 0},
+    {0x0061, 9, 10, 0, 0},
+    {0x0063, 8, 19, 0, 0},
+    {0x0064, 7, 24, 0, 0},
+    {0x0067, 6, 29, 0, 0},
+    {0x006c, 5, 34, 0, 0},
+    {0x0070, 4, 39, 0, 0},
+    {0x0073, 3, 49, 0, 0},
+    {0x0075, 2, 54, 0, 0},
+    {0x0078, 1, 59, 0, 0},
+    {0x006c, 1, 11, 0, 0},
+    {0x006e, 2, 13, 0, 0},
+    {0x0070, 1, 16, 0, 0},
+    {0x0075, 1, 14, 0, 0},
+    {0x006d, 1, 15, 0, 0},
+    {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
+    {0x0068, 1, 17, 0, 0},
+    {0x0061, 1, 18, 0, 0},
+    {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
+    {0x006e, 1, 20, 0, 0},
+    {0x0074, 1, 21, 0, 0},
+    {0x0072, 1, 22, 0, 0},
+    {0x006c, 1, 23, 0, 0},
+    {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
+    {0x0069, 1, 25, 0, 0},
+    {0x0067, 1, 26, 0, 0},
+    {0x0069, 1, 27, 0, 0},
+    {0x0074, 1, 28, 0, 0},
+    {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
+    {0x0072, 1, 30, 0, 0},
+    {0x0061, 1, 31, 0, 0},
+    {0x0070, 1, 32, 0, 0},
+    {0x0068, 1, 33, 0, 0},
+    {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
+    {0x006f, 1, 35, 0, 0},
+    {0x0077, 1, 36, 0, 0},
+    {0x0065, 1, 37, 0, 0},
+    {0x0072, 1, 38, 0, 0},
+    {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
+    {0x0072, 2, 41, 0, 0},
+    {0x0075, 1, 45, 0, 0},
+    {0x0069, 1, 42, 0, 0},
+    {0x006e, 1, 43, 0, 0},
+    {0x0074, 1, 44, 0, 0},
+    {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
+    {0x006e, 1, 46, 0, 0},
+    {0x0063, 1, 47, 0, 0},
+    {0x0074, 1, 48, 0, 0},
+    {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
+    {0x0070, 1, 50, 0, 0},
+    {0x0061, 1, 51, 0, 0},
+    {0x0063, 1, 52, 0, 0},
+    {0x0065, 1, 53, 0, 0},
+    {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
+    {0x0070, 1, 55, 0, 0},
+    {0x0070, 1, 56, 0, 0},
+    {0x0065, 1, 57, 0, 0},
+    {0x0072, 1, 58, 0, 0},
+    {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
+    {0x0064, 1, 60, 0, 0},
+    {0x0069, 1, 61, 0, 0},
+    {0x0067, 1, 62, 0, 0},
+    {0x0069, 1, 63, 0, 0},
+    {0x0074, 1, 64, 0, 0},
+    {0x003a, 1, 65, _ure_xdigit_setup, 0},
+};
+
+/*
+ * Probe for one of the POSIX colon delimited character classes in the static
+ * trie.
+ */
+static unsigned long
+_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
+               _ure_buffer_t *b)
+{
+    int i;
+    unsigned long n;
+    _ure_trie_t *tp;
+    ucs2_t *sp, *ep;
+
+    /*
+     * If the number of characters left is less than 7, then this cannot be
+     * interpreted as one of the colon delimited classes.
+     */
+    if (limit < 7)
+      return 0;
+
+    sp = cp;
+    ep = sp + limit;
+    tp = cclass_trie;
+    for (i = 0; sp < ep && i < 8; i++, sp++) {
+        n = tp->len;
+
+        for (; n > 0 && tp->key != *sp; tp++, n--) ;
+
+        if (n == 0)
+          return 0;
+
+        if (*sp == ':' && (i == 6 || i == 7)) {
+            sp++;
+            break;
+        }
+        if (sp + 1 < ep)
+          tp = cclass_trie + tp->next;
+    }
+    if (tp->func == 0)
+      return 0;
+
+    (*tp->func)(sym, tp->mask, b);
+
+    return sp - cp;
+}
+
+/*
+ * Construct a list of ranges and return the number of characters consumed.
+ */
+static unsigned long
+_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
+            _ure_buffer_t *b)
+{
+    int range_end;
+    unsigned long n;
+    ucs2_t *sp, *ep;
+    ucs4_t c, last;
+    _ure_ccl_t *cclp;
+    _ure_range_t range;
+
+    sp = cp;
+    ep = sp + limit;
+
+    if (*sp == '^') {
+      symp->type = _URE_NCCLASS;
+      sp++;
+    } else
+      symp->type = _URE_CCLASS;
+
+    for (last = 0, range_end = 0;
+         b->error == _URE_OK && sp < ep && *sp != ']'; ) {
+        c = *sp++;
+        if (c == '\\') {
+            if (sp == ep) {
+                /*
+                 * The EOS was encountered when expecting the reverse solidus
+                 * to be followed by the character it is escaping.  Set an
+                 * error code and return the number of characters consumed up
+                 * to this point.
+                 */
+                b->error = _URE_UNEXPECTED_EOS;
+                return sp - cp;
+            }
+
+            c = *sp++;
+            switch (c) {
+              case 'a':
+                c = 0x07;
+                break;
+              case 'b':
+                c = 0x08;
+                break;
+              case 'f':
+                c = 0x0c;
+                break;
+              case 'n':
+                c = 0x0a;
+                break;
+              case 'r':
+                c = 0x0d;
+                break;
+              case 't':
+                c = 0x09;
+                break;
+              case 'v':
+                c = 0x0b;
+                break;
+              case 'p':
+              case 'P':
+                sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
+                /*
+                 * Invert the bit mask of the properties if this is a negated
+                 * character class or if 'P' is used to specify a list of
+                 * character properties that should *not* match in a
+                 * character class.
+                 */
+                if (c == 'P')
+                  symp->props = ~symp->props;
+                continue;
+                break;
+              case 'x':
+              case 'X':
+              case 'u':
+              case 'U':
+                if (sp < ep &&
+                    ((*sp >= '0' && *sp <= '9') ||
+                     (*sp >= 'A' && *sp <= 'F') ||
+                     (*sp >= 'a' && *sp <= 'f')))
+                  sp += _ure_hex(sp, ep - sp, &c);
+            }
+        } else if (c == ':') {
+            /*
+             * Probe for a POSIX colon delimited character class.
+             */
+            sp--;
+            if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
+              sp++;
+            else {
+                sp += n;
+                continue;
+            }
+        }
+
+        cclp = &symp->sym.ccl;
+
+        /*
+         * Check to see if the current character is a low surrogate that needs
+         * to be combined with a preceding high surrogate.
+         */
+        if (last != 0) {
+            if (c >= 0xdc00 && c <= 0xdfff)
+              /*
+               * Construct the UTF16 character code.
+               */
+              c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
+            else {
+                /*
+                 * Add the isolated high surrogate to the range.
+                 */
+                if (range_end == 1)
+                  range.max_code = last & 0xffff;
+                else
+                  range.min_code = range.max_code = last & 0xffff;
+
+                _ure_add_range(cclp, &range, b);
+                range_end = 0;
+            }
+        }
+
+        /*
+         * Clear the last character code.
+         */
+        last = 0;
+
+        /*
+         * This slightly awkward code handles the different cases needed to
+         * construct a range.
+         */
+        if (c >= 0xd800 && c <= 0xdbff) {
+            /*
+             * If the high surrogate is followed by a range indicator, simply
+             * add it as the range start.  Otherwise, save it in case the next
+             * character is a low surrogate.
+             */
+            if (*sp == '-') {
+                sp++;
+                range.min_code = c;
+                range_end = 1;
+            } else
+              last = c;
+        } else if (range_end == 1) {
+            range.max_code = c;
+            _ure_add_range(cclp, &range, b);
+            range_end = 0;
+        } else {
+            range.min_code = range.max_code = c;
+            if (*sp == '-') {
+                sp++;
+                range_end = 1;
+            } else
+              _ure_add_range(cclp, &range, b);
+        }
+    }
+
+    if (sp < ep && *sp == ']')
+      sp++;
+    else
+      /*
+       * The parse was not terminated by the character class close symbol
+       * (']'), so set an error code.
+       */
+      b->error = _URE_CCLASS_OPEN;
+
+    return sp - cp;
+}
+
+/*
+ * Probe for a low surrogate hex code.
+ */
+static unsigned long
+_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
+{
+    ucs4_t i, code;
+    ucs2_t *sp, *ep;
+
+    for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
+        if (*sp >= '0' && *sp <= '9')
+          code = (code << 4) + (*sp - '0');
+        else if (*sp >= 'A' && *sp <= 'F')
+          code = (code << 4) + ((*sp - 'A') + 10);
+        else if (*sp >= 'a' && *sp <= 'f')
+          code = (code << 4) + ((*sp - 'a') + 10);
+        else
+          break;
+    }
+
+    *c = code;
+    return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
+}
+
+static unsigned long
+_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
+                    _ure_buffer_t *b)
+{
+    ucs4_t c;
+    ucs2_t *sp, *ep;
+
+    sp = sym;
+    ep = sym + limit;
+
+    if ((c = *sp++) == '\\') {
+
+        if (sp == ep) {
+            /*
+             * The EOS was encountered when expecting the reverse solidus to
+             * be followed by the character it is escaping.  Set an error code
+             * and return the number of characters consumed up to this point.
+             */
+            b->error = _URE_UNEXPECTED_EOS;
+            return sp - sym;
+        }
+
+        c = *sp++;
+        switch (c) {
+          case 'p':
+          case 'P':
+            symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
+            sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
+            break;
+          case 'a':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x07;
+            break;
+          case 'b':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x08;
+            break;
+          case 'f':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x0c;
+            break;
+          case 'n':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x0a;
+            break;
+          case 'r':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x0d;
+            break;
+          case 't':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x09;
+            break;
+          case 'v':
+            symp->type = _URE_CHAR;
+            symp->sym.chr = 0x0b;
+            break;
+          case 'x':
+          case 'X':
+          case 'u':
+          case 'U':
+            /*
+             * Collect between 1 and 4 digits representing a UCS2 code.  Fall
+             * through to the next case.
+             */
+            if (sp < ep &&
+                ((*sp >= '0' && *sp <= '9') ||
+                 (*sp >= 'A' && *sp <= 'F') ||
+                 (*sp >= 'a' && *sp <= 'f')))
+              sp += _ure_hex(sp, ep - sp, &c);
+            /* FALLTHROUGH */
+          default:
+            /*
+             * Simply add an escaped character here.
+             */
+            symp->type = _URE_CHAR;
+            symp->sym.chr = c;
+        }
+    } else if (c == '^' || c == '$')
+      /*
+       * Handle the BOL and EOL anchors.  This actually consists simply of
+       * setting a flag that indicates that the user supplied anchor match
+       * function should be called.  This needs to be done instead of simply
+       * matching line/paragraph separators because beginning-of-text and
+       * end-of-text tests are needed as well.
+       */
+      symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
+    else if (c == '[')
+      /*
+       * Construct a character class.
+       */
+      sp += _ure_cclass(sp, ep - sp, symp, b);
+    else if (c == '.')
+      symp->type = _URE_ANY_CHAR;
+    else {
+        symp->type = _URE_CHAR;
+        symp->sym.chr = c;
+    }
+
+    /*
+     * If the symbol type happens to be a character and is a high surrogate,
+     * then probe forward to see if it is followed by a low surrogate that
+     * needs to be added.
+     */
+    if (sp < ep && symp->type == _URE_CHAR &&
+        0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
+
+        if (0xdc00 <= *sp && *sp <= 0xdfff) {
+            symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
+                                       (*sp & 0x03ff));
+            sp++;
+        } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
+                                 *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
+            sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
+            if (0xdc00 <= c && c <= 0xdfff) {
+                /*
+                 * Take into account the \[xu] in front of the hex code.
+                 */
+                sp += 2;
+                symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
+                                           (c & 0x03ff));
+            }
+        }
+    }
+
+    /*
+     * Last, make sure any _URE_CHAR type symbols are changed to lower case if
+     * the `casefold' flag is set.
+     */
+    if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
+      symp->sym.chr = _ure_tolower(symp->sym.chr);
+
+    /*
+     * If the symbol constructed is anything other than one of the anchors,
+     * make sure the _URE_DFA_BLANKLINE flag is removed.
+     */
+    if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
+      b->flags &= ~_URE_DFA_BLANKLINE;
+
+    /*
+     * Return the number of characters consumed.
+     */
+    return sp - sym;
+}
+
+static int
+_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
+{
+    if (a->type != b->type || a->mods != b->mods || a->props != b->props)
+      return 1;
+
+    if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
+        if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
+          return 1;
+        if (a->sym.ccl.ranges_used > 0 &&
+            memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
+                   sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
+          return 1;
+    } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
+      return 1;
+    return 0;
+}
+
+/*
+ * Construct a symbol, but only keep unique symbols.
+ */
+static ucs2_t
+_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
+                 _ure_buffer_t *b)
+{
+    ucs2_t i;
+    _ure_symtab_t *sp, symbol;
+
+    /*
+     * Build the next symbol so we can test to see if it is already in the
+     * symbol table.
+     */
+    (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
+    *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
+
+    /*
+     * Check to see if the symbol exists.
+     */
+    for (i = 0, sp = b->symtab;
+         i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
+
+    if (i < b->symtab_used) {
+        /*
+         * Free up any ranges used for the symbol.
+         */
+        if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
+            symbol.sym.ccl.ranges_size > 0)
+          free((char *) symbol.sym.ccl.ranges);
+
+        return b->symtab[i].id;
+    }
+
+    /*
+     * Need to add the new symbol.
+     */
+    if (b->symtab_used == b->symtab_size) {
+        if (b->symtab_size == 0)
+          b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
+        else
+          b->symtab = (_ure_symtab_t *)
+              realloc((char *) b->symtab,
+                      sizeof(_ure_symtab_t) * (b->symtab_size + 8));
+        sp = b->symtab + b->symtab_size;
+        (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
+        b->symtab_size += 8;
+    }
+
+    symbol.id = b->symtab_used++;
+    (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
+                  sizeof(_ure_symtab_t));
+
+    return symbol.id;
+}
+
+/*************************************************************************
+ *
+ * End symbol parse functions.
+ *
+ *************************************************************************/
+
+static ucs2_t
+_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
+{
+    ucs2_t i;
+
+    if (b == 0)
+      return _URE_NOOP;
+
+    /*
+     * Determine if the expression already exists or not.
+     */
+    for (i = 0; i < b->expr_used; i++) {
+        if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
+            b->expr[i].rhs == rhs)
+          break;
+    }
+    if (i < b->expr_used)
+      return i;
+
+    /*
+     * Need to add a new expression.
+     */
+    if (b->expr_used == b->expr_size) {
+        if (b->expr_size == 0)
+          b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
+        else
+          b->expr = (_ure_elt_t *)
+              realloc((char *) b->expr,
+                      sizeof(_ure_elt_t) * (b->expr_size + 8));
+        b->expr_size += 8;
+    }
+
+    b->expr[b->expr_used].onstack = 0;
+    b->expr[b->expr_used].type = type;
+    b->expr[b->expr_used].lhs = lhs;
+    b->expr[b->expr_used].rhs = rhs;
+
+    return b->expr_used++;
+}
+
+static unsigned char spmap[] = {
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
+                            (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
+
+/*
+ * Convert the regular expression into an NFA in a form that will be easy to
+ * reduce to a DFA.  The starting state for the reduction will be returned.
+ */
+static ucs2_t
+_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
+{
+    ucs2_t c, state, top, sym, *sp, *ep;
+    unsigned long used;
+
+    state = _URE_NOOP;
+
+    sp = re;
+    ep = sp + relen;
+    while (b->error == _URE_OK && sp < ep) {
+        c = *sp++;
+        switch (c) {
+          case '(':
+            _ure_push(_URE_PAREN, b);
+            break;
+          case ')':
+            /*
+             * Check for the case of too many close parentheses.
+             */
+            if (_ure_peek(b) == _URE_NOOP) {
+                b->error = _URE_UNBALANCED_GROUP;
+                break;
+            }
+
+            while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
+              /*
+               * Make an expression with the AND or OR operator and its right
+               * hand side.
+               */
+              state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
+
+            /*
+             * Remove the _URE_PAREN off the stack.
+             */
+            (void) _ure_pop(b);
+            break;
+          case '*':
+            state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
+            break;
+          case '+':
+            state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
+            break;
+          case '?':
+            state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
+            break;
+          case '|':
+            while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
+              /*
+               * Make an expression with the AND or OR operator and its right
+               * hand side.
+               */
+              state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
+
+            _ure_push(state, b);
+            _ure_push(_URE_OR, b);
+            break;
+          default:
+            sp--;
+            sym = _ure_make_symbol(sp, ep - sp, &used, b);
+            sp += used;
+            state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
+            break;
+        }
+
+        if (c != '(' && c != '|' && sp < ep &&
+            (!_ure_isspecial(*sp) || *sp == '(')) {
+            _ure_push(state, b);
+            _ure_push(_URE_AND, b);
+        }
+    }
+    while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
+      /*
+       * Make an expression with the AND or OR operator and its right
+       * hand side.
+       */
+      state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
+
+    if (b->stack.slist_used > 0)
+      b->error = _URE_UNBALANCED_GROUP;
+
+    return (b->error == _URE_OK) ? state : _URE_NOOP;
+}
+
+static void
+_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
+{
+    ucs2_t i, *stp;
+    _ure_symtab_t *sp;
+
+    /*
+     * Locate the symbol in the symbol table so the state can be added.
+     * If the symbol doesn't exist, then a real problem exists.
+     */
+    for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
+         i++, sp++) ;
+
+    /*
+     * Now find out if the state exists in the symbol's state list.
+     */
+    for (i = 0, stp = sp->states.slist;
+         i < sp->states.slist_used && state > *stp; i++, stp++) ;
+
+    if (i == sp->states.slist_used || state < *stp) {
+        /*
+         * Need to add the state in order.
+         */
+        if (sp->states.slist_used == sp->states.slist_size) {
+            if (sp->states.slist_size == 0)
+              sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
+            else
+              sp->states.slist = (ucs2_t *)
+                  realloc((char *) sp->states.slist,
+                          sizeof(ucs2_t) * (sp->states.slist_size + 8));
+            sp->states.slist_size += 8;
+        }
+        if (i < sp->states.slist_used)
+          (void) _ure_memmove((char *) (sp->states.slist + i + 1),
+                              (char *) (sp->states.slist + i),
+                              sizeof(ucs2_t) * (sp->states.slist_used - i));
+        sp->states.slist[i] = state;
+        sp->states.slist_used++;
+    }
+}
+
+static ucs2_t
+_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
+{
+    ucs2_t i;
+    _ure_state_t *sp;
+
+    for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
+        if (sp->st.slist_used == nstates &&
+            memcmp((char *) states, (char *) sp->st.slist,
+                   sizeof(ucs2_t) * nstates) == 0)
+          break;
+    }
+
+    if (i == b->states.states_used) {
+        /*
+         * Need to add a new DFA state (set of NFA states).
+         */
+        if (b->states.states_used == b->states.states_size) {
+            if (b->states.states_size == 0)
+              b->states.states = (_ure_state_t *)
+                  malloc(sizeof(_ure_state_t) << 3);
+            else
+              b->states.states = (_ure_state_t *)
+                  realloc((char *) b->states.states,
+                          sizeof(_ure_state_t) * (b->states.states_size + 8));
+            sp = b->states.states + b->states.states_size;
+            (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
+            b->states.states_size += 8;
+        }
+
+        sp = b->states.states + b->states.states_used++;
+        sp->id = i;
+
+        if (sp->st.slist_used + nstates > sp->st.slist_size) {
+            if (sp->st.slist_size == 0)
+              sp->st.slist = (ucs2_t *)
+                  malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
+            else
+              sp->st.slist = (ucs2_t *)
+                  realloc((char *) sp->st.slist,
+                          sizeof(ucs2_t) * (sp->st.slist_used + nstates));
+            sp->st.slist_size = sp->st.slist_used + nstates;
+        }
+        sp->st.slist_used = nstates;
+        (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
+                      sizeof(ucs2_t) * nstates);
+    }
+
+    /*
+     * Return the ID of the DFA state representing a group of NFA states.
+     */
+    return i;
+}
+
+static void
+_ure_reduce(ucs2_t start, _ure_buffer_t *b)
+{
+    ucs2_t i, j, state, eval, syms, rhs;
+    ucs2_t s1, s2, ns1, ns2;
+    _ure_state_t *sp;
+    _ure_symtab_t *smp;
+
+    b->reducing = 1;
+
+    /*
+     * Add the starting state for the reduction.
+     */
+    _ure_add_state(1, &start, b);
+
+    /*
+     * Process each set of NFA states that get created.
+     */
+    for (i = 0; i < b->states.states_used; i++) {
+        sp = b->states.states + i;
+
+        /*
+         * Push the current states on the stack.
+         */
+        for (j = 0; j < sp->st.slist_used; j++)
+          _ure_push(sp->st.slist[j], b);
+
+        /*
+         * Reduce the NFA states.
+         */
+        for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
+            state = b->stack.slist[j];
+            eval = 1;
+
+            /*
+             * This inner loop is the iterative equivalent of recursively
+             * reducing subexpressions generated as a result of a reduction.
+             */
+            while (eval) {
+                switch (b->expr[state].type) {
+                  case _URE_SYMBOL:
+                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
+                    _ure_add_symstate(b->expr[state].lhs, ns1, b);
+                    syms++;
+                    eval = 0;
+                    break;
+                  case _URE_ONE:
+                    sp->accepting = 1;
+                    eval = 0;
+                    break;
+                  case _URE_QUEST:
+                    s1 = b->expr[state].lhs;
+                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
+                    state = _ure_make_expr(_URE_OR, ns1, s1, b);
+                    break;
+                  case _URE_PLUS:
+                    s1 = b->expr[state].lhs;
+                    ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
+                    state = _ure_make_expr(_URE_AND, s1, ns1, b);
+                    break;
+                  case _URE_STAR:
+                    s1 = b->expr[state].lhs;
+                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
+                    ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
+                    state = _ure_make_expr(_URE_OR, ns1, ns2, b);
+                    break;
+                  case _URE_OR:
+                    s1 = b->expr[state].lhs;
+                    s2 = b->expr[state].rhs;
+                    _ure_push(s1, b);
+                    _ure_push(s2, b);
+                    eval = 0;
+                    break;
+                  case _URE_AND:
+                    s1 = b->expr[state].lhs;
+                    s2 = b->expr[state].rhs;
+                    switch (b->expr[s1].type) {
+                      case _URE_SYMBOL:
+                        _ure_add_symstate(b->expr[s1].lhs, s2, b);
+                        syms++;
+                        eval = 0;
+                        break;
+                      case _URE_ONE:
+                        state = s2;
+                        break;
+                      case _URE_QUEST:
+                        ns1 = b->expr[s1].lhs;
+                        ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
+                        state = _ure_make_expr(_URE_OR, s2, ns2, b);
+                        break;
+                      case _URE_PLUS:
+                        ns1 = b->expr[s1].lhs;
+                        ns2 = _ure_make_expr(_URE_OR, s2, state, b);
+                        state = _ure_make_expr(_URE_AND, ns1, ns2, b);
+                        break;
+                      case _URE_STAR:
+                        ns1 = b->expr[s1].lhs;
+                        ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
+                        state = _ure_make_expr(_URE_OR, s2, ns2, b);
+                        break;
+                      case _URE_OR:
+                        ns1 = b->expr[s1].lhs;
+                        ns2 = b->expr[s1].rhs;
+                        ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
+                        ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
+                        state = _ure_make_expr(_URE_OR, ns1, ns2, b);
+                        break;
+                      case _URE_AND:
+                        ns1 = b->expr[s1].lhs;
+                        ns2 = b->expr[s1].rhs;
+                        ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
+                        state = _ure_make_expr(_URE_AND, ns1, ns2, b);
+                        break;
+                    }
+                }
+            }
+        }
+
+        /*
+         * Clear the state stack.
+         */
+        while (_ure_pop(b) != _URE_NOOP) ;
+
+        /*
+         * Reset the state pointer because the reduction may have moved it
+         * during a reallocation.
+         */
+        sp = b->states.states + i;
+
+        /*
+         * Generate the DFA states for the symbols collected during the
+         * current reduction.
+         */
+        if (sp->trans_used + syms > sp->trans_size) {
+            if (sp->trans_size == 0)
+              sp->trans = (_ure_elt_t *)
+                  malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
+            else
+              sp->trans = (_ure_elt_t *)
+                  realloc((char *) sp->trans,
+                          sizeof(_ure_elt_t) * (sp->trans_used + syms));
+            sp->trans_size = sp->trans_used + syms;
+        }
+
+        /*
+         * Go through the symbol table and generate the DFA state transitions
+         * for each symbol that has collected NFA states.
+         */
+        for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
+            sp = b->states.states + i;
+
+            if (smp->states.slist_used > 0) {
+                sp->trans[syms].lhs = smp->id;
+                rhs = _ure_add_state(smp->states.slist_used,
+                                     smp->states.slist, b);
+                /*
+                 * Reset the state pointer in case the reallocation moves it
+                 * in memory.
+                 */
+                sp = b->states.states + i;
+                sp->trans[syms].rhs = rhs;
+
+                smp->states.slist_used = 0;
+                syms++;
+            }
+        }
+
+        /*
+         * Set the number of transitions actually used.
+         */
+        sp->trans_used = syms;
+    }
+    b->reducing = 0;
+}
+
+static void
+_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
+{
+    ucs2_t tmp;
+
+    l = b->states.states[l].id;
+    r = b->states.states[r].id;
+
+    if (l == r)
+      return;
+
+    if (l > r) {
+        tmp = l;
+        l = r;
+        r = tmp;
+    }
+
+    /*
+     * Check to see if the equivalence pair already exists.
+     */
+    for (tmp = 0; tmp < b->equiv_used &&
+             (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
+         tmp++) ;
+
+    if (tmp < b->equiv_used)
+      return;
+
+    if (b->equiv_used == b->equiv_size) {
+        if (b->equiv_size == 0)
+          b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
+        else
+          b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
+                                              sizeof(_ure_equiv_t) *
+                                              (b->equiv_size + 8));
+        b->equiv_size += 8;
+    }
+    b->equiv[b->equiv_used].l = l;
+    b->equiv[b->equiv_used].r = r;
+    b->equiv_used++;
+}
+
+/*
+ * Merge the DFA states that are equivalent.
+ */
+static void
+_ure_merge_equiv(_ure_buffer_t *b)
+{
+    ucs2_t i, j, k, eq, done;
+    _ure_state_t *sp1, *sp2, *ls, *rs;
+
+    for (i = 0; i < b->states.states_used; i++) {
+        sp1 = b->states.states + i;
+        if (sp1->id != i)
+          continue;
+        for (j = 0; j < i; j++) {
+            sp2 = b->states.states + j;
+            if (sp2->id != j)
+              continue;
+            b->equiv_used = 0;
+            _ure_add_equiv(i, j, b);
+            for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
+                ls = b->states.states + b->equiv[eq].l;
+                rs = b->states.states + b->equiv[eq].r;
+                if (ls->accepting != rs->accepting ||
+                    ls->trans_used != rs->trans_used) {
+                    done = 1;
+                    break;
+                }
+                for (k = 0; k < ls->trans_used &&
+                         ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
+                if (k < ls->trans_used) {
+                    done = 1;
+                    break;
+                }
+
+                for (k = 0; k < ls->trans_used; k++)
+                  _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
+            }
+            if (done == 0)
+              break;
+        }
+        for (eq = 0; j < i && eq < b->equiv_used; eq++)
+          b->states.states[b->equiv[eq].r].id =
+              b->states.states[b->equiv[eq].l].id;
+    }
+
+    /*
+     * Renumber the states appropriately.
+     */
+    for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
+         sp1++, i++)
+      sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
+}
+
+/*************************************************************************
+ *
+ * API.
+ *
+ *************************************************************************/
+
+ure_buffer_t
+ure_buffer_create(void)
+{
+    ure_buffer_t b;
+
+    b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
+
+    return b;
+}
+
+void
+ure_buffer_free(ure_buffer_t buf)
+{
+    unsigned long i;
+
+    if (buf == 0)
+      return;
+
+    if (buf->stack.slist_size > 0)
+      free((char *) buf->stack.slist);
+
+    if (buf->expr_size > 0)
+      free((char *) buf->expr);
+
+    for (i = 0; i < buf->symtab_size; i++) {
+        if (buf->symtab[i].states.slist_size > 0)
+          free((char *) buf->symtab[i].states.slist);
+    }
+
+    if (buf->symtab_size > 0)
+      free((char *) buf->symtab);
+
+    for (i = 0; i < buf->states.states_size; i++) {
+        if (buf->states.states[i].trans_size > 0)
+          free((char *) buf->states.states[i].trans);
+        if (buf->states.states[i].st.slist_size > 0)
+          free((char *) buf->states.states[i].st.slist);
+    }
+
+    if (buf->states.states_size > 0)
+      free((char *) buf->states.states);
+
+    if (buf->equiv_size > 0)
+      free((char *) buf->equiv);
+
+    free((char *) buf);
+}
+
+ure_dfa_t
+ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
+{
+    ucs2_t i, j, state;
+    _ure_state_t *sp;
+    _ure_dstate_t *dsp;
+    _ure_trans_t *tp;
+    ure_dfa_t dfa;
+
+    if (re == 0 || *re == 0 || relen == 0 || buf == 0)
+      return 0;
+
+    /*
+     * Reset the various fields of the compilation buffer.  Default the flags
+     * to indicate the presense of the "^$" pattern.  If any other pattern
+     * occurs, then this flag will be removed.  This is done to catch this
+     * special pattern and handle it specially when matching.
+     */
+    buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
+    buf->reducing = 0;
+    buf->stack.slist_used = 0;
+    buf->expr_used = 0;
+
+    for (i = 0; i < buf->symtab_used; i++)
+      buf->symtab[i].states.slist_used = 0;
+    buf->symtab_used = 0;
+
+    for (i = 0; i < buf->states.states_used; i++) {
+        buf->states.states[i].st.slist_used = 0;
+        buf->states.states[i].trans_used = 0;
+    }
+    buf->states.states_used = 0;
+
+    /*
+     * Construct the NFA.  If this stage returns a 0, then an error occured or
+     * an empty expression was passed.
+     */
+    if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
+      return 0;
+
+    /*
+     * Do the expression reduction to get the initial DFA.
+     */
+    _ure_reduce(state, buf);
+
+    /*
+     * Merge all the equivalent DFA states.
+     */
+    _ure_merge_equiv(buf);
+
+    /*
+     * Construct the minimal DFA.
+     */
+    dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
+    (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
+
+    dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
+
+    /*
+     * Free up the NFA state groups and transfer the symbols from the buffer
+     * to the DFA.
+     */
+    for (i = 0; i < buf->symtab_size; i++) {
+        if (buf->symtab[i].states.slist_size > 0)
+          free((char *) buf->symtab[i].states.slist);
+    }
+    dfa->syms = buf->symtab;
+    dfa->nsyms = buf->symtab_used;
+
+    buf->symtab_used = buf->symtab_size = 0;
+
+    /*
+     * Collect the total number of states and transitions needed for the DFA.
+     */
+    for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
+         i++, sp++) {
+        if (sp->id == state) {
+            dfa->nstates++;
+            dfa->ntrans += sp->trans_used;
+            state++;
+        }
+    }
+
+    /*
+     * Allocate enough space for the states and transitions.
+     */
+    dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
+                                           dfa->nstates);
+    dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
+
+    /*
+     * Actually transfer the DFA states from the buffer.
+     */
+    dsp = dfa->states;
+    tp = dfa->trans;
+    for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
+         i++, sp++) {
+        if (sp->id == state) {
+            dsp->trans = tp;
+            dsp->ntrans = sp->trans_used;
+            dsp->accepting = sp->accepting;
+
+            /*
+             * Add the transitions for the state.
+             */
+            for (j = 0; j < dsp->ntrans; j++, tp++) {
+                tp->symbol = sp->trans[j].lhs;
+                tp->next_state = buf->states.states[sp->trans[j].rhs].id;
+            }
+
+            dsp++;
+            state++;
+        }
+    }
+
+    return dfa;
+}
+
+void
+ure_dfa_free(ure_dfa_t dfa)
+{
+    ucs2_t i;
+
+    if (dfa == 0)
+      return;
+
+    for (i = 0; i < dfa->nsyms; i++) {
+        if ((dfa->syms[i].type == _URE_CCLASS ||
+             dfa->syms[i].type == _URE_NCCLASS) &&
+            dfa->syms[i].sym.ccl.ranges_size > 0)
+          free((char *) dfa->syms[i].sym.ccl.ranges);
+    }
+    if (dfa->nsyms > 0)
+      free((char *) dfa->syms);
+
+    if (dfa->nstates > 0)
+      free((char *) dfa->states);
+    if (dfa->ntrans > 0)
+      free((char *) dfa->trans);
+    free((char *) dfa);
+}
+
+void
+ure_write_dfa(ure_dfa_t dfa, FILE *out)
+{
+    ucs2_t i, j, k, h, l;
+    _ure_dstate_t *sp;
+    _ure_symtab_t *sym;
+    _ure_range_t *rp;
+
+    if (dfa == 0 || out == 0)
+      return;
+
+    /*
+     * Write all the different character classes.
+     */
+    for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
+        if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
+            fprintf(out, "C%hd = ", sym->id);
+            if (sym->sym.ccl.ranges_used > 0) {
+                putc('[', out);
+                if (sym->type == _URE_NCCLASS)
+                  putc('^', out);
+            }
+            if (sym->props != 0) {
+                if (sym->type == _URE_NCCLASS)
+                  fprintf(out, "\\P");
+                else
+                  fprintf(out, "\\p");
+                for (k = h = 0; k < 32; k++) {
+                    if (sym->props & (1 << k)) {
+                        if (h != 0)
+                          putc(',', out);
+                        fprintf(out, "%hd", k + 1);
+                        h = 1;
+                    }
+                }
+            }
+            /*
+             * Dump the ranges.
+             */
+            for (k = 0, rp = sym->sym.ccl.ranges;
+                 k < sym->sym.ccl.ranges_used; k++, rp++) {
+                /*
+                 * Check for UTF16 characters.
+                 */
+                if (0x10000 <= rp->min_code &&
+                    rp->min_code <= 0x10ffff) {
+                    h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
+                    l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
+                    fprintf(out, "\\x%04hX\\x%04hX", h, l);
+                } else
+                  fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
+                if (rp->max_code != rp->min_code) {
+                    putc('-', out);
+                    if (rp->max_code >= 0x10000 &&
+                        rp->max_code <= 0x10ffff) {
+                        h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
+                        l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
+                        fprintf(out, "\\x%04hX\\x%04hX", h, l);
+                    } else
+                      fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
+                }
+            }
+            if (sym->sym.ccl.ranges_used > 0)
+              putc(']', out);
+            putc('\n', out);
+        }
+    }
+
+    for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
+        fprintf(out, "S%hd = ", i);
+        if (sp->accepting) {
+            fprintf(out, "1 ");
+            if (sp->ntrans)
+              fprintf(out, "| ");
+        }
+        for (j = 0; j < sp->ntrans; j++) {
+            if (j > 0)
+              fprintf(out, "| ");
+
+            sym = dfa->syms + sp->trans[j].symbol;
+            switch (sym->type) {
+              case _URE_CHAR:
+                if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
+                    /*
+                     * Take care of UTF16 characters.
+                     */
+                    h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
+                    l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
+                    fprintf(out, "\\x%04hX\\x%04hX ", h, l);
+                } else
+                  fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
+                break;
+              case _URE_ANY_CHAR:
+                fprintf(out, "<any> ");
+                break;
+              case _URE_BOL_ANCHOR:
+                fprintf(out, "<bol-anchor> ");
+                break;
+              case _URE_EOL_ANCHOR:
+                fprintf(out, "<eol-anchor> ");
+                break;
+              case _URE_CCLASS:
+              case _URE_NCCLASS:
+                fprintf(out, "[C%hd] ", sym->id);
+                break;
+            }
+            fprintf(out, "S%hd", sp->trans[j].next_state);
+            if (j + 1 < sp->ntrans)
+              putc(' ', out);
+        }
+        putc('\n', out);
+    }
+}
+
+#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
+                        (cc) == 0x2029)
+
+int
+ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
+         unsigned long *match_start, unsigned long *match_end)
+{
+    int i, j, matched, found, skip;
+    unsigned long ms, me;
+    ucs4_t c;
+    ucs2_t *sp, *ep, *lp;
+    _ure_dstate_t *stp;
+    _ure_symtab_t *sym;
+    _ure_range_t *rp;
+
+    if (dfa == 0 || text == 0)
+      return 0;
+
+    /*
+     * Handle the special case of an empty string matching the "^$" pattern.
+     */
+    if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
+        *match_start = *match_end = 0;
+        return 1;
+    }
+
+    sp = text;
+    ep = sp + textlen;
+
+    ms = me = ~0;
+
+    stp = dfa->states;
+
+    for (found = skip = 0; found == 0 && sp < ep; ) {
+        lp = sp;
+        c = *sp++;
+
+        /*
+         * Check to see if this is a high surrogate that should be
+         * combined with a following low surrogate.
+         */
+        if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
+            0xdc00 <= *sp && *sp <= 0xdfff)
+          c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
+
+        /*
+         * Determine if the character is non-spacing and should be skipped.
+         */
+        if (_ure_matches_properties(_URE_NONSPACING, c) &&
+            (flags & URE_IGNORE_NONSPACING)) {
+            sp++;
+            continue;
+        }
+
+        if (dfa->flags & _URE_DFA_CASEFOLD)
+          c = _ure_tolower(c);
+
+        /*
+         * See if one of the transitions matches.
+         */
+        for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
+            sym = dfa->syms + stp->trans[i].symbol;
+            switch (sym->type) {
+              case _URE_ANY_CHAR:
+                if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
+                    !_ure_issep(c))
+                  matched = 1;
+                break;
+              case _URE_CHAR:
+                if (c == sym->sym.chr)
+                  matched = 1;
+                break;
+              case _URE_BOL_ANCHOR:
+                if (lp == text) {
+                    sp = lp;
+                    matched = 1;
+                } else if (_ure_issep(c)) {
+                    if (c == '\r' && sp < ep && *sp == '\n')
+                      sp++;
+                    lp = sp;
+                    matched = 1;
+                }
+                break;
+              case _URE_EOL_ANCHOR:
+                if (_ure_issep(c)) {
+                    /*
+                     * Put the pointer back before the separator so the match
+                     * end position will be correct.  This case will also
+                     * cause the `sp' pointer to be advanced over the current
+                     * separator once the match end point has been recorded.
+                     */
+                    sp = lp;
+                    matched = 1;
+                }
+                break;
+              case _URE_CCLASS:
+              case _URE_NCCLASS:
+                if (sym->props != 0)
+                  matched = _ure_matches_properties(sym->props, c);
+                for (j = 0, rp = sym->sym.ccl.ranges;
+                     j < sym->sym.ccl.ranges_used; j++, rp++) {
+                    if (rp->min_code <= c && c <= rp->max_code)
+                      matched = 1;
+                }
+                if (sym->type == _URE_NCCLASS)
+                  matched = !matched;
+                break;
+            }
+
+            if (matched) {
+                if (ms == ~0UL)
+                  ms = lp - text;
+                else
+                  me = sp - text;
+                stp = dfa->states + stp->trans[i].next_state;
+
+                /*
+                 * If the match was an EOL anchor, adjust the pointer past the
+                 * separator that caused the match.  The correct match
+                 * position has been recorded already.
+                 */
+                if (sym->type == _URE_EOL_ANCHOR) {
+                    /*
+                     * Skip the character that caused the match.
+                     */
+                    sp++;
+
+                    /*
+                     * Handle the infamous CRLF situation.
+                     */
+                    if (sp < ep && c == '\r' && *sp == '\n')
+                      sp++;
+                }
+            }
+        }
+
+        if (matched == 0) {
+            if (stp->accepting == 0) {
+                /*
+                 * If the last state was not accepting, then reset
+                 * and start over.
+                 */
+                stp = dfa->states;
+                ms = me = ~0;
+            } else
+              /*
+               * The last state was accepting, so terminate the matching
+               * loop to avoid more work.
+               */
+              found = 1;
+        } else if (sp == ep) {
+            if (!stp->accepting) {
+                /*
+                 * This ugly hack is to make sure the end-of-line anchors
+                 * match when the source text hits the end.  This is only done
+                 * if the last subexpression matches.
+                 */
+                for (i = 0; found == 0 && i < stp->ntrans; i++) {
+                    sym = dfa->syms + stp->trans[i].symbol;
+                    if (sym->type ==_URE_EOL_ANCHOR) {
+                        stp = dfa->states + stp->trans[i].next_state;
+                        if (stp->accepting) {
+                            me = sp - text;
+                            found = 1;
+                        } else
+                          break;
+                    }
+                }
+            } else {
+                /*
+                 * Make sure any conditions that match all the way to the end
+                 * of the string match.
+                 */
+                found = 1;
+                me = sp - text;
+            }
+        }
+    }
+
+    if (found == 0)
+      ms = me = ~0;
+
+    *match_start = ms;
+    *match_end = me;
+
+    return (ms != ~0UL) ? 1 : 0;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-06 01:23:53 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-06 01:23:53 +0000
commit	c000cad09d0b54c455c99271bfb996c2dfe13073 (patch)
tree	e47ca809ed512d7fb43ec3d555753b1b658e9819 /libraries/liblunicode/ure/ure.c
parent	Initial commit. (diff)
download	openldap-c000cad09d0b54c455c99271bfb996c2dfe13073.tar.xz openldap-c000cad09d0b54c455c99271bfb996c2dfe13073.zip