diff options
Diffstat (limited to 'src/backend/regex/regc_locale.c')
-rw-r--r-- | src/backend/regex/regc_locale.c | 771 |
1 files changed, 771 insertions, 0 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c new file mode 100644 index 0000000..b5f3a73 --- /dev/null +++ b/src/backend/regex/regc_locale.c @@ -0,0 +1,771 @@ +/* + * regc_locale.c -- + * + * This file contains locale-specific regexp routines. + * This file is #included by regcomp.c. + * + * Copyright (c) 1998 by Scriptics Corporation. + * + * This software is copyrighted by the Regents of the University of + * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState + * Corporation and other parties. The following terms apply to all files + * associated with the software unless explicitly disclaimed in + * individual files. + * + * The authors hereby grant permission to use, copy, modify, distribute, + * and license this software and its documentation for any purpose, provided + * that existing copyright notices are retained in all copies and that this + * notice is included verbatim in any distributions. No written agreement, + * license, or royalty fee is required for any of the authorized uses. + * Modifications to this software may be copyrighted by their authors + * and need not follow the licensing terms described here, provided that + * the new terms are clearly indicated on the first page of each file where + * they apply. + * + * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY + * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY + * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE + * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE + * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR + * MODIFICATIONS. + * + * GOVERNMENT USE: If you are acquiring this software on behalf of the + * U.S. government, the Government shall have only "Restricted Rights" + * in the software and related documentation as defined in the Federal + * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you + * are acquiring the software on behalf of the Department of Defense, the + * software shall be classified as "Commercial Computer Software" and the + * Government shall have only "Restricted Rights" as defined in Clause + * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the + * authors grant the U.S. Government and others acting in its behalf + * permission to use and distribute the software in accordance with the + * terms specified in this license. + * + * src/backend/regex/regc_locale.c + */ + +/* ASCII character-name table */ + +static const struct cname +{ + const char *name; + const char code; +} cnames[] = + +{ + { + "NUL", '\0' + }, + { + "SOH", '\001' + }, + { + "STX", '\002' + }, + { + "ETX", '\003' + }, + { + "EOT", '\004' + }, + { + "ENQ", '\005' + }, + { + "ACK", '\006' + }, + { + "BEL", '\007' + }, + { + "alert", '\007' + }, + { + "BS", '\010' + }, + { + "backspace", '\b' + }, + { + "HT", '\011' + }, + { + "tab", '\t' + }, + { + "LF", '\012' + }, + { + "newline", '\n' + }, + { + "VT", '\013' + }, + { + "vertical-tab", '\v' + }, + { + "FF", '\014' + }, + { + "form-feed", '\f' + }, + { + "CR", '\015' + }, + { + "carriage-return", '\r' + }, + { + "SO", '\016' + }, + { + "SI", '\017' + }, + { + "DLE", '\020' + }, + { + "DC1", '\021' + }, + { + "DC2", '\022' + }, + { + "DC3", '\023' + }, + { + "DC4", '\024' + }, + { + "NAK", '\025' + }, + { + "SYN", '\026' + }, + { + "ETB", '\027' + }, + { + "CAN", '\030' + }, + { + "EM", '\031' + }, + { + "SUB", '\032' + }, + { + "ESC", '\033' + }, + { + "IS4", '\034' + }, + { + "FS", '\034' + }, + { + "IS3", '\035' + }, + { + "GS", '\035' + }, + { + "IS2", '\036' + }, + { + "RS", '\036' + }, + { + "IS1", '\037' + }, + { + "US", '\037' + }, + { + "space", ' ' + }, + { + "exclamation-mark", '!' + }, + { + "quotation-mark", '"' + }, + { + "number-sign", '#' + }, + { + "dollar-sign", '$' + }, + { + "percent-sign", '%' + }, + { + "ampersand", '&' + }, + { + "apostrophe", '\'' + }, + { + "left-parenthesis", '(' + }, + { + "right-parenthesis", ')' + }, + { + "asterisk", '*' + }, + { + "plus-sign", '+' + }, + { + "comma", ',' + }, + { + "hyphen", '-' + }, + { + "hyphen-minus", '-' + }, + { + "period", '.' + }, + { + "full-stop", '.' + }, + { + "slash", '/' + }, + { + "solidus", '/' + }, + { + "zero", '0' + }, + { + "one", '1' + }, + { + "two", '2' + }, + { + "three", '3' + }, + { + "four", '4' + }, + { + "five", '5' + }, + { + "six", '6' + }, + { + "seven", '7' + }, + { + "eight", '8' + }, + { + "nine", '9' + }, + { + "colon", ':' + }, + { + "semicolon", ';' + }, + { + "less-than-sign", '<' + }, + { + "equals-sign", '=' + }, + { + "greater-than-sign", '>' + }, + { + "question-mark", '?' + }, + { + "commercial-at", '@' + }, + { + "left-square-bracket", '[' + }, + { + "backslash", '\\' + }, + { + "reverse-solidus", '\\' + }, + { + "right-square-bracket", ']' + }, + { + "circumflex", '^' + }, + { + "circumflex-accent", '^' + }, + { + "underscore", '_' + }, + { + "low-line", '_' + }, + { + "grave-accent", '`' + }, + { + "left-brace", '{' + }, + { + "left-curly-bracket", '{' + }, + { + "vertical-line", '|' + }, + { + "right-brace", '}' + }, + { + "right-curly-bracket", '}' + }, + { + "tilde", '~' + }, + { + "DEL", '\177' + }, + { + NULL, 0 + } +}; + +/* + * The following array defines the valid character class names. + * The entries must match enum char_classes in regguts.h. + */ +static const char *const classNames[NUM_CCLASSES + 1] = { + "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", + "lower", "print", "punct", "space", "upper", "xdigit", "word", + NULL +}; + +/* + * We do not use the hard-wired Unicode classification tables that Tcl does. + * This is because (a) we need to deal with other encodings besides Unicode, + * and (b) we want to track the behavior of the libc locale routines as + * closely as possible. For example, it wouldn't be unreasonable for a + * locale to not consider every Unicode letter as a letter. So we build + * character classification cvecs by asking libc, even for Unicode. + */ + + +/* + * element - map collating-element name to chr + */ +static chr +element(struct vars *v, /* context */ + const chr *startp, /* points to start of name */ + const chr *endp) /* points just past end of name */ +{ + const struct cname *cn; + size_t len; + + /* generic: one-chr names stand for themselves */ + assert(startp < endp); + len = endp - startp; + if (len == 1) + return *startp; + + NOTE(REG_ULOCALE); + + /* search table */ + for (cn = cnames; cn->name != NULL; cn++) + { + if (strlen(cn->name) == len && + pg_char_and_wchar_strncmp(cn->name, startp, len) == 0) + { + break; /* NOTE BREAK OUT */ + } + } + if (cn->name != NULL) + return CHR(cn->code); + + /* couldn't find it */ + ERR(REG_ECOLLATE); + return 0; +} + +/* + * range - supply cvec for a range, including legality check + */ +static struct cvec * +range(struct vars *v, /* context */ + chr a, /* range start */ + chr b, /* range end, might equal a */ + int cases) /* case-independent? */ +{ + int nchrs; + struct cvec *cv; + chr c, + cc; + + if (a != b && !before(a, b)) + { + ERR(REG_ERANGE); + return NULL; + } + + if (!cases) + { /* easy version */ + cv = getcvec(v, 0, 1); + NOERRN(); + addrange(cv, a, b); + return cv; + } + + /* + * When case-independent, it's hard to decide when cvec ranges are usable, + * so for now at least, we won't try. We use a range for the originally + * specified chrs and then add on any case-equivalents that are outside + * that range as individual chrs. + * + * To ensure sane behavior if someone specifies a very large range, limit + * the allocation size to 100000 chrs (arbitrary) and check for overrun + * inside the loop below. + */ + nchrs = b - a + 1; + if (nchrs <= 0 || nchrs > 100000) + nchrs = 100000; + + cv = getcvec(v, nchrs, 1); + NOERRN(); + addrange(cv, a, b); + + for (c = a; c <= b; c++) + { + cc = pg_wc_tolower(c); + if (cc != c && + (before(cc, a) || before(b, cc))) + { + if (cv->nchrs >= cv->chrspace) + { + ERR(REG_ETOOBIG); + return NULL; + } + addchr(cv, cc); + } + cc = pg_wc_toupper(c); + if (cc != c && + (before(cc, a) || before(b, cc))) + { + if (cv->nchrs >= cv->chrspace) + { + ERR(REG_ETOOBIG); + return NULL; + } + addchr(cv, cc); + } + if (CANCEL_REQUESTED(v->re)) + { + ERR(REG_CANCEL); + return NULL; + } + } + + return cv; +} + +/* + * before - is chr x before chr y, for purposes of range legality? + */ +static int /* predicate */ +before(chr x, chr y) +{ + if (x < y) + return 1; + return 0; +} + +/* + * eclass - supply cvec for an equivalence class + * Must include case counterparts on request. + */ +static struct cvec * +eclass(struct vars *v, /* context */ + chr c, /* Collating element representing the + * equivalence class. */ + int cases) /* all cases? */ +{ + struct cvec *cv; + + /* crude fake equivalence class for testing */ + if ((v->cflags & REG_FAKE) && c == 'x') + { + cv = getcvec(v, 4, 0); + addchr(cv, CHR('x')); + addchr(cv, CHR('y')); + if (cases) + { + addchr(cv, CHR('X')); + addchr(cv, CHR('Y')); + } + return cv; + } + + /* otherwise, none */ + if (cases) + return allcases(v, c); + cv = getcvec(v, 1, 0); + assert(cv != NULL); + addchr(cv, c); + return cv; +} + +/* + * lookupcclass - lookup a character class identified by name + * + * On failure, sets an error code in *v; the result is then garbage. + */ +static enum char_classes +lookupcclass(struct vars *v, /* context (for returning errors) */ + const chr *startp, /* where the name starts */ + const chr *endp) /* just past the end of the name */ +{ + size_t len; + const char *const *namePtr; + int i; + + /* + * Map the name to the corresponding enumerated value. + */ + len = endp - startp; + for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) + { + if (strlen(*namePtr) == len && + pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) + return (enum char_classes) i; + } + + ERR(REG_ECTYPE); + return (enum char_classes) 0; +} + +/* + * cclasscvec - supply cvec for a character class + * + * Must include case counterparts if "cases" is true. + * + * The returned cvec might be either a transient cvec gotten from getcvec(), + * or a permanently cached one from pg_ctype_get_cache(). This is okay + * because callers are not supposed to explicitly free the result either way. + */ +static struct cvec * +cclasscvec(struct vars *v, /* context */ + enum char_classes cclasscode, /* class to build a cvec for */ + int cases) /* case-independent? */ +{ + struct cvec *cv = NULL; + + /* + * Remap lower and upper to alpha if the match is case insensitive. + */ + + if (cases && + (cclasscode == CC_LOWER || + cclasscode == CC_UPPER)) + cclasscode = CC_ALPHA; + + /* + * Now compute the character class contents. For classes that are based + * on the behavior of a <wctype.h> or <ctype.h> function, we use + * pg_ctype_get_cache so that we can cache the results. Other classes + * have definitions that are hard-wired here, and for those we just + * construct a transient cvec on the fly. + * + * NB: keep this code in sync with cclass_column_index(), below. + */ + + switch (cclasscode) + { + case CC_PRINT: + cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode); + break; + case CC_ALNUM: + cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode); + break; + case CC_ALPHA: + cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode); + break; + case CC_WORD: + cv = pg_ctype_get_cache(pg_wc_isword, cclasscode); + break; + case CC_ASCII: + /* hard-wired meaning */ + cv = getcvec(v, 0, 1); + if (cv) + addrange(cv, 0, 0x7f); + break; + case CC_BLANK: + /* hard-wired meaning */ + cv = getcvec(v, 2, 0); + addchr(cv, '\t'); + addchr(cv, ' '); + break; + case CC_CNTRL: + /* hard-wired meaning */ + cv = getcvec(v, 0, 2); + addrange(cv, 0x0, 0x1f); + addrange(cv, 0x7f, 0x9f); + break; + case CC_DIGIT: + cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode); + break; + case CC_PUNCT: + cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode); + break; + case CC_XDIGIT: + + /* + * It's not clear how to define this in non-western locales, and + * even less clear that there's any particular use in trying. So + * just hard-wire the meaning. + */ + cv = getcvec(v, 0, 3); + if (cv) + { + addrange(cv, '0', '9'); + addrange(cv, 'a', 'f'); + addrange(cv, 'A', 'F'); + } + break; + case CC_SPACE: + cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode); + break; + case CC_LOWER: + cv = pg_ctype_get_cache(pg_wc_islower, cclasscode); + break; + case CC_UPPER: + cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode); + break; + case CC_GRAPH: + cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode); + break; + } + + /* If cv is NULL now, the reason must be "out of memory" */ + if (cv == NULL) + ERR(REG_ESPACE); + return cv; +} + +/* + * cclass_column_index - get appropriate high colormap column index for chr + */ +static int +cclass_column_index(struct colormap *cm, chr c) +{ + int colnum = 0; + + /* Shouldn't go through all these pushups for simple chrs */ + assert(c > MAX_SIMPLE_CHR); + + /* + * Note: we should not see requests to consider cclasses that are not + * treated as locale-specific by cclasscvec(), above. + */ + if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) + colnum |= cm->classbits[CC_PRINT]; + if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c)) + colnum |= cm->classbits[CC_ALNUM]; + if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) + colnum |= cm->classbits[CC_ALPHA]; + if (cm->classbits[CC_WORD] && pg_wc_isword(c)) + colnum |= cm->classbits[CC_WORD]; + assert(cm->classbits[CC_ASCII] == 0); + assert(cm->classbits[CC_BLANK] == 0); + assert(cm->classbits[CC_CNTRL] == 0); + if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c)) + colnum |= cm->classbits[CC_DIGIT]; + if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c)) + colnum |= cm->classbits[CC_PUNCT]; + assert(cm->classbits[CC_XDIGIT] == 0); + if (cm->classbits[CC_SPACE] && pg_wc_isspace(c)) + colnum |= cm->classbits[CC_SPACE]; + if (cm->classbits[CC_LOWER] && pg_wc_islower(c)) + colnum |= cm->classbits[CC_LOWER]; + if (cm->classbits[CC_UPPER] && pg_wc_isupper(c)) + colnum |= cm->classbits[CC_UPPER]; + if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c)) + colnum |= cm->classbits[CC_GRAPH]; + + return colnum; +} + +/* + * allcases - supply cvec for all case counterparts of a chr (including itself) + * + * This is a shortcut, preferably an efficient one, for simple characters; + * messy cases are done via range(). + */ +static struct cvec * +allcases(struct vars *v, /* context */ + chr c) /* character to get case equivs of */ +{ + struct cvec *cv; + chr lc, + uc; + + lc = pg_wc_tolower(c); + uc = pg_wc_toupper(c); + + cv = getcvec(v, 2, 0); + addchr(cv, lc); + if (lc != uc) + addchr(cv, uc); + return cv; +} + +/* + * cmp - chr-substring compare + * + * Backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +cmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + return memcmp(VS(x), VS(y), len * sizeof(chr)); +} + +/* + * casecmp - case-independent chr-substring compare + * + * REG_ICASE backrefs need this. It should preferably be efficient. + * Note that it does not need to report anything except equal/unequal. + * Note also that the length is exact, and the comparison should not + * stop at embedded NULs! + */ +static int /* 0 for equal, nonzero for unequal */ +casecmp(const chr *x, const chr *y, /* strings to compare */ + size_t len) /* exact length of comparison */ +{ + for (; len > 0; len--, x++, y++) + { + if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y))) + return 1; + } + return 0; +} |