1 files changed, 771 insertions, 0 deletions
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
new file mode 100644
index 0000000..b5f3a73
--- /dev/null
+++ b/src/backend/regex/regc_locale.c
@@ -0,0 +1,771 @@
+/*
+ * regc_locale.c --
+ *
+ *	This file contains locale-specific regexp routines.
+ *	This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ *
+ * This software is copyrighted by the Regents of the University of
+ * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+ * Corporation and other parties.  The following terms apply to all files
+ * associated with the software unless explicitly disclaimed in
+ * individual files.
+ *
+ * The authors hereby grant permission to use, copy, modify, distribute,
+ * and license this software and its documentation for any purpose, provided
+ * that existing copyright notices are retained in all copies and that this
+ * notice is included verbatim in any distributions. No written agreement,
+ * license, or royalty fee is required for any of the authorized uses.
+ * Modifications to this software may be copyrighted by their authors
+ * and need not follow the licensing terms described here, provided that
+ * the new terms are clearly indicated on the first page of each file where
+ * they apply.
+ *
+ * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+ * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+ * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+ * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+ * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+ * MODIFICATIONS.
+ *
+ * GOVERNMENT USE: If you are acquiring this software on behalf of the
+ * U.S. government, the Government shall have only "Restricted Rights"
+ * in the software and related documentation as defined in the Federal
+ * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+ * are acquiring the software on behalf of the Department of Defense, the
+ * software shall be classified as "Commercial Computer Software" and the
+ * Government shall have only "Restricted Rights" as defined in Clause
+ * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+ * authors grant the U.S. Government and others acting in its behalf
+ * permission to use and distribute the software in accordance with the
+ * terms specified in this license.
+ *
+ * src/backend/regex/regc_locale.c
+ */
+
+/* Collating-element names for ASCII characters; a NULL name ends the table */
+
+static const struct cname
+{
+	const char *name;			/* collating-element name */
+	const char	code;			/* character the name stands for */
+}			cnames[] =
+
+{
+	{
+		.name = "NUL", .code = '\0'
+	},
+	{
+		.name = "SOH", .code = '\001'
+	},
+	{
+		.name = "STX", .code = '\002'
+	},
+	{
+		.name = "ETX", .code = '\003'
+	},
+	{
+		.name = "EOT", .code = '\004'
+	},
+	{
+		.name = "ENQ", .code = '\005'
+	},
+	{
+		.name = "ACK", .code = '\006'
+	},
+	{
+		.name = "BEL", .code = '\007'
+	},
+	{
+		.name = "alert", .code = '\007'
+	},
+	{
+		.name = "BS", .code = '\010'
+	},
+	{
+		.name = "backspace", .code = '\b'
+	},
+	{
+		.name = "HT", .code = '\011'
+	},
+	{
+		.name = "tab", .code = '\t'
+	},
+	{
+		.name = "LF", .code = '\012'
+	},
+	{
+		.name = "newline", .code = '\n'
+	},
+	{
+		.name = "VT", .code = '\013'
+	},
+	{
+		.name = "vertical-tab", .code = '\v'
+	},
+	{
+		.name = "FF", .code = '\014'
+	},
+	{
+		.name = "form-feed", .code = '\f'
+	},
+	{
+		.name = "CR", .code = '\015'
+	},
+	{
+		.name = "carriage-return", .code = '\r'
+	},
+	{
+		.name = "SO", .code = '\016'
+	},
+	{
+		.name = "SI", .code = '\017'
+	},
+	{
+		.name = "DLE", .code = '\020'
+	},
+	{
+		.name = "DC1", .code = '\021'
+	},
+	{
+		.name = "DC2", .code = '\022'
+	},
+	{
+		.name = "DC3", .code = '\023'
+	},
+	{
+		.name = "DC4", .code = '\024'
+	},
+	{
+		.name = "NAK", .code = '\025'
+	},
+	{
+		.name = "SYN", .code = '\026'
+	},
+	{
+		.name = "ETB", .code = '\027'
+	},
+	{
+		.name = "CAN", .code = '\030'
+	},
+	{
+		.name = "EM", .code = '\031'
+	},
+	{
+		.name = "SUB", .code = '\032'
+	},
+	{
+		.name = "ESC", .code = '\033'
+	},
+	{
+		.name = "IS4", .code = '\034'
+	},
+	{
+		.name = "FS", .code = '\034'
+	},
+	{
+		.name = "IS3", .code = '\035'
+	},
+	{
+		.name = "GS", .code = '\035'
+	},
+	{
+		.name = "IS2", .code = '\036'
+	},
+	{
+		.name = "RS", .code = '\036'
+	},
+	{
+		.name = "IS1", .code = '\037'
+	},
+	{
+		.name = "US", .code = '\037'
+	},
+	{
+		.name = "space", .code = ' '
+	},
+	{
+		.name = "exclamation-mark", .code = '!'
+	},
+	{
+		.name = "quotation-mark", .code = '"'
+	},
+	{
+		.name = "number-sign", .code = '#'
+	},
+	{
+		.name = "dollar-sign", .code = '$'
+	},
+	{
+		.name = "percent-sign", .code = '%'
+	},
+	{
+		.name = "ampersand", .code = '&'
+	},
+	{
+		.name = "apostrophe", .code = '\''
+	},
+	{
+		.name = "left-parenthesis", .code = '('
+	},
+	{
+		.name = "right-parenthesis", .code = ')'
+	},
+	{
+		.name = "asterisk", .code = '*'
+	},
+	{
+		.name = "plus-sign", .code = '+'
+	},
+	{
+		.name = "comma", .code = ','
+	},
+	{
+		.name = "hyphen", .code = '-'
+	},
+	{
+		.name = "hyphen-minus", .code = '-'
+	},
+	{
+		.name = "period", .code = '.'
+	},
+	{
+		.name = "full-stop", .code = '.'
+	},
+	{
+		.name = "slash", .code = '/'
+	},
+	{
+		.name = "solidus", .code = '/'
+	},
+	{
+		.name = "zero", .code = '0'
+	},
+	{
+		.name = "one", .code = '1'
+	},
+	{
+		.name = "two", .code = '2'
+	},
+	{
+		.name = "three", .code = '3'
+	},
+	{
+		.name = "four", .code = '4'
+	},
+	{
+		.name = "five", .code = '5'
+	},
+	{
+		.name = "six", .code = '6'
+	},
+	{
+		.name = "seven", .code = '7'
+	},
+	{
+		.name = "eight", .code = '8'
+	},
+	{
+		.name = "nine", .code = '9'
+	},
+	{
+		.name = "colon", .code = ':'
+	},
+	{
+		.name = "semicolon", .code = ';'
+	},
+	{
+		.name = "less-than-sign", .code = '<'
+	},
+	{
+		.name = "equals-sign", .code = '='
+	},
+	{
+		.name = "greater-than-sign", .code = '>'
+	},
+	{
+		.name = "question-mark", .code = '?'
+	},
+	{
+		.name = "commercial-at", .code = '@'
+	},
+	{
+		.name = "left-square-bracket", .code = '['
+	},
+	{
+		.name = "backslash", .code = '\\'
+	},
+	{
+		.name = "reverse-solidus", .code = '\\'
+	},
+	{
+		.name = "right-square-bracket", .code = ']'
+	},
+	{
+		.name = "circumflex", .code = '^'
+	},
+	{
+		.name = "circumflex-accent", .code = '^'
+	},
+	{
+		.name = "underscore", .code = '_'
+	},
+	{
+		.name = "low-line", .code = '_'
+	},
+	{
+		.name = "grave-accent", .code = '`'
+	},
+	{
+		.name = "left-brace", .code = '{'
+	},
+	{
+		.name = "left-curly-bracket", .code = '{'
+	},
+	{
+		.name = "vertical-line", .code = '|'
+	},
+	{
+		.name = "right-brace", .code = '}'
+	},
+	{
+		.name = "right-curly-bracket", .code = '}'
+	},
+	{
+		.name = "tilde", .code = '~'
+	},
+	{
+		.name = "DEL", .code = '\177'
+	},
+	{
+		.name = NULL, .code = 0
+	}
+};
+
+/*
+ * Names of the recognized character classes, terminated by a NULL entry.
+ * Keep this list in the same order as enum char_classes in regguts.h.
+ */
+static const char *const classNames[NUM_CCLASSES + 1] = {
+	"alnum", "alpha", "ascii", "blank", "cntrl",
+	"digit", "graph", "lower", "print", "punct",
+	"space", "upper", "xdigit", "word", NULL
+};
+
+/*
+ * We do not use the hard-wired Unicode classification tables that Tcl does.
+ * This is because (a) we need to deal with other encodings besides Unicode,
+ * and (b) we want to track the behavior of the libc locale routines as
+ * closely as possible.  For example, it wouldn't be unreasonable for a
+ * locale to not consider every Unicode letter as a letter.  So we build
+ * character classification cvecs by asking libc, even for Unicode.
+ */
+
+
+/*
+ * element - translate the name of a collating element into a chr
+ *
+ * A name one chr long denotes that chr; longer names are looked up in
+ * cnames[].  An unknown name reports REG_ECOLLATE and yields 0.
+ */
+static chr
+element(struct vars *v,			/* context */
+		const chr *startp,		/* points to start of name */
+		const chr *endp)		/* points just past end of name */
+{
+	const struct cname *entry;
+	size_t		namelen;
+
+	assert(startp < endp);
+	namelen = endp - startp;
+
+	if (namelen == 1)
+		return *startp;			/* a single chr names itself */
+
+	NOTE(REG_ULOCALE);
+
+	/* scan the table for an entry of the right length and spelling */
+	for (entry = cnames; entry->name != NULL; entry++)
+	{
+		if (strlen(entry->name) != namelen)
+			continue;
+		if (pg_char_and_wchar_strncmp(entry->name, startp, namelen) == 0)
+			return CHR(entry->code);
+	}
+
+	/* no such name */
+	ERR(REG_ECOLLATE);
+	return 0;
+}
+
+/*
+ * range - build the cvec for a bracket range a-b, validating it first
+ *
+ * Reports REG_ERANGE and returns NULL when b sorts before a.
+ */
+static struct cvec *
+range(struct vars *v,			/* context */
+	  chr a,					/* range start */
+	  chr b,					/* range end, might equal a */
+	  int cases)				/* case-independent? */
+{
+	struct cvec *cv;
+	int			nchrs;
+	int			pass;
+	chr			c;
+	chr			equiv;
+
+	/* a one-chr range is always legal; otherwise a must precede b */
+	if (!(a == b || before(a, b)))
+	{
+		ERR(REG_ERANGE);
+		return NULL;
+	}
+
+	/*
+	 * With case folding in effect we make no attempt to work out which
+	 * case-equivalents could be expressed as ranges.  The chrs the user
+	 * wrote go in as one range, and every case counterpart that falls
+	 * outside it is appended as an individual chr.
+	 *
+	 * A huge range could demand an absurd amount of chr space, so the
+	 * request is capped at 100000 chrs (an arbitrary figure) and the loop
+	 * below gives up with REG_ETOOBIG once that space is exhausted.
+	 */
+	if (cases)
+	{
+		nchrs = b - a + 1;
+		if (nchrs <= 0 || nchrs > 100000)
+			nchrs = 100000;
+	}
+	else
+		nchrs = 0;				/* case-sensitive: the range alone suffices */
+
+	cv = getcvec(v, nchrs, 1);
+	NOERRN();
+	addrange(cv, a, b);
+	if (!cases)
+		return cv;
+
+	for (c = a; c <= b; c++)
+	{
+		/* pass 0 examines the lower-case form, pass 1 the upper-case form */
+		for (pass = 0; pass < 2; pass++)
+		{
+			equiv = (pass == 0) ? pg_wc_tolower(c) : pg_wc_toupper(c);
+
+			/* nothing to add if unchanged or already inside the range */
+			if (equiv == c)
+				continue;
+			if (!before(equiv, a) && !before(b, equiv))
+				continue;
+
+			if (cv->nchrs >= cv->chrspace)
+			{
+				ERR(REG_ETOOBIG);
+				return NULL;
+			}
+			addchr(cv, equiv);
+		}
+
+		if (CANCEL_REQUESTED(v->re))
+		{
+			ERR(REG_CANCEL);
+			return NULL;
+		}
+	}
+
+	return cv;
+}
+
+/*
+ * before - does chr x precede chr y, for purposes of range legality?
+ */
+static int						/* predicate */
+before(chr x, chr y)
+{
+	int			precedes = (x < y);	/* plain numeric comparison */
+
+	return precedes;
+}
+
+/*
+ * eclass - build the cvec for the equivalence class of collating element c
+ * When "cases" is set, case counterparts are included as well.
+ */
+static struct cvec *
+eclass(struct vars *v,			/* context */
+	   chr c,					/* Collating element representing the
+								 * equivalence class. */
+	   int cases)				/* all cases? */
+{
+	struct cvec *cv;
+
+	if (!((v->cflags & REG_FAKE) && c == 'x'))
+	{
+		/* no real equivalence classes: just c, plus case variants if asked */
+		if (cases)
+			return allcases(v, c);
+		cv = getcvec(v, 1, 0);
+		assert(cv != NULL);
+		addchr(cv, c);
+		return cv;
+	}
+
+	/* REG_FAKE testing hook: treat x and y as one crude equivalence class */
+	cv = getcvec(v, 4, 0);
+	addchr(cv, CHR('x'));
+	addchr(cv, CHR('y'));
+	if (cases)
+	{
+		addchr(cv, CHR('X'));
+		addchr(cv, CHR('Y'));
+	}
+	return cv;
+}
+
+/*
+ * lookupcclass - find the character class that a name refers to
+ *
+ * An unrecognized name raises REG_ECTYPE through v, and the value returned
+ * in that case carries no meaning.
+ */
+static enum char_classes
+lookupcclass(struct vars *v,	/* context (for returning errors) */
+			 const chr *startp, /* where the name starts */
+			 const chr *endp)	/* just past the end of the name */
+{
+	size_t		namelen = endp - startp;
+	int			idx;
+
+	/* classNames[] follows enum order, so the matching index is the result */
+	for (idx = 0; classNames[idx] != NULL; idx++)
+	{
+		const char *candidate = classNames[idx];
+
+		if (strlen(candidate) != namelen)
+			continue;
+		if (pg_char_and_wchar_strncmp(candidate, startp, namelen) == 0)
+			return (enum char_classes) idx;
+	}
+
+	ERR(REG_ECTYPE);
+	return (enum char_classes) 0;
+}
+
+/*
+ * cclasscvec - supply cvec for a character class
+ *
+ * Must include case counterparts if "cases" is true.
+ *
+ * The returned cvec might be either a transient cvec gotten from getcvec(),
+ * or a permanently cached one from pg_ctype_get_cache().  This is okay
+ * because callers are not supposed to explicitly free the result either way.
+ * Returns NULL, after reporting REG_ESPACE, if no cvec could be obtained.
+ */
+static struct cvec *
+cclasscvec(struct vars *v,		/* context */
+		   enum char_classes cclasscode,	/* class to build a cvec for */
+		   int cases)			/* case-independent? */
+{
+	struct cvec *cv = NULL;
+
+	/* Remap lower and upper to alpha if the match is case insensitive. */
+	if (cases && (cclasscode == CC_LOWER || cclasscode == CC_UPPER))
+		cclasscode = CC_ALPHA;
+
+	/*
+	 * Now compute the character class contents.  Classes defined by a
+	 * <wctype.h> or <ctype.h> function go through pg_ctype_get_cache so that
+	 * the results can be cached.  The rest have meanings hard-wired here and
+	 * get a transient cvec built on the fly; those cases must not touch the
+	 * cvec when getcvec() has returned NULL.
+	 *
+	 * NB: keep this code in sync with cclass_column_index(), below.
+	 */
+	switch (cclasscode)
+	{
+		case CC_PRINT:
+			cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
+			break;
+		case CC_ALNUM:
+			cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
+			break;
+		case CC_ALPHA:
+			cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
+			break;
+		case CC_WORD:
+			cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
+			break;
+		case CC_ASCII:
+			/* hard-wired meaning */
+			cv = getcvec(v, 0, 1);
+			if (cv)
+				addrange(cv, 0, 0x7f);
+			break;
+		case CC_BLANK:
+			/* hard-wired meaning */
+			cv = getcvec(v, 2, 0);
+			if (cv)
+			{
+				addchr(cv, '\t');
+				addchr(cv, ' ');
+			}
+			break;
+		case CC_CNTRL:
+			/* hard-wired meaning */
+			cv = getcvec(v, 0, 2);
+			if (cv)
+			{
+				addrange(cv, 0x0, 0x1f);
+				addrange(cv, 0x7f, 0x9f);
+			}
+			break;
+		case CC_DIGIT:
+			cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
+			break;
+		case CC_PUNCT:
+			cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
+			break;
+		case CC_XDIGIT:
+			/*
+			 * It's not clear how to define this in non-western locales, and
+			 * even less clear that there's any particular use in trying. So
+			 * just hard-wire the meaning.
+			 */
+			cv = getcvec(v, 0, 3);
+			if (cv)
+			{
+				addrange(cv, '0', '9');
+				addrange(cv, 'a', 'f');
+				addrange(cv, 'A', 'F');
+			}
+			break;
+		case CC_SPACE:
+			cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
+			break;
+		case CC_LOWER:
+			cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
+			break;
+		case CC_UPPER:
+			cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
+			break;
+		case CC_GRAPH:
+			cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
+			break;
+	}
+
+	/* If cv is NULL now, the reason must be "out of memory" */
+	if (cv == NULL)
+		ERR(REG_ESPACE);
+	return cv;
+}
+
+/*
+ * cclass_column_index - get appropriate high colormap column index for chr
+ */
+static int
+cclass_column_index(struct colormap *cm, chr c)
+{
+	int			colnum = 0;
+
+	/* Shouldn't go through all these pushups for simple chrs */
+	assert(c > MAX_SIMPLE_CHR);
+
+	/*
+	 * Note: we should not see requests to consider cclasses that are not
+	 * treated as locale-specific by cclasscvec(), above.
+	 */
+	if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
+		colnum |= cm->classbits[CC_PRINT];
+	if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
+		colnum |= cm->classbits[CC_ALNUM];
+	if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
+		colnum |= cm->classbits[CC_ALPHA];
+	if (cm->classbits[CC_WORD] && pg_wc_isword(c))
+		colnum |= cm->classbits[CC_WORD];
+	assert(cm->classbits[CC_ASCII] == 0);
+	assert(cm->classbits[CC_BLANK] == 0);
+	assert(cm->classbits[CC_CNTRL] == 0);
+	if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
+		colnum |= cm->classbits[CC_DIGIT];
+	if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
+		colnum |= cm->classbits[CC_PUNCT];
+	assert(cm->classbits[CC_XDIGIT] == 0);
+	if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
+		colnum |= cm->classbits[CC_SPACE];
+	if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
+		colnum |= cm->classbits[CC_LOWER];
+	if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
+		colnum |= cm->classbits[CC_UPPER];
+	if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
+		colnum |= cm->classbits[CC_GRAPH];
+
+	return colnum;
+}
+
+/*
+ * allcases - build a cvec holding the case variants of a chr
+ *
+ * The result holds the lower-case form of c and, when it differs, the
+ * upper-case form.  Anything messier is handled by range() instead.
+ */
+static struct cvec *
+allcases(struct vars *v,		/* context */
+		 chr c)					/* character to get case equivs of */
+{
+	const chr	lower = pg_wc_tolower(c);
+	const chr	upper = pg_wc_toupper(c);
+	struct cvec *result;
+
+	result = getcvec(v, 2, 0);
+	addchr(result, lower);
+
+	/* skip the upper-case form when it would merely repeat the entry */
+	if (upper != lower)
+		addchr(result, upper);
+
+	return result;
+}
+
+/*
+ * cmp - compare two chr substrings for equality
+ *
+ * Backrefs rely on this, so it should be fast.  Only equal/unequal matters to
+ * callers.  Exactly len chrs are compared, embedded NULs included.
+ */
+static int						/* 0 for equal, nonzero for unequal */
+cmp(const chr *x, const chr *y, /* strings to compare */
+	size_t len)					/* exact length of comparison */
+{
+	size_t		nbytes = len * sizeof(chr);
+
+	return memcmp(VS(x), VS(y), nbytes);
+}
+
+/*
+ * casecmp - compare two chr substrings, ignoring case
+ *
+ * REG_ICASE backrefs rely on this, so it should be fast.  Only equal/unequal
+ * matters to callers.  Exactly len chrs are compared, embedded NULs included.
+ */
+static int						/* 0 for equal, nonzero for unequal */
+casecmp(const chr *x, const chr *y, /* strings to compare */
+		size_t len)				/* exact length of comparison */
+{
+	size_t		i;
+
+	for (i = 0; i < len; i++)
+	{
+		if (x[i] != y[i] && pg_wc_tolower(x[i]) != pg_wc_tolower(y[i]))
+			return 1;			/* differ even after lower-casing both */
+	}
+	return 0;
+}