summaryrefslogtreecommitdiffstats
path: root/lib/glob/smatch.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/glob/smatch.c')
-rw-r--r--lib/glob/smatch.c638
1 files changed, 638 insertions, 0 deletions
diff --git a/lib/glob/smatch.c b/lib/glob/smatch.c
new file mode 100644
index 0000000..379c2d2
--- /dev/null
+++ b/lib/glob/smatch.c
@@ -0,0 +1,638 @@
+/* strmatch.c -- ksh-like extended pattern matching for the shell and filename
+ globbing. */
+
+/* Copyright (C) 1991-2021 Free Software Foundation, Inc.
+
+ This file is part of GNU Bash, the Bourne Again SHell.
+
+ Bash is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Bash is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Bash. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <stdio.h> /* for debugging */
+
+#include "strmatch.h"
+#include <chartypes.h>
+
+#include "bashansi.h"
+#include "shmbutil.h"
+#include "xmalloc.h"
+
+#include <errno.h>
+
+#if !defined (errno)
+extern int errno;
+#endif
+
+#if FNMATCH_EQUIV_FALLBACK
+/* We don't include <fnmatch.h> in order to avoid namespace collisions; the
+ internal strmatch still uses the FNM_ constants. */
+extern int fnmatch (const char *, const char *, int);
+#endif
+
+/* First, compile `sm_loop.c' for single-byte characters. */
+#define CHAR unsigned char
+#define U_CHAR unsigned char
+#define XCHAR char
+#define INT int
+#define L(CS) CS
+#define INVALID -1
+
+#undef STREQ
+#undef STREQN
+#define STREQ(a, b) ((a)[0] == (b)[0] && strcmp(a, b) == 0)
+#define STREQN(a, b, n) ((a)[0] == (b)[0] && strncmp(a, b, n) == 0)
+
+#ifndef GLOBASCII_DEFAULT
+# define GLOBASCII_DEFAULT 0
+#endif
+
+int glob_asciirange = GLOBASCII_DEFAULT;
+
+#if FNMATCH_EQUIV_FALLBACK
+/* Construct a string w1 = "c1" and a pattern w2 = "[[=c2=]]" and pass them
+ to fnmatch to see if wide characters c1 and c2 collate as members of the
+ same equivalence class. We can't really do this portably any other way */
+static int
+_fnmatch_fallback (s, p)
+ int s, p; /* string char, patchar */
+{
+ char s1[2]; /* string */
+ char s2[8]; /* constructed pattern */
+
+ s1[0] = (unsigned char)s;
+ s1[1] = '\0';
+
+ /* reconstruct the pattern */
+ s2[0] = s2[1] = '[';
+ s2[2] = '=';
+ s2[3] = (unsigned char)p;
+ s2[4] = '=';
+ s2[5] = s2[6] = ']';
+ s2[7] = '\0';
+
+ return (fnmatch ((const char *)s2, (const char *)s1, 0));
+}
+#endif
+
+/* We use strcoll(3) for range comparisons in bracket expressions,
+ even though it can have unwanted side effects in locales
+ other than POSIX or US. For instance, in the de locale, [A-Z] matches
+ all characters. If GLOB_ASCIIRANGE is non-zero, and we're not forcing
+ the use of strcoll (e.g., for explicit collating symbols), we use
+ straight ordering as if in the C locale. */
+
+#if defined (HAVE_STRCOLL)
+/* Helper functions for collating symbol equivalence. */
+
+/* Return 0 if C1 == C2 or collates equally if FORCECOLL is non-zero. */
+static int
+charcmp (c1, c2, forcecoll)
+ int c1, c2;
+ int forcecoll;
+{
+ static char s1[2] = { ' ', '\0' };
+ static char s2[2] = { ' ', '\0' };
+ int ret;
+
+ /* Eight bits only. Period. */
+ c1 &= 0xFF;
+ c2 &= 0xFF;
+
+ if (c1 == c2)
+ return (0);
+
+ if (forcecoll == 0 && glob_asciirange)
+ return (c1 - c2);
+
+ s1[0] = c1;
+ s2[0] = c2;
+
+ return (strcoll (s1, s2));
+}
+
+static int
+rangecmp (c1, c2, forcecoll)
+ int c1, c2;
+ int forcecoll;
+{
+ int r;
+
+ r = charcmp (c1, c2, forcecoll);
+
+ /* We impose a total ordering here by returning c1-c2 if charcmp returns 0 */
+ if (r != 0)
+ return r;
+ return (c1 - c2); /* impose total ordering */
+}
+#else /* !HAVE_STRCOLL */
+# define rangecmp(c1, c2, f) ((int)(c1) - (int)(c2))
+#endif /* !HAVE_STRCOLL */
+
+#if defined (HAVE_STRCOLL)
+/* Returns 1 if chars C and EQUIV collate equally in the current locale. */
+static int
+collequiv (c, equiv)
+ int c, equiv;
+{
+ if (charcmp (c, equiv, 1) == 0)
+ return 1;
+
+#if FNMATCH_EQUIV_FALLBACK
+ return (_fnmatch_fallback (c, equiv) == 0);
+#else
+ return 0;
+#endif
+
+}
+#else
+# define collequiv(c, equiv) ((c) == (equiv))
+#endif
+
+#define _COLLSYM _collsym
+#define __COLLSYM __collsym
+#define POSIXCOLL posix_collsyms
+#include "collsyms.h"
+
+static int
+collsym (s, len)
+ CHAR *s;
+ int len;
+{
+ register struct _collsym *csp;
+ char *x;
+
+ x = (char *)s;
+ for (csp = posix_collsyms; csp->name; csp++)
+ {
+ if (STREQN(csp->name, x, len) && csp->name[len] == '\0')
+ return (csp->code);
+ }
+ if (len == 1)
+ return s[0];
+ return INVALID;
+}
+
+/* unibyte character classification */
+#if !defined (isascii) && !defined (HAVE_ISASCII)
+# define isascii(c) ((unsigned int)(c) <= 0177)
+#endif
+
+enum char_class
+ {
+ CC_NO_CLASS = 0,
+ CC_ASCII, CC_ALNUM, CC_ALPHA, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
+ CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_WORD, CC_XDIGIT
+ };
+
+static char const *const cclass_name[] =
+ {
+ "",
+ "ascii", "alnum", "alpha", "blank", "cntrl", "digit", "graph",
+ "lower", "print", "punct", "space", "upper", "word", "xdigit"
+ };
+
+#define N_CHAR_CLASS (sizeof(cclass_name) / sizeof (cclass_name[0]))
+
+static enum char_class
+is_valid_cclass (name)
+ const char *name;
+{
+ enum char_class ret;
+ int i;
+
+ ret = CC_NO_CLASS;
+
+ for (i = 1; i < N_CHAR_CLASS; i++)
+ {
+ if (STREQ (name, cclass_name[i]))
+ {
+ ret = (enum char_class)i;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int
+cclass_test (c, char_class)
+ int c;
+ enum char_class char_class;
+{
+ int result;
+
+ switch (char_class)
+ {
+ case CC_ASCII:
+ result = isascii (c);
+ break;
+ case CC_ALNUM:
+ result = ISALNUM (c);
+ break;
+ case CC_ALPHA:
+ result = ISALPHA (c);
+ break;
+ case CC_BLANK:
+ result = ISBLANK (c);
+ break;
+ case CC_CNTRL:
+ result = ISCNTRL (c);
+ break;
+ case CC_DIGIT:
+ result = ISDIGIT (c);
+ break;
+ case CC_GRAPH:
+ result = ISGRAPH (c);
+ break;
+ case CC_LOWER:
+ result = ISLOWER (c);
+ break;
+ case CC_PRINT:
+ result = ISPRINT (c);
+ break;
+ case CC_PUNCT:
+ result = ISPUNCT (c);
+ break;
+ case CC_SPACE:
+ result = ISSPACE (c);
+ break;
+ case CC_UPPER:
+ result = ISUPPER (c);
+ break;
+ case CC_WORD:
+ result = (ISALNUM (c) || c == '_');
+ break;
+ case CC_XDIGIT:
+ result = ISXDIGIT (c);
+ break;
+ default:
+ result = -1;
+ break;
+ }
+
+ return result;
+}
+
+static int
+is_cclass (c, name)
+ int c;
+ const char *name;
+{
+ enum char_class char_class;
+ int result;
+
+ char_class = is_valid_cclass (name);
+ if (char_class == CC_NO_CLASS)
+ return -1;
+
+ result = cclass_test (c, char_class);
+ return (result);
+}
+
+/* Now include `sm_loop.c' for single-byte characters. */
+/* The result of FOLD is an `unsigned char' */
+# define FOLD(c) ((flags & FNM_CASEFOLD) \
+ ? TOLOWER ((unsigned char)c) \
+ : ((unsigned char)c))
+
+#if !defined (__CYGWIN__)
+# define ISDIRSEP(c) ((c) == '/')
+#else
+# define ISDIRSEP(c) ((c) == '/' || (c) == '\\')
+#endif /* __CYGWIN__ */
+#define PATHSEP(c) (ISDIRSEP(c) || (c) == 0)
+
+# define PDOT_OR_DOTDOT(s) (s[0] == '.' && (PATHSEP (s[1]) || (s[1] == '.' && PATHSEP (s[2]))))
+# define SDOT_OR_DOTDOT(s) (s[0] == '.' && (s[1] == 0 || (s[1] == '.' && s[2] == 0)))
+
+#define FCT internal_strmatch
+#define GMATCH gmatch
+#define COLLSYM collsym
+#define PARSE_COLLSYM parse_collsym
+#define BRACKMATCH brackmatch
+#define PATSCAN glob_patscan
+#define STRCOMPARE strcompare
+#define EXTMATCH extmatch
+#define DEQUOTE_PATHNAME udequote_pathname
+#define STRUCT smat_struct
+#define STRCHR(S, C) strchr((S), (C))
+#define MEMCHR(S, C, N) memchr((S), (C), (N))
+#define STRCOLL(S1, S2) strcoll((S1), (S2))
+#define STRLEN(S) strlen(S)
+#define STRCMP(S1, S2) strcmp((S1), (S2))
+#define RANGECMP(C1, C2, F) rangecmp((C1), (C2), (F))
+#define COLLEQUIV(C1, C2) collequiv((C1), (C2))
+#define CTYPE_T enum char_class
+#define IS_CCLASS(C, S) is_cclass((C), (S))
+#include "sm_loop.c"
+
+#if HANDLE_MULTIBYTE
+
+# define CHAR wchar_t
+# define U_CHAR wint_t
+# define XCHAR wchar_t
+# define INT wint_t
+# define L(CS) L##CS
+# define INVALID WEOF
+
+# undef STREQ
+# undef STREQN
+# define STREQ(s1, s2) ((wcscmp (s1, s2) == 0))
+# define STREQN(a, b, n) ((a)[0] == (b)[0] && wcsncmp(a, b, n) == 0)
+
+extern char *mbsmbchar PARAMS((const char *));
+
+#if FNMATCH_EQUIV_FALLBACK
+/* Construct a string w1 = "c1" and a pattern w2 = "[[=c2=]]" and pass them
+ to fnmatch to see if wide characters c1 and c2 collate as members of the
+ same equivalence class. We can't really do this portably any other way */
+static int
+_fnmatch_fallback_wc (c1, c2)
+ wchar_t c1, c2; /* string char, patchar */
+{
+ char w1[MB_LEN_MAX+1]; /* string */
+ char w2[MB_LEN_MAX+8]; /* constructed pattern */
+ int l1, l2;
+
+ l1 = wctomb (w1, c1);
+ if (l1 == -1)
+ return (2);
+ w1[l1] = '\0';
+
+ /* reconstruct the pattern */
+ w2[0] = w2[1] = '[';
+ w2[2] = '=';
+ l2 = wctomb (w2+3, c2);
+ if (l2 == -1)
+ return (2);
+ w2[l2+3] = '=';
+ w2[l2+4] = w2[l2+5] = ']';
+ w2[l2+6] = '\0';
+
+ return (fnmatch ((const char *)w2, (const char *)w1, 0));
+}
+#endif
+
+static int
+charcmp_wc (c1, c2, forcecoll)
+ wint_t c1, c2;
+ int forcecoll;
+{
+ static wchar_t s1[2] = { L' ', L'\0' };
+ static wchar_t s2[2] = { L' ', L'\0' };
+ int r;
+
+ if (c1 == c2)
+ return 0;
+
+ if (forcecoll == 0 && glob_asciirange && c1 <= UCHAR_MAX && c2 <= UCHAR_MAX)
+ return ((int)(c1 - c2));
+
+ s1[0] = c1;
+ s2[0] = c2;
+
+ return (wcscoll (s1, s2));
+}
+
+static int
+rangecmp_wc (c1, c2, forcecoll)
+ wint_t c1, c2;
+ int forcecoll;
+{
+ int r;
+
+ r = charcmp_wc (c1, c2, forcecoll);
+
+ /* We impose a total ordering here by returning c1-c2 if charcmp returns 0,
+ as we do above in the single-byte case. */
+ if (r != 0 || forcecoll)
+ return r;
+ return ((int)(c1 - c2)); /* impose total ordering */
+}
+
+/* Returns 1 if wide chars C and EQUIV collate equally in the current locale. */
+static int
+collequiv_wc (c, equiv)
+ wint_t c, equiv;
+{
+ wchar_t s, p;
+
+ if (charcmp_wc (c, equiv, 1) == 0)
+ return 1;
+
+#if FNMATCH_EQUIV_FALLBACK
+/* We check explicitly for success (fnmatch returns 0) to avoid problems if
+ our local definition of FNM_NOMATCH (strmatch.h) doesn't match the
+ system's (fnmatch.h). We don't care about error return values here. */
+
+ s = c;
+ p = equiv;
+ return (_fnmatch_fallback_wc (s, p) == 0);
+#else
+ return 0;
+#endif
+}
+
+/* Helper function for collating symbol. */
+# define _COLLSYM _collwcsym
+# define __COLLSYM __collwcsym
+# define POSIXCOLL posix_collwcsyms
+# include "collsyms.h"
+
+static wint_t
+collwcsym (s, len)
+ wchar_t *s;
+ int len;
+{
+ register struct _collwcsym *csp;
+
+ for (csp = posix_collwcsyms; csp->name; csp++)
+ {
+ if (STREQN(csp->name, s, len) && csp->name[len] == L'\0')
+ return (csp->code);
+ }
+ if (len == 1)
+ return s[0];
+ return INVALID;
+}
+
+static int
+is_wcclass (wc, name)
+ wint_t wc;
+ wchar_t *name;
+{
+ char *mbs;
+ mbstate_t state;
+ size_t mbslength;
+ wctype_t desc;
+ int want_word;
+
+ if ((wctype ("ascii") == (wctype_t)0) && (wcscmp (name, L"ascii") == 0))
+ {
+ int c;
+
+ if ((c = wctob (wc)) == EOF)
+ return 0;
+ else
+ return (c <= 0x7F);
+ }
+
+ want_word = (wcscmp (name, L"word") == 0);
+ if (want_word)
+ name = L"alnum";
+
+ memset (&state, '\0', sizeof (mbstate_t));
+ mbs = (char *) malloc (wcslen(name) * MB_CUR_MAX + 1);
+ if (mbs == 0)
+ return -1;
+ mbslength = wcsrtombs (mbs, (const wchar_t **)&name, (wcslen(name) * MB_CUR_MAX + 1), &state);
+
+ if (mbslength == (size_t)-1 || mbslength == (size_t)-2)
+ {
+ free (mbs);
+ return -1;
+ }
+ desc = wctype (mbs);
+ free (mbs);
+
+ if (desc == (wctype_t)0)
+ return -1;
+
+ if (want_word)
+ return (iswctype (wc, desc) || wc == L'_');
+ else
+ return (iswctype (wc, desc));
+}
+
+/* Return 1 if there are no char class [:class:] expressions (degenerate case)
+ or only posix-specified (C locale supported) char class expressions in
+ PATTERN. These are the ones where it's safe to punt to the single-byte
+ code, since wide character support allows locale-defined char classes.
+ This only uses single-byte code, but is only needed to support multibyte
+ locales. */
+static int
+posix_cclass_only (pattern)
+ char *pattern;
+{
+ char *p, *p1;
+ char cc[16]; /* sufficient for all valid posix char class names */
+ enum char_class valid;
+
+ p = pattern;
+ while (p = strchr (p, '['))
+ {
+ if (p[1] != ':')
+ {
+ p++;
+ continue;
+ }
+ p += 2; /* skip past "[:" */
+ /* Find end of char class expression */
+ for (p1 = p; *p1; p1++)
+ if (*p1 == ':' && p1[1] == ']')
+ break;
+ if (*p1 == 0) /* no char class expression found */
+ break;
+ /* Find char class name and validate it against posix char classes */
+ if ((p1 - p) >= sizeof (cc))
+ return 0;
+ bcopy (p, cc, p1 - p);
+ cc[p1 - p] = '\0';
+ valid = is_valid_cclass (cc);
+ if (valid == CC_NO_CLASS)
+ return 0; /* found unrecognized char class name */
+
+ p = p1 + 2; /* found posix char class name */
+ }
+
+ return 1; /* no char class names or only posix */
+}
+
+/* Now include `sm_loop.c' for multibyte characters. */
+#define FOLD(c) ((flags & FNM_CASEFOLD) && iswupper (c) ? towlower (c) : (c))
+
+# if !defined (__CYGWIN__)
+# define ISDIRSEP(c) ((c) == L'/')
+# else
+# define ISDIRSEP(c) ((c) == L'/' || (c) == L'\\')
+# endif /* __CYGWIN__ */
+# define PATHSEP(c) (ISDIRSEP(c) || (c) == L'\0')
+
+# define PDOT_OR_DOTDOT(w) (w[0] == L'.' && (PATHSEP(w[1]) || (w[1] == L'.' && PATHSEP(w[2]))))
+# define SDOT_OR_DOTDOT(w) (w[0] == L'.' && (w[1] == L'\0' || (w[1] == L'.' && w[2] == L'\0')))
+
+#define FCT internal_wstrmatch
+#define GMATCH gmatch_wc
+#define COLLSYM collwcsym
+#define PARSE_COLLSYM parse_collwcsym
+#define BRACKMATCH brackmatch_wc
+#define PATSCAN glob_patscan_wc
+#define STRCOMPARE wscompare
+#define EXTMATCH extmatch_wc
+#define DEQUOTE_PATHNAME wcdequote_pathname
+#define STRUCT wcsmat_struct
+#define STRCHR(S, C) wcschr((S), (C))
+#define MEMCHR(S, C, N) wmemchr((S), (C), (N))
+#define STRCOLL(S1, S2) wcscoll((S1), (S2))
+#define STRLEN(S) wcslen(S)
+#define STRCMP(S1, S2) wcscmp((S1), (S2))
+#define RANGECMP(C1, C2, F) rangecmp_wc((C1), (C2), (F))
+#define COLLEQUIV(C1, C2) collequiv_wc((C1), (C2))
+#define CTYPE_T enum char_class
+#define IS_CCLASS(C, S) is_wcclass((C), (S))
+#include "sm_loop.c"
+
+#endif /* HAVE_MULTIBYTE */
+
+int
+xstrmatch (pattern, string, flags)
+ char *pattern;
+ char *string;
+ int flags;
+{
+#if HANDLE_MULTIBYTE
+ int ret;
+ size_t n;
+ wchar_t *wpattern, *wstring;
+ size_t plen, slen, mplen, mslen;
+
+ if (MB_CUR_MAX == 1)
+ return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
+
+ if (mbsmbchar (string) == 0 && mbsmbchar (pattern) == 0 && posix_cclass_only (pattern))
+ return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
+
+ n = xdupmbstowcs (&wpattern, NULL, pattern);
+ if (n == (size_t)-1 || n == (size_t)-2)
+ return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
+
+ n = xdupmbstowcs (&wstring, NULL, string);
+ if (n == (size_t)-1 || n == (size_t)-2)
+ {
+ free (wpattern);
+ return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
+ }
+
+ ret = internal_wstrmatch (wpattern, wstring, flags);
+
+ free (wpattern);
+ free (wstring);
+
+ return ret;
+#else
+ return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags));
+#endif /* !HANDLE_MULTIBYTE */
+}