diff options
Diffstat (limited to 'lib/glob/smatch.c')
-rw-r--r-- | lib/glob/smatch.c | 638 |
1 files changed, 638 insertions, 0 deletions
diff --git a/lib/glob/smatch.c b/lib/glob/smatch.c new file mode 100644 index 0000000..379c2d2 --- /dev/null +++ b/lib/glob/smatch.c @@ -0,0 +1,638 @@ +/* strmatch.c -- ksh-like extended pattern matching for the shell and filename + globbing. */ + +/* Copyright (C) 1991-2021 Free Software Foundation, Inc. + + This file is part of GNU Bash, the Bourne Again SHell. + + Bash is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Bash is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Bash. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <config.h> + +#include <stdio.h> /* for debugging */ + +#include "strmatch.h" +#include <chartypes.h> + +#include "bashansi.h" +#include "shmbutil.h" +#include "xmalloc.h" + +#include <errno.h> + +#if !defined (errno) +extern int errno; +#endif + +#if FNMATCH_EQUIV_FALLBACK +/* We don't include <fnmatch.h> in order to avoid namespace collisions; the + internal strmatch still uses the FNM_ constants. */ +extern int fnmatch (const char *, const char *, int); +#endif + +/* First, compile `sm_loop.c' for single-byte characters. */ +#define CHAR unsigned char +#define U_CHAR unsigned char +#define XCHAR char +#define INT int +#define L(CS) CS +#define INVALID -1 + +#undef STREQ +#undef STREQN +#define STREQ(a, b) ((a)[0] == (b)[0] && strcmp(a, b) == 0) +#define STREQN(a, b, n) ((a)[0] == (b)[0] && strncmp(a, b, n) == 0) + +#ifndef GLOBASCII_DEFAULT +# define GLOBASCII_DEFAULT 0 +#endif + +int glob_asciirange = GLOBASCII_DEFAULT; + +#if FNMATCH_EQUIV_FALLBACK +/* Construct a string w1 = "c1" and a pattern w2 = "[[=c2=]]" and pass them + to fnmatch to see if wide characters c1 and c2 collate as members of the + same equivalence class. We can't really do this portably any other way */ +static int +_fnmatch_fallback (s, p) + int s, p; /* string char, patchar */ +{ + char s1[2]; /* string */ + char s2[8]; /* constructed pattern */ + + s1[0] = (unsigned char)s; + s1[1] = '\0'; + + /* reconstruct the pattern */ + s2[0] = s2[1] = '['; + s2[2] = '='; + s2[3] = (unsigned char)p; + s2[4] = '='; + s2[5] = s2[6] = ']'; + s2[7] = '\0'; + + return (fnmatch ((const char *)s2, (const char *)s1, 0)); +} +#endif + +/* We use strcoll(3) for range comparisons in bracket expressions, + even though it can have unwanted side effects in locales + other than POSIX or US. For instance, in the de locale, [A-Z] matches + all characters. If GLOB_ASCIIRANGE is non-zero, and we're not forcing + the use of strcoll (e.g., for explicit collating symbols), we use + straight ordering as if in the C locale. */ + +#if defined (HAVE_STRCOLL) +/* Helper functions for collating symbol equivalence. */ + +/* Return 0 if C1 == C2 or collates equally if FORCECOLL is non-zero. */ +static int +charcmp (c1, c2, forcecoll) + int c1, c2; + int forcecoll; +{ + static char s1[2] = { ' ', '\0' }; + static char s2[2] = { ' ', '\0' }; + int ret; + + /* Eight bits only. Period. */ + c1 &= 0xFF; + c2 &= 0xFF; + + if (c1 == c2) + return (0); + + if (forcecoll == 0 && glob_asciirange) + return (c1 - c2); + + s1[0] = c1; + s2[0] = c2; + + return (strcoll (s1, s2)); +} + +static int +rangecmp (c1, c2, forcecoll) + int c1, c2; + int forcecoll; +{ + int r; + + r = charcmp (c1, c2, forcecoll); + + /* We impose a total ordering here by returning c1-c2 if charcmp returns 0 */ + if (r != 0) + return r; + return (c1 - c2); /* impose total ordering */ +} +#else /* !HAVE_STRCOLL */ +# define rangecmp(c1, c2, f) ((int)(c1) - (int)(c2)) +#endif /* !HAVE_STRCOLL */ + +#if defined (HAVE_STRCOLL) +/* Returns 1 if chars C and EQUIV collate equally in the current locale. */ +static int +collequiv (c, equiv) + int c, equiv; +{ + if (charcmp (c, equiv, 1) == 0) + return 1; + +#if FNMATCH_EQUIV_FALLBACK + return (_fnmatch_fallback (c, equiv) == 0); +#else + return 0; +#endif + +} +#else +# define collequiv(c, equiv) ((c) == (equiv)) +#endif + +#define _COLLSYM _collsym +#define __COLLSYM __collsym +#define POSIXCOLL posix_collsyms +#include "collsyms.h" + +static int +collsym (s, len) + CHAR *s; + int len; +{ + register struct _collsym *csp; + char *x; + + x = (char *)s; + for (csp = posix_collsyms; csp->name; csp++) + { + if (STREQN(csp->name, x, len) && csp->name[len] == '\0') + return (csp->code); + } + if (len == 1) + return s[0]; + return INVALID; +} + +/* unibyte character classification */ +#if !defined (isascii) && !defined (HAVE_ISASCII) +# define isascii(c) ((unsigned int)(c) <= 0177) +#endif + +enum char_class + { + CC_NO_CLASS = 0, + CC_ASCII, CC_ALNUM, CC_ALPHA, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH, + CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_WORD, CC_XDIGIT + }; + +static char const *const cclass_name[] = + { + "", + "ascii", "alnum", "alpha", "blank", "cntrl", "digit", "graph", + "lower", "print", "punct", "space", "upper", "word", "xdigit" + }; + +#define N_CHAR_CLASS (sizeof(cclass_name) / sizeof (cclass_name[0])) + +static enum char_class +is_valid_cclass (name) + const char *name; +{ + enum char_class ret; + int i; + + ret = CC_NO_CLASS; + + for (i = 1; i < N_CHAR_CLASS; i++) + { + if (STREQ (name, cclass_name[i])) + { + ret = (enum char_class)i; + break; + } + } + + return ret; +} + +static int +cclass_test (c, char_class) + int c; + enum char_class char_class; +{ + int result; + + switch (char_class) + { + case CC_ASCII: + result = isascii (c); + break; + case CC_ALNUM: + result = ISALNUM (c); + break; + case CC_ALPHA: + result = ISALPHA (c); + break; + case CC_BLANK: + result = ISBLANK (c); + break; + case CC_CNTRL: + result = ISCNTRL (c); + break; + case CC_DIGIT: + result = ISDIGIT (c); + break; + case CC_GRAPH: + result = ISGRAPH (c); + break; + case CC_LOWER: + result = ISLOWER (c); + break; + case CC_PRINT: + result = ISPRINT (c); + break; + case CC_PUNCT: + result = ISPUNCT (c); + break; + case CC_SPACE: + result = ISSPACE (c); + break; + case CC_UPPER: + result = ISUPPER (c); + break; + case CC_WORD: + result = (ISALNUM (c) || c == '_'); + break; + case CC_XDIGIT: + result = ISXDIGIT (c); + break; + default: + result = -1; + break; + } + + return result; +} + +static int +is_cclass (c, name) + int c; + const char *name; +{ + enum char_class char_class; + int result; + + char_class = is_valid_cclass (name); + if (char_class == CC_NO_CLASS) + return -1; + + result = cclass_test (c, char_class); + return (result); +} + +/* Now include `sm_loop.c' for single-byte characters. */ +/* The result of FOLD is an `unsigned char' */ +# define FOLD(c) ((flags & FNM_CASEFOLD) \ + ? TOLOWER ((unsigned char)c) \ + : ((unsigned char)c)) + +#if !defined (__CYGWIN__) +# define ISDIRSEP(c) ((c) == '/') +#else +# define ISDIRSEP(c) ((c) == '/' || (c) == '\\') +#endif /* __CYGWIN__ */ +#define PATHSEP(c) (ISDIRSEP(c) || (c) == 0) + +# define PDOT_OR_DOTDOT(s) (s[0] == '.' && (PATHSEP (s[1]) || (s[1] == '.' && PATHSEP (s[2])))) +# define SDOT_OR_DOTDOT(s) (s[0] == '.' && (s[1] == 0 || (s[1] == '.' && s[2] == 0))) + +#define FCT internal_strmatch +#define GMATCH gmatch +#define COLLSYM collsym +#define PARSE_COLLSYM parse_collsym +#define BRACKMATCH brackmatch +#define PATSCAN glob_patscan +#define STRCOMPARE strcompare +#define EXTMATCH extmatch +#define DEQUOTE_PATHNAME udequote_pathname +#define STRUCT smat_struct +#define STRCHR(S, C) strchr((S), (C)) +#define MEMCHR(S, C, N) memchr((S), (C), (N)) +#define STRCOLL(S1, S2) strcoll((S1), (S2)) +#define STRLEN(S) strlen(S) +#define STRCMP(S1, S2) strcmp((S1), (S2)) +#define RANGECMP(C1, C2, F) rangecmp((C1), (C2), (F)) +#define COLLEQUIV(C1, C2) collequiv((C1), (C2)) +#define CTYPE_T enum char_class +#define IS_CCLASS(C, S) is_cclass((C), (S)) +#include "sm_loop.c" + +#if HANDLE_MULTIBYTE + +# define CHAR wchar_t +# define U_CHAR wint_t +# define XCHAR wchar_t +# define INT wint_t +# define L(CS) L##CS +# define INVALID WEOF + +# undef STREQ +# undef STREQN +# define STREQ(s1, s2) ((wcscmp (s1, s2) == 0)) +# define STREQN(a, b, n) ((a)[0] == (b)[0] && wcsncmp(a, b, n) == 0) + +extern char *mbsmbchar PARAMS((const char *)); + +#if FNMATCH_EQUIV_FALLBACK +/* Construct a string w1 = "c1" and a pattern w2 = "[[=c2=]]" and pass them + to fnmatch to see if wide characters c1 and c2 collate as members of the + same equivalence class. We can't really do this portably any other way */ +static int +_fnmatch_fallback_wc (c1, c2) + wchar_t c1, c2; /* string char, patchar */ +{ + char w1[MB_LEN_MAX+1]; /* string */ + char w2[MB_LEN_MAX+8]; /* constructed pattern */ + int l1, l2; + + l1 = wctomb (w1, c1); + if (l1 == -1) + return (2); + w1[l1] = '\0'; + + /* reconstruct the pattern */ + w2[0] = w2[1] = '['; + w2[2] = '='; + l2 = wctomb (w2+3, c2); + if (l2 == -1) + return (2); + w2[l2+3] = '='; + w2[l2+4] = w2[l2+5] = ']'; + w2[l2+6] = '\0'; + + return (fnmatch ((const char *)w2, (const char *)w1, 0)); +} +#endif + +static int +charcmp_wc (c1, c2, forcecoll) + wint_t c1, c2; + int forcecoll; +{ + static wchar_t s1[2] = { L' ', L'\0' }; + static wchar_t s2[2] = { L' ', L'\0' }; + int r; + + if (c1 == c2) + return 0; + + if (forcecoll == 0 && glob_asciirange && c1 <= UCHAR_MAX && c2 <= UCHAR_MAX) + return ((int)(c1 - c2)); + + s1[0] = c1; + s2[0] = c2; + + return (wcscoll (s1, s2)); +} + +static int +rangecmp_wc (c1, c2, forcecoll) + wint_t c1, c2; + int forcecoll; +{ + int r; + + r = charcmp_wc (c1, c2, forcecoll); + + /* We impose a total ordering here by returning c1-c2 if charcmp returns 0, + as we do above in the single-byte case. */ + if (r != 0 || forcecoll) + return r; + return ((int)(c1 - c2)); /* impose total ordering */ +} + +/* Returns 1 if wide chars C and EQUIV collate equally in the current locale. */ +static int +collequiv_wc (c, equiv) + wint_t c, equiv; +{ + wchar_t s, p; + + if (charcmp_wc (c, equiv, 1) == 0) + return 1; + +#if FNMATCH_EQUIV_FALLBACK +/* We check explicitly for success (fnmatch returns 0) to avoid problems if + our local definition of FNM_NOMATCH (strmatch.h) doesn't match the + system's (fnmatch.h). We don't care about error return values here. */ + + s = c; + p = equiv; + return (_fnmatch_fallback_wc (s, p) == 0); +#else + return 0; +#endif +} + +/* Helper function for collating symbol. */ +# define _COLLSYM _collwcsym +# define __COLLSYM __collwcsym +# define POSIXCOLL posix_collwcsyms +# include "collsyms.h" + +static wint_t +collwcsym (s, len) + wchar_t *s; + int len; +{ + register struct _collwcsym *csp; + + for (csp = posix_collwcsyms; csp->name; csp++) + { + if (STREQN(csp->name, s, len) && csp->name[len] == L'\0') + return (csp->code); + } + if (len == 1) + return s[0]; + return INVALID; +} + +static int +is_wcclass (wc, name) + wint_t wc; + wchar_t *name; +{ + char *mbs; + mbstate_t state; + size_t mbslength; + wctype_t desc; + int want_word; + + if ((wctype ("ascii") == (wctype_t)0) && (wcscmp (name, L"ascii") == 0)) + { + int c; + + if ((c = wctob (wc)) == EOF) + return 0; + else + return (c <= 0x7F); + } + + want_word = (wcscmp (name, L"word") == 0); + if (want_word) + name = L"alnum"; + + memset (&state, '\0', sizeof (mbstate_t)); + mbs = (char *) malloc (wcslen(name) * MB_CUR_MAX + 1); + if (mbs == 0) + return -1; + mbslength = wcsrtombs (mbs, (const wchar_t **)&name, (wcslen(name) * MB_CUR_MAX + 1), &state); + + if (mbslength == (size_t)-1 || mbslength == (size_t)-2) + { + free (mbs); + return -1; + } + desc = wctype (mbs); + free (mbs); + + if (desc == (wctype_t)0) + return -1; + + if (want_word) + return (iswctype (wc, desc) || wc == L'_'); + else + return (iswctype (wc, desc)); +} + +/* Return 1 if there are no char class [:class:] expressions (degenerate case) + or only posix-specified (C locale supported) char class expressions in + PATTERN. These are the ones where it's safe to punt to the single-byte + code, since wide character support allows locale-defined char classes. + This only uses single-byte code, but is only needed to support multibyte + locales. */ +static int +posix_cclass_only (pattern) + char *pattern; +{ + char *p, *p1; + char cc[16]; /* sufficient for all valid posix char class names */ + enum char_class valid; + + p = pattern; + while (p = strchr (p, '[')) + { + if (p[1] != ':') + { + p++; + continue; + } + p += 2; /* skip past "[:" */ + /* Find end of char class expression */ + for (p1 = p; *p1; p1++) + if (*p1 == ':' && p1[1] == ']') + break; + if (*p1 == 0) /* no char class expression found */ + break; + /* Find char class name and validate it against posix char classes */ + if ((p1 - p) >= sizeof (cc)) + return 0; + bcopy (p, cc, p1 - p); + cc[p1 - p] = '\0'; + valid = is_valid_cclass (cc); + if (valid == CC_NO_CLASS) + return 0; /* found unrecognized char class name */ + + p = p1 + 2; /* found posix char class name */ + } + + return 1; /* no char class names or only posix */ +} + +/* Now include `sm_loop.c' for multibyte characters. */ +#define FOLD(c) ((flags & FNM_CASEFOLD) && iswupper (c) ? towlower (c) : (c)) + +# if !defined (__CYGWIN__) +# define ISDIRSEP(c) ((c) == L'/') +# else +# define ISDIRSEP(c) ((c) == L'/' || (c) == L'\\') +# endif /* __CYGWIN__ */ +# define PATHSEP(c) (ISDIRSEP(c) || (c) == L'\0') + +# define PDOT_OR_DOTDOT(w) (w[0] == L'.' && (PATHSEP(w[1]) || (w[1] == L'.' && PATHSEP(w[2])))) +# define SDOT_OR_DOTDOT(w) (w[0] == L'.' && (w[1] == L'\0' || (w[1] == L'.' && w[2] == L'\0'))) + +#define FCT internal_wstrmatch +#define GMATCH gmatch_wc +#define COLLSYM collwcsym +#define PARSE_COLLSYM parse_collwcsym +#define BRACKMATCH brackmatch_wc +#define PATSCAN glob_patscan_wc +#define STRCOMPARE wscompare +#define EXTMATCH extmatch_wc +#define DEQUOTE_PATHNAME wcdequote_pathname +#define STRUCT wcsmat_struct +#define STRCHR(S, C) wcschr((S), (C)) +#define MEMCHR(S, C, N) wmemchr((S), (C), (N)) +#define STRCOLL(S1, S2) wcscoll((S1), (S2)) +#define STRLEN(S) wcslen(S) +#define STRCMP(S1, S2) wcscmp((S1), (S2)) +#define RANGECMP(C1, C2, F) rangecmp_wc((C1), (C2), (F)) +#define COLLEQUIV(C1, C2) collequiv_wc((C1), (C2)) +#define CTYPE_T enum char_class +#define IS_CCLASS(C, S) is_wcclass((C), (S)) +#include "sm_loop.c" + +#endif /* HAVE_MULTIBYTE */ + +int +xstrmatch (pattern, string, flags) + char *pattern; + char *string; + int flags; +{ +#if HANDLE_MULTIBYTE + int ret; + size_t n; + wchar_t *wpattern, *wstring; + size_t plen, slen, mplen, mslen; + + if (MB_CUR_MAX == 1) + return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags)); + + if (mbsmbchar (string) == 0 && mbsmbchar (pattern) == 0 && posix_cclass_only (pattern)) + return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags)); + + n = xdupmbstowcs (&wpattern, NULL, pattern); + if (n == (size_t)-1 || n == (size_t)-2) + return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags)); + + n = xdupmbstowcs (&wstring, NULL, string); + if (n == (size_t)-1 || n == (size_t)-2) + { + free (wpattern); + return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags)); + } + + ret = internal_wstrmatch (wpattern, wstring, flags); + + free (wpattern); + free (wstring); + + return ret; +#else + return (internal_strmatch ((unsigned char *)pattern, (unsigned char *)string, flags)); +#endif /* !HANDLE_MULTIBYTE */ +} |