diff options
Diffstat (limited to 'src/grep/src/searchutils.c')
-rw-r--r-- | src/grep/src/searchutils.c | 190 |
1 files changed, 190 insertions, 0 deletions
diff --git a/src/grep/src/searchutils.c b/src/grep/src/searchutils.c new file mode 100644 index 0000000..8058511 --- /dev/null +++ b/src/grep/src/searchutils.c @@ -0,0 +1,190 @@ +/* searchutils.c - helper subroutines for grep's matchers. + Copyright 1992, 1998, 2000, 2007, 2009-2021 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +#include <config.h> + +#define SEARCH_INLINE _GL_EXTERN_INLINE +#define SYSTEM_INLINE _GL_EXTERN_INLINE +#include "search.h" + +/* For each byte B, sbwordchar[B] is true if B is a single-byte + character that is a word constituent, and is false otherwise. */ +static bool sbwordchar[NCHAR]; + +/* Whether -w considers WC to be a word constituent. */ +static bool +wordchar (wint_t wc) +{ + return wc == L'_' || iswalnum (wc); +} + +void +wordinit (void) +{ + for (int i = 0; i < NCHAR; i++) + sbwordchar[i] = wordchar (localeinfo.sbctowc[i]); +} + +kwset_t +kwsinit (bool mb_trans) +{ + char *trans = NULL; + + if (match_icase && (MB_CUR_MAX == 1 || mb_trans)) + { + trans = xmalloc (NCHAR); + /* If I is a single-byte character that becomes a different + single-byte character when uppercased, set trans[I] + to that character. Otherwise, set trans[I] to I. */ + for (int i = 0; i < NCHAR; i++) + trans[i] = toupper (i); + } + + return kwsalloc (trans); +} + +/* In the buffer *MB_START, return the number of bytes needed to go + back from CUR to the previous boundary, where a "boundary" is the + start of a multibyte character or is an error-encoding byte. The + buffer ends at END (i.e., one past the address of the buffer's last + byte). If CUR is already at a boundary, return 0. If CUR is no + larger than *MB_START, return CUR - *MB_START without modifying + *MB_START or *MBCLEN. + + When returning zero, set *MB_START to CUR. When returning a + positive value, set *MB_START to the next boundary after CUR, + or to END if there is no such boundary, and set *MBCLEN to the + length of the preceding character. */ +ptrdiff_t +mb_goback (char const **mb_start, size_t *mbclen, char const *cur, + char const *end) +{ + const char *p = *mb_start; + const char *p0 = p; + size_t clen; + + if (cur <= p) + return cur - p; + + if (localeinfo.using_utf8) + { + p = cur; + clen = 1; + + if (cur < end && (*cur & 0xc0) == 0x80) + for (int i = 1; i <= 3; i++) + if ((cur[-i] & 0xc0) != 0x80) + { + mbstate_t mbs = { 0 }; + clen = mb_clen (cur - i, end - (cur - i), &mbs); + if (i < clen && clen < (size_t) -2) + { + p0 = cur - i; + p = p0 + clen; + } + break; + } + } + else + { + mbstate_t mbs = { 0 }; + do + { + clen = mb_clen (p, end - p, &mbs); + + if ((size_t) -2 <= clen) + { + /* An invalid sequence, or a truncated multibyte character. + Treat it as a single byte character. */ + clen = 1; + memset (&mbs, 0, sizeof mbs); + } + p0 = p; + p += clen; + } + while (p < cur); + } + + *mb_start = p; + if (mbclen) + *mbclen = clen; + return p == cur ? 0 : cur - p0; +} + +/* Examine the start of BUF (which goes to END) for word constituents. + If COUNTALL, examine as many as possible; otherwise, examine at most one. + Return the total number of bytes in the examined characters. */ +static size_t +wordchars_count (char const *buf, char const *end, bool countall) +{ + size_t n = 0; + mbstate_t mbs = { 0 }; + while (n < end - buf) + { + unsigned char b = buf[n]; + if (sbwordchar[b]) + n++; + else if (localeinfo.sbclen[b] != -2) + break; + else + { + wchar_t wc = 0; + size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs); + if (!wordchar (wc)) + break; + n += wcbytes + !wcbytes; + } + if (!countall) + break; + } + return n; +} + +/* Examine the start of BUF for the longest prefix containing just + word constituents. Return the total number of bytes in the prefix. + The buffer ends at END. */ +size_t +wordchars_size (char const *buf, char const *end) +{ + return wordchars_count (buf, end, true); +} + +/* If BUF starts with a word constituent, return the number of bytes + used to represent it; otherwise, return zero. The buffer ends at END. */ +size_t +wordchar_next (char const *buf, char const *end) +{ + return wordchars_count (buf, end, false); +} + +/* In the buffer BUF, return nonzero if the character whose encoding + contains the byte before CUR is a word constituent. The buffer + ends at END. */ +size_t +wordchar_prev (char const *buf, char const *cur, char const *end) +{ + if (buf == cur) + return 0; + unsigned char b = *--cur; + if (! localeinfo.multibyte + || (localeinfo.using_utf8 && localeinfo.sbclen[b] == 1)) + return sbwordchar[b]; + char const *p = buf; + cur -= mb_goback (&p, NULL, cur, end); + return wordchar_next (cur, end); +} |