From 5e45211a64149b3c659b90ff2de6fa982a5a93ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 14:17:33 +0200 Subject: Adding upstream version 15.5. Signed-off-by: Daniel Baumann --- src/backend/tsearch/wparser_def.c | 2648 +++++++++++++++++++++++++++++++++++++ 1 file changed, 2648 insertions(+) create mode 100644 src/backend/tsearch/wparser_def.c (limited to 'src/backend/tsearch/wparser_def.c') diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c new file mode 100644 index 0000000..916db5a --- /dev/null +++ b/src/backend/tsearch/wparser_def.c @@ -0,0 +1,2648 @@ +/*------------------------------------------------------------------------- + * + * wparser_def.c + * Default text search parser + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/wparser_def.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include /* NOTE(review): the header name for this #include is missing here, apparently lost when the page was extracted (possibly limits.h) - confirm against upstream before applying */ + +#include "catalog/pg_collation.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_public.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" + + +/* Define me to enable tracing of parser behavior */ +/* #define WPARSER_TRACE */ + + +/* Output token categories */ + +#define ASCIIWORD 1 +#define WORD_T 2 +#define NUMWORD 3 +#define EMAIL 4 +#define URL_T 5 +#define HOST 6 +#define SCIENTIFIC 7 +#define VERSIONNUMBER 8 +#define NUMPARTHWORD 9 +#define PARTHWORD 10 +#define ASCIIPARTHWORD 11 +#define SPACE 12 +#define TAG_T 13 +#define PROTOCOL 14 +#define NUMHWORD 15 +#define ASCIIHWORD 16 +#define HWORD 17 +#define URLPATH 18 +#define FILEPATH 19 +#define DECIMAL_T 20 +#define SIGNEDINT 21 +#define UNSIGNEDINT 22 +#define XMLENTITY 23 + +#define LASTNUM 23 + +static const char *const tok_alias[] = { + "", + "asciiword", + "word", + "numword", + "email", + "url", + 
"host", + "sfloat", + "version", + "hword_numpart", + "hword_part", + "hword_asciipart", + "blank", + "tag", + "protocol", + "numhword", + "asciihword", + "hword", + "url_path", + "file", + "float", + "int", + "uint", + "entity" +}; + +static const char *const lex_descr[] = { + "", + "Word, all ASCII", + "Word, all letters", + "Word, letters and digits", + "Email address", + "URL", + "Host", + "Scientific notation", + "Version number", + "Hyphenated word part, letters and digits", + "Hyphenated word part, all letters", + "Hyphenated word part, all ASCII", + "Space symbols", + "XML tag", + "Protocol head", + "Hyphenated word, letters and digits", + "Hyphenated word, all ASCII", + "Hyphenated word, all letters", + "URL path", + "File or path name", + "Decimal notation", + "Signed integer", + "Unsigned integer", + "XML entity" +}; + + +/* Parser states */ + +typedef enum +{ + TPS_Base = 0, + TPS_InNumWord, + TPS_InAsciiWord, + TPS_InWord, + TPS_InUnsignedInt, + TPS_InSignedIntFirst, + TPS_InSignedInt, + TPS_InSpace, + TPS_InUDecimalFirst, + TPS_InUDecimal, + TPS_InDecimalFirst, + TPS_InDecimal, + TPS_InVerVersion, + TPS_InSVerVersion, + TPS_InVersionFirst, + TPS_InVersion, + TPS_InMantissaFirst, + TPS_InMantissaSign, + TPS_InMantissa, + TPS_InXMLEntityFirst, + TPS_InXMLEntity, + TPS_InXMLEntityNumFirst, + TPS_InXMLEntityNum, + TPS_InXMLEntityHexNumFirst, + TPS_InXMLEntityHexNum, + TPS_InXMLEntityEnd, + TPS_InTagFirst, + TPS_InXMLBegin, + TPS_InTagCloseFirst, + TPS_InTagName, + TPS_InTagBeginEnd, + TPS_InTag, + TPS_InTagEscapeK, + TPS_InTagEscapeKK, + TPS_InTagBackSleshed, + TPS_InTagEnd, + TPS_InCommentFirst, + TPS_InCommentLast, + TPS_InComment, + TPS_InCloseCommentFirst, + TPS_InCloseCommentLast, + TPS_InCommentEnd, + TPS_InHostFirstDomain, + TPS_InHostDomainSecond, + TPS_InHostDomain, + TPS_InPortFirst, + TPS_InPort, + TPS_InHostFirstAN, + TPS_InHost, + TPS_InEmail, + TPS_InFileFirst, + TPS_InFileTwiddle, + TPS_InPathFirst, + TPS_InPathFirstFirst, + 
TPS_InPathSecond, + TPS_InFile, + TPS_InFileNext, + TPS_InURLPathFirst, + TPS_InURLPathStart, + TPS_InURLPath, + TPS_InFURL, + TPS_InProtocolFirst, + TPS_InProtocolSecond, + TPS_InProtocolEnd, + TPS_InHyphenAsciiWordFirst, + TPS_InHyphenAsciiWord, + TPS_InHyphenWordFirst, + TPS_InHyphenWord, + TPS_InHyphenNumWordFirst, + TPS_InHyphenNumWord, + TPS_InHyphenDigitLookahead, + TPS_InParseHyphen, + TPS_InParseHyphenHyphen, + TPS_InHyphenWordPart, + TPS_InHyphenAsciiWordPart, + TPS_InHyphenNumWordPart, + TPS_InHyphenUnsignedInt, + TPS_Null /* last state (fake value) */ +} TParserState; + +/* forward declaration */ +struct TParser; + +typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions + * except p_iseq */ +typedef void (*TParserSpecial) (struct TParser *); /* special handler for + * special cases... */ + +typedef struct +{ + TParserCharTest isclass; + char c; + uint16 flags; + TParserState tostate; + int type; + TParserSpecial special; +} TParserStateActionItem; + +/* Flag bits in TParserStateActionItem.flags */ +#define A_NEXT 0x0000 +#define A_BINGO 0x0001 +#define A_POP 0x0002 +#define A_PUSH 0x0004 +#define A_RERUN 0x0008 +#define A_CLEAR 0x0010 +#define A_MERGE 0x0020 +#define A_CLRALL 0x0040 + +typedef struct TParserPosition +{ + int posbyte; /* position of parser in bytes */ + int poschar; /* position of parser in characters */ + int charlen; /* length of current char */ + int lenbytetoken; /* length of token-so-far in bytes */ + int lenchartoken; /* and in chars */ + TParserState state; + struct TParserPosition *prev; + const TParserStateActionItem *pushedAtAction; +} TParserPosition; + +typedef struct TParser +{ + /* string and position information */ + char *str; /* multibyte string */ + int lenstr; /* length of mbstring */ + wchar_t *wstr; /* wide character string */ + pg_wchar *pgwstr; /* wide character string for C-locale */ + bool usewide; + + /* State of parse */ + int charmaxlen; + TParserPosition *state; + bool ignore; + bool 
wanthost; + + /* silly char */ + char c; + + /* out */ + char *token; + int lenbytetoken; + int lenchartoken; + int type; +} TParser; + + +/* forward decls here */ +static bool TParserGet(TParser *prs); + +/* + * Allocate a new TParserPosition. If prev is given, the new entry starts + * as a copy of it; otherwise it is zero-filled. In both cases the new + * entry is linked back to prev and its pushedAtAction is reset to NULL. + */ +static TParserPosition * +newTParserPosition(TParserPosition *prev) +{ + TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition)); + + if (prev) + memcpy(res, prev, sizeof(TParserPosition)); + else + memset(res, 0, sizeof(TParserPosition)); + + res->prev = prev; + + res->pushedAtAction = NULL; + + return res; +} +/* + * Create a parser over str, whose length is len. The parser stores the + * str pointer itself rather than a copy. When the database encoding can + * use more than one byte per character, a wide-character version of the + * input is built as well (pgwstr if database_ctype_is_c, otherwise wstr). + * The initial position is in state TPS_Base. + */ +static TParser * +TParserInit(char *str, int len) +{ + TParser *prs = (TParser *) palloc0(sizeof(TParser)); + + prs->charmaxlen = pg_database_encoding_max_length(); + prs->str = str; + prs->lenstr = len; + + /* + * Use wide char code only when max encoding length > 1. + */ + if (prs->charmaxlen > 1) + { + pg_locale_t mylocale = 0; /* TODO */ + + prs->usewide = true; + if (database_ctype_is_c) + { + /* + * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could + * be different from sizeof(wchar_t) + */ + prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1)); + pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr); + } + else + { + prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1)); + char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, + mylocale); + } + } + else + prs->usewide = false; + + prs->state = newTParserPosition(NULL); + prs->state->state = TPS_Base; + +#ifdef WPARSER_TRACE + fprintf(stderr, "parsing \"%.*s\"\n", len, str); +#endif + + return prs; +} + +/* + * As an alternative to a full TParserInit one can create a + * TParserCopy which basically is a regular TParser without a private + * copy of the string - instead it uses the one from another TParser. + * This is useful because at some places TParsers are created + * recursively and the repeated copying around of the strings can + * cause major inefficiency if the source string is long. 
+ * The new parser starts parsing at the original's current position. + * + * Obviously one must not close the original TParser before the copy. + */ +static TParser * +TParserCopyInit(const TParser *orig) +{ + TParser *prs = (TParser *) palloc0(sizeof(TParser)); + + prs->charmaxlen = orig->charmaxlen; + prs->str = orig->str + orig->state->posbyte; + prs->lenstr = orig->lenstr - orig->state->posbyte; + prs->usewide = orig->usewide; + + if (orig->pgwstr) + prs->pgwstr = orig->pgwstr + orig->state->poschar; + if (orig->wstr) + prs->wstr = orig->wstr + orig->state->poschar; + + prs->state = newTParserPosition(NULL); + prs->state->state = TPS_Base; + +#ifdef WPARSER_TRACE + fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str); +#endif + + return prs; +} + +/* + * Release a parser: frees every entry on the position stack, the + * wide-character buffers if present, and the parser struct itself. + * The input string (prs->str) is not freed here. + */ +static void +TParserClose(TParser *prs) +{ + while (prs->state) + { + TParserPosition *ptr = prs->state->prev; + + pfree(prs->state); + prs->state = ptr; + } + + if (prs->wstr) + pfree(prs->wstr); + if (prs->pgwstr) + pfree(prs->pgwstr); + +#ifdef WPARSER_TRACE + fprintf(stderr, "closing parser\n"); +#endif + pfree(prs); +} + +/* + * Close a parser created with TParserCopyInit. Unlike TParserClose, the + * wstr/pgwstr pointers are not freed: TParserCopyInit sets them to point + * into the original parser's buffers, so only the position stack and the + * parser struct are released. + */ +static void +TParserCopyClose(TParser *prs) +{ + while (prs->state) + { + TParserPosition *ptr = prs->state->prev; + + pfree(prs->state); + prs->state = ptr; + } + +#ifdef WPARSER_TRACE + fprintf(stderr, "closing parser copy\n"); +#endif + pfree(prs); +} + + +/* + * Character-type support functions, equivalent to is* macros, but + * working with any possible encodings and locales. Notes: + * - with multibyte encoding and C-locale isw* function may fail + * or give wrong result. + * - multibyte encoding and C-locale often are used for + * Asian languages. + * - if locale is C then we use pgwstr instead of wstr. 
+ */ + +#define p_iswhat(type, nonascii) \ + \ +static int \ +p_is##type(TParser *prs) \ +{ \ + Assert(prs->state); \ + if (prs->usewide) \ + { \ + if (prs->pgwstr) \ + { \ + unsigned int c = *(prs->pgwstr + prs->state->poschar); \ + if (c > 0x7f) \ + return nonascii; \ + return is##type(c); \ + } \ + return isw##type(*(prs->wstr + prs->state->poschar)); \ + } \ + return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \ +} \ + \ +static int \ +p_isnot##type(TParser *prs) \ +{ \ + return !p_is##type(prs); \ +} + +/* + * In C locale with a multibyte encoding, any non-ASCII symbol is considered + * an alpha character, but not a member of other char classes. + */ +p_iswhat(alnum, 1) +p_iswhat(alpha, 1) +p_iswhat(digit, 0) +p_iswhat(lower, 0) +p_iswhat(print, 0) +p_iswhat(punct, 0) +p_iswhat(space, 0) +p_iswhat(upper, 0) +p_iswhat(xdigit, 0) + +/* p_iseq should be used only for ascii symbols */ + +static int +p_iseq(TParser *prs, char c) +{ + Assert(prs->state); + return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; +} +/* Returns 1 when the byte position equals lenstr or the current character length is 0, else 0 */ +static int +p_isEOF(TParser *prs) +{ + Assert(prs->state); + return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0; +} +/* p_iseqC and p_isneC: single-argument forms of p_iseq (and its negation) that compare against prs->c */ +static int +p_iseqC(TParser *prs) +{ + return p_iseq(prs, prs->c); +} + +static int +p_isneC(TParser *prs) +{ + return !p_iseq(prs, prs->c); +} +/* Returns 1 if the current character is a single byte for which isascii() holds, else 0 */ +static int +p_isascii(TParser *prs) +{ + return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0; +} +/* Returns 1 if the current character passes both p_isascii and p_isalpha, else 0 */ +static int +p_isasclet(TParser *prs) +{ + return (p_isascii(prs) && p_isalpha(prs)) ? 
1 : 0; +} +/* + * Returns 1 if the current character is accepted as a URL character: a + * single-byte character above 0x20 and below 0x7F that is not one of the + * characters rejected in the switch below; returns 0 otherwise. + */ +static int +p_isurlchar(TParser *prs) +{ + char ch; + + /* no non-ASCII need apply */ + if (prs->state->charlen != 1) + return 0; + ch = *(prs->str + prs->state->posbyte); + /* no spaces or control characters */ + if (ch <= 0x20 || ch >= 0x7F) + return 0; + /* reject characters disallowed by RFC 3986 */ + switch (ch) + { + case '"': + case '<': + case '>': + case '\\': + case '^': + case '`': + case '{': + case '|': + case '}': + return 0; + } + return 1; +} + + +/* deliberately suppress unused-function complaints for the above */ +void _make_compiler_happy(void); +void +_make_compiler_happy(void) +{ + p_isalnum(NULL); + p_isnotalnum(NULL); + p_isalpha(NULL); + p_isnotalpha(NULL); + p_isdigit(NULL); + p_isnotdigit(NULL); + p_islower(NULL); + p_isnotlower(NULL); + p_isprint(NULL); + p_isnotprint(NULL); + p_ispunct(NULL); + p_isnotpunct(NULL); + p_isspace(NULL); + p_isnotspace(NULL); + p_isupper(NULL); + p_isnotupper(NULL); + p_isxdigit(NULL); + p_isnotxdigit(NULL); + p_isEOF(NULL); + p_iseqC(NULL); + p_isneC(NULL); +} + + +static void +SpecialTags(TParser *prs) +{ + switch (prs->state->lenchartoken) + { + case 8: /* token, "ignore = false; + break; + case 7: /*