/*-------------------------------------------------------------------------
 *
 * wparser_def.c
 *		Default text search parser
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/tsearch/wparser_def.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

/*
 * NOTE(review): the directive below carries no header name in this copy of
 * the file -- it looks like an angle-bracket system header was lost.
 * Restore the header name before compiling.
 */
#include

#include "catalog/pg_collation.h"
#include "commands/defrem.h"
#include "miscadmin.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"


/* Define me to enable tracing of parser behavior */
/* #define WPARSER_TRACE */


/*
 * Output token categories.
 *
 * The values are consecutive, starting at 1; LASTNUM repeats the highest
 * value in use (23).  The tok_alias[] and lex_descr[] tables below list one
 * entry per category in the same order, with an empty string in slot 0, so
 * a new category needs a new entry in both tables and a bumped LASTNUM.
 */
#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

#define LASTNUM			23

/*
 * Short alias of each token category, in category-number order
 * (slot 0 is an empty placeholder since no category has the value 0).
 */
static const char *const tok_alias[] = {
	"",
	"asciiword",
	"word",
	"numword",
	"email",
	"url",
	"host",
	"sfloat",
	"version",
	"hword_numpart",
	"hword_part",
	"hword_asciipart",
	"blank",
	"tag",
	"protocol",
	"numhword",
	"asciihword",
	"hword",
	"url_path",
	"file",
	"float",
	"int",
	"uint",
	"entity"
};

/*
 * Human-readable description of each token category, in the same order as
 * tok_alias[] (slot 0 is again an empty placeholder).
 */
static const char *const lex_descr[] = {
	"",
	"Word, all ASCII",
	"Word, all letters",
	"Word, letters and digits",
	"Email address",
	"URL",
	"Host",
	"Scientific notation",
	"Version number",
	"Hyphenated word part, letters and digits",
	"Hyphenated word part, all letters",
	"Hyphenated word part, all ASCII",
	"Space symbols",
	"XML tag",
	"Protocol head",
	"Hyphenated word, letters and digits",
	"Hyphenated word, all ASCII",
	"Hyphenated word, all letters",
	"URL path",
	"File or path name",
	"Decimal notation",
	"Signed integer",
	"Unsigned integer",
	"XML entity"
};


/*
 * Parser states.
 *
 * TPS_Base is explicitly 0; the remaining enumerators take consecutive
 * values in declaration order.  TPS_Null is not a real state, it only marks
 * the end of the list.
 */
typedef enum
{
	TPS_Base = 0,
	TPS_InNumWord,
	TPS_InAsciiWord,
	TPS_InWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,
	TPS_InXMLEntityFirst,
	TPS_InXMLEntity,
	TPS_InXMLEntityNumFirst,
	TPS_InXMLEntityNum,
	TPS_InXMLEntityHexNumFirst,
	TPS_InXMLEntityHexNum,
	TPS_InXMLEntityEnd,
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,
	TPS_InURLPathFirst,
	TPS_InURLPathStart,
	TPS_InURLPath,
	TPS_InFURL,
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,
	TPS_InHyphenAsciiWordFirst,
	TPS_InHyphenAsciiWord,
	TPS_InHyphenWordFirst,
	TPS_InHyphenWord,
	TPS_InHyphenNumWordFirst,
	TPS_InHyphenNumWord,
	TPS_InHyphenDigitLookahead,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenWordPart,
	TPS_InHyphenAsciiWordPart,
	TPS_InHyphenNumWordPart,
	TPS_InHyphenUnsignedInt,
	TPS_Null					/* last state (fake value) */
} TParserState;

/* forward declaration */
struct TParser;

/*
 * Callback types used by the parser's state/action items.  Both receive the
 * parser itself; a character test returns an int, a special handler returns
 * nothing.
 */
typedef int (*TParserCharTest) (struct TParser *);	/* any p_is* functions
													 * except p_iseq */
typedef void (*TParserSpecial) (struct TParser *);	/* special handler for
													 * special cases...
*/ typedef struct { TParserCharTest isclass; char c; uint16 flags; TParserState tostate; int type; TParserSpecial special; } TParserStateActionItem; /* Flag bits in TParserStateActionItem.flags */ #define A_NEXT 0x0000 #define A_BINGO 0x0001 #define A_POP 0x0002 #define A_PUSH 0x0004 #define A_RERUN 0x0008 #define A_CLEAR 0x0010 #define A_MERGE 0x0020 #define A_CLRALL 0x0040 typedef struct TParserPosition { int posbyte; /* position of parser in bytes */ int poschar; /* position of parser in characters */ int charlen; /* length of current char */ int lenbytetoken; /* length of token-so-far in bytes */ int lenchartoken; /* and in chars */ TParserState state; struct TParserPosition *prev; const TParserStateActionItem *pushedAtAction; } TParserPosition; typedef struct TParser { /* string and position information */ char *str; /* multibyte string */ int lenstr; /* length of mbstring */ wchar_t *wstr; /* wide character string */ pg_wchar *pgwstr; /* wide character string for C-locale */ bool usewide; /* State of parse */ int charmaxlen; TParserPosition *state; bool ignore; bool wanthost; /* silly char */ char c; /* out */ char *token; int lenbytetoken; int lenchartoken; int type; } TParser; /* forward decls here */ static bool TParserGet(TParser *prs); static TParserPosition * newTParserPosition(TParserPosition *prev) { TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition)); if (prev) memcpy(res, prev, sizeof(TParserPosition)); else memset(res, 0, sizeof(TParserPosition)); res->prev = prev; res->pushedAtAction = NULL; return res; } static TParser * TParserInit(char *str, int len) { TParser *prs = (TParser *) palloc0(sizeof(TParser)); prs->charmaxlen = pg_database_encoding_max_length(); prs->str = str; prs->lenstr = len; /* * Use wide char code only when max encoding length > 1. 
*/ if (prs->charmaxlen > 1) { pg_locale_t mylocale = 0; /* TODO */ prs->usewide = true; if (database_ctype_is_c) { /* * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could * be different from sizeof(wchar_t) */ prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1)); pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr); } else { prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1)); char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr, mylocale); } } else prs->usewide = false; prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; #ifdef WPARSER_TRACE fprintf(stderr, "parsing \"%.*s\"\n", len, str); #endif return prs; } /* * As an alternative to a full TParserInit one can create a * TParserCopy which basically is a regular TParser without a private * copy of the string - instead it uses the one from another TParser. * This is useful because at some places TParsers are created * recursively and the repeated copying around of the strings can * cause major inefficiency if the source string is long. * The new parser starts parsing at the original's current position. * * Obviously one must not close the original TParser before the copy. 
*/ static TParser * TParserCopyInit(const TParser *orig) { TParser *prs = (TParser *) palloc0(sizeof(TParser)); prs->charmaxlen = orig->charmaxlen; prs->str = orig->str + orig->state->posbyte; prs->lenstr = orig->lenstr - orig->state->posbyte; prs->usewide = orig->usewide; if (orig->pgwstr) prs->pgwstr = orig->pgwstr + orig->state->poschar; if (orig->wstr) prs->wstr = orig->wstr + orig->state->poschar; prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; #ifdef WPARSER_TRACE fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str); #endif return prs; } static void TParserClose(TParser *prs) { while (prs->state) { TParserPosition *ptr = prs->state->prev; pfree(prs->state); prs->state = ptr; } if (prs->wstr) pfree(prs->wstr); if (prs->pgwstr) pfree(prs->pgwstr); #ifdef WPARSER_TRACE fprintf(stderr, "closing parser\n"); #endif pfree(prs); } /* * Close a parser created with TParserCopyInit */ static void TParserCopyClose(TParser *prs) { while (prs->state) { TParserPosition *ptr = prs->state->prev; pfree(prs->state); prs->state = ptr; } #ifdef WPARSER_TRACE fprintf(stderr, "closing parser copy\n"); #endif pfree(prs); } /* * Character-type support functions, equivalent to is* macros, but * working with any possible encodings and locales. Notes: * - with multibyte encoding and C-locale isw* function may fail * or give wrong result. * - multibyte encoding and C-locale often are used for * Asian languages. * - if locale is C then we use pgwstr instead of wstr. 
*/ #define p_iswhat(type, nonascii) \ \ static int \ p_is##type(TParser *prs) \ { \ Assert(prs->state); \ if (prs->usewide) \ { \ if (prs->pgwstr) \ { \ unsigned int c = *(prs->pgwstr + prs->state->poschar); \ if (c > 0x7f) \ return nonascii; \ return is##type(c); \ } \ return isw##type(*(prs->wstr + prs->state->poschar)); \ } \ return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \ } \ \ static int \ p_isnot##type(TParser *prs) \ { \ return !p_is##type(prs); \ } /* * In C locale with a multibyte encoding, any non-ASCII symbol is considered * an alpha character, but not a member of other char classes. */ p_iswhat(alnum, 1) p_iswhat(alpha, 1) p_iswhat(digit, 0) p_iswhat(lower, 0) p_iswhat(print, 0) p_iswhat(punct, 0) p_iswhat(space, 0) p_iswhat(upper, 0) p_iswhat(xdigit, 0) /* p_iseq should be used only for ascii symbols */ static int p_iseq(TParser *prs, char c) { Assert(prs->state); return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; } static int p_isEOF(TParser *prs) { Assert(prs->state); return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0; } static int p_iseqC(TParser *prs) { return p_iseq(prs, prs->c); } static int p_isneC(TParser *prs) { return !p_iseq(prs, prs->c); } static int p_isascii(TParser *prs) { return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0; } static int p_isasclet(TParser *prs) { return (p_isascii(prs) && p_isalpha(prs)) ? 
1 : 0; } static int p_isurlchar(TParser *prs) { char ch; /* no non-ASCII need apply */ if (prs->state->charlen != 1) return 0; ch = *(prs->str + prs->state->posbyte); /* no spaces or control characters */ if (ch <= 0x20 || ch >= 0x7F) return 0; /* reject characters disallowed by RFC 3986 */ switch (ch) { case '"': case '<': case '>': case '\\': case '^': case '`': case '{': case '|': case '}': return 0; } return 1; } /* deliberately suppress unused-function complaints for the above */ void _make_compiler_happy(void); void _make_compiler_happy(void) { p_isalnum(NULL); p_isnotalnum(NULL); p_isalpha(NULL); p_isnotalpha(NULL); p_isdigit(NULL); p_isnotdigit(NULL); p_islower(NULL); p_isnotlower(NULL); p_isprint(NULL); p_isnotprint(NULL); p_ispunct(NULL); p_isnotpunct(NULL); p_isspace(NULL); p_isnotspace(NULL); p_isupper(NULL); p_isnotupper(NULL); p_isxdigit(NULL); p_isnotxdigit(NULL); p_isEOF(NULL); p_iseqC(NULL); p_isneC(NULL); } static void SpecialTags(TParser *prs) { switch (prs->state->lenchartoken) { case 8: /* token, "ignore = false; break; case 7: /*