diff options
Diffstat (limited to 'src/include/tsearch')
-rw-r--r-- | src/include/tsearch/dicts/regis.h | 49 | ||||
-rw-r--r-- | src/include/tsearch/dicts/spell.h | 247 | ||||
-rw-r--r-- | src/include/tsearch/ts_cache.h | 98 | ||||
-rw-r--r-- | src/include/tsearch/ts_locale.h | 63 | ||||
-rw-r--r-- | src/include/tsearch/ts_public.h | 159 | ||||
-rw-r--r-- | src/include/tsearch/ts_type.h | 242 | ||||
-rw-r--r-- | src/include/tsearch/ts_utils.h | 266 |
7 files changed, 1124 insertions, 0 deletions
diff --git a/src/include/tsearch/dicts/regis.h b/src/include/tsearch/dicts/regis.h new file mode 100644 index 0000000..c7c3d9f --- /dev/null +++ b/src/include/tsearch/dicts/regis.h @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * regis.h + * + * Declarations for fast regex subset, used by ISpell + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * src/include/tsearch/dicts/regis.h + * + *------------------------------------------------------------------------- + */ + +#ifndef __REGIS_H__ +#define __REGIS_H__ + +typedef struct RegisNode +{ + uint32 + type:2, + len:16, + unused:14; + struct RegisNode *next; + unsigned char data[FLEXIBLE_ARRAY_MEMBER]; +} RegisNode; + +#define RNHDRSZ (offsetof(RegisNode,data)) + +#define RSF_ONEOF 1 +#define RSF_NONEOF 2 + +typedef struct Regis +{ + RegisNode *node; + uint32 + issuffix:1, + nchar:16, + unused:15; +} Regis; + +extern bool RS_isRegis(const char *str); + +extern void RS_compile(Regis *r, bool issuffix, const char *str); +extern void RS_free(Regis *r); + +/*returns true if matches */ +extern bool RS_execute(Regis *r, char *str); + +#endif diff --git a/src/include/tsearch/dicts/spell.h b/src/include/tsearch/dicts/spell.h new file mode 100644 index 0000000..978f43a --- /dev/null +++ b/src/include/tsearch/dicts/spell.h @@ -0,0 +1,247 @@ +/*------------------------------------------------------------------------- + * + * spell.h + * + * Declarations for ISpell dictionary + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * src/include/tsearch/dicts/spell.h + * + *------------------------------------------------------------------------- + */ + +#ifndef __SPELL_H__ +#define __SPELL_H__ + +#include "regex/regex.h" +#include "tsearch/dicts/regis.h" +#include "tsearch/ts_public.h" + +/* + * SPNode and SPNodeData are used to represent prefix tree (Trie) to store + * a words list. + */ +struct SPNode; + +typedef struct +{ + uint32 val:8, + isword:1, + /* Stores compound flags listed below */ + compoundflag:4, + /* Reference to an entry of the AffixData field */ + affix:19; + struct SPNode *node; +} SPNodeData; + +/* + * Names of FF_ are correlated with Hunspell options in affix file + * http://hunspell.sourceforge.net/ + */ +#define FF_COMPOUNDONLY 0x01 +#define FF_COMPOUNDBEGIN 0x02 +#define FF_COMPOUNDMIDDLE 0x04 +#define FF_COMPOUNDLAST 0x08 +#define FF_COMPOUNDFLAG ( FF_COMPOUNDBEGIN | FF_COMPOUNDMIDDLE | \ + FF_COMPOUNDLAST ) +#define FF_COMPOUNDFLAGMASK 0x0f + +typedef struct SPNode +{ + uint32 length; + SPNodeData data[FLEXIBLE_ARRAY_MEMBER]; +} SPNode; + +#define SPNHDRSZ (offsetof(SPNode,data)) + +/* + * Represents an entry in a words list. + */ +typedef struct spell_struct +{ + union + { + /* + * flag is filled in by NIImportDictionary(). After + * NISortDictionary(), d is used instead of flag. + */ + char *flag; + /* d is used in mkSPNode() */ + struct + { + /* Reference to an entry of the AffixData field */ + int affix; + /* Length of the word */ + int len; + } d; + } p; + char word[FLEXIBLE_ARRAY_MEMBER]; +} SPELL; + +#define SPELLHDRSZ (offsetof(SPELL, word)) + +/* + * If an affix uses a regex, we have to store that separately in a struct + * that won't move around when arrays of affixes are enlarged or sorted. + * This is so that it can be found to be cleaned up at context destruction. + */ +typedef struct aff_regex_struct +{ + regex_t regex; + MemoryContextCallback mcallback; +} aff_regex_struct; + +/* + * Represents an entry in an affix list. + */ +typedef struct aff_struct +{ + char *flag; + /* FF_SUFFIX or FF_PREFIX */ + uint32 type:1, + flagflags:7, + issimple:1, + isregis:1, + replen:14; + char *find; + char *repl; + union + { + aff_regex_struct *pregex; + Regis regis; + } reg; +} AFFIX; + +/* + * affixes use dictionary flags too + */ +#define FF_COMPOUNDPERMITFLAG 0x10 +#define FF_COMPOUNDFORBIDFLAG 0x20 +#define FF_CROSSPRODUCT 0x40 + +/* + * Don't change the order of these. Initialization sorts by these, + * and expects prefixes to come first after sorting. + */ +#define FF_SUFFIX 1 +#define FF_PREFIX 0 + +/* + * AffixNode and AffixNodeData are used to represent prefix tree (Trie) to store + * an affix list. + */ +struct AffixNode; + +typedef struct +{ + uint32 val:8, + naff:24; + AFFIX **aff; + struct AffixNode *node; +} AffixNodeData; + +typedef struct AffixNode +{ + uint32 isvoid:1, + length:31; + AffixNodeData data[FLEXIBLE_ARRAY_MEMBER]; +} AffixNode; + +#define ANHRDSZ (offsetof(AffixNode, data)) + +typedef struct +{ + char *affix; + int len; + bool issuffix; +} CMPDAffix; + +/* + * Type of encoding affix flags in Hunspell dictionaries + */ +typedef enum +{ + FM_CHAR, /* one character (like ispell) */ + FM_LONG, /* two characters */ + FM_NUM /* number, >= 0 and < 65536 */ +} FlagMode; + +/* + * Structure to store Hunspell options. Flag representation depends on flag + * type. These flags are about support of compound words. + */ +typedef struct CompoundAffixFlag +{ + union + { + /* Flag name if flagMode is FM_CHAR or FM_LONG */ + char *s; + /* Flag name if flagMode is FM_NUM */ + uint32 i; + } flag; + /* we don't have a bsearch_arg version, so, copy FlagMode */ + FlagMode flagMode; + uint32 value; +} CompoundAffixFlag; + +#define FLAGNUM_MAXSIZE (1 << 16) + +typedef struct +{ + int maffixes; + int naffixes; + AFFIX *Affix; + + AffixNode *Suffix; + AffixNode *Prefix; + + SPNode *Dictionary; + /* Array of sets of affixes */ + char **AffixData; + int lenAffixData; + int nAffixData; + bool useFlagAliases; + + CMPDAffix *CompoundAffix; + + bool usecompound; + FlagMode flagMode; + + /* + * All follow fields are actually needed only for initialization + */ + + /* Array of Hunspell options in affix file */ + CompoundAffixFlag *CompoundAffixFlags; + /* number of entries in CompoundAffixFlags array */ + int nCompoundAffixFlag; + /* allocated length of CompoundAffixFlags array */ + int mCompoundAffixFlag; + + /* + * Remaining fields are only used during dictionary construction; they are + * set up by NIStartBuild and cleared by NIFinishBuild. + */ + MemoryContext buildCxt; /* temp context for construction */ + + /* Temporary array of all words in the dict file */ + SPELL **Spell; + int nspell; /* number of valid entries in Spell array */ + int mspell; /* allocated length of Spell array */ + + /* These are used to allocate "compact" data without palloc overhead */ + char *firstfree; /* first free address (always maxaligned) */ + size_t avail; /* free space remaining at firstfree */ +} IspellDict; + +extern TSLexeme *NINormalizeWord(IspellDict *Conf, char *word); + +extern void NIStartBuild(IspellDict *Conf); +extern void NIImportAffixes(IspellDict *Conf, const char *filename); +extern void NIImportDictionary(IspellDict *Conf, const char *filename); +extern void NISortDictionary(IspellDict *Conf); +extern void NISortAffixes(IspellDict *Conf); +extern void NIFinishBuild(IspellDict *Conf); + +#endif diff --git a/src/include/tsearch/ts_cache.h b/src/include/tsearch/ts_cache.h new file mode 100644 index 0000000..5e4a49e --- /dev/null +++ b/src/include/tsearch/ts_cache.h @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * ts_cache.h + * Tsearch related object caches. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/tsearch/ts_cache.h + * + *------------------------------------------------------------------------- + */ +#ifndef TS_CACHE_H +#define TS_CACHE_H + +#include "utils/guc.h" + + +/* + * All TS*CacheEntry structs must share this common header + * (see InvalidateTSCacheCallBack) + */ +typedef struct TSAnyCacheEntry +{ + Oid objId; + bool isvalid; +} TSAnyCacheEntry; + + +typedef struct TSParserCacheEntry +{ + /* prsId is the hash lookup key and MUST BE FIRST */ + Oid prsId; /* OID of the parser */ + bool isvalid; + + Oid startOid; + Oid tokenOid; + Oid endOid; + Oid headlineOid; + Oid lextypeOid; + + /* + * Pre-set-up fmgr call of most needed parser's methods + */ + FmgrInfo prsstart; + FmgrInfo prstoken; + FmgrInfo prsend; + FmgrInfo prsheadline; +} TSParserCacheEntry; + +typedef struct TSDictionaryCacheEntry +{ + /* dictId is the hash lookup key and MUST BE FIRST */ + Oid dictId; + bool isvalid; + + /* most frequent fmgr call */ + Oid lexizeOid; + FmgrInfo lexize; + + MemoryContext dictCtx; /* memory context to store private data */ + void *dictData; +} TSDictionaryCacheEntry; + +typedef struct +{ + int len; + Oid *dictIds; +} ListDictionary; + +typedef struct +{ + /* cfgId is the hash lookup key and MUST BE FIRST */ + Oid cfgId; + bool isvalid; + + Oid prsId; + + int lenmap; + ListDictionary *map; +} TSConfigCacheEntry; + + +/* + * GUC variable for current configuration + */ +extern PGDLLIMPORT char *TSCurrentConfig; + + +extern TSParserCacheEntry *lookup_ts_parser_cache(Oid prsId); +extern TSDictionaryCacheEntry *lookup_ts_dictionary_cache(Oid dictId); +extern TSConfigCacheEntry *lookup_ts_config_cache(Oid cfgId); + +extern Oid getTSCurrentConfig(bool emitError); +extern bool check_TSCurrentConfig(char **newval, void **extra, GucSource source); +extern void assign_TSCurrentConfig(const char *newval, void *extra); + +#endif /* TS_CACHE_H */ diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h new file mode 100644 index 0000000..7d7c4e1 --- /dev/null +++ b/src/include/tsearch/ts_locale.h @@ -0,0 +1,63 @@ +/*------------------------------------------------------------------------- + * + * ts_locale.h + * locale compatibility layer for tsearch + * + * Copyright (c) 1998-2022, PostgreSQL Global Development Group + * + * src/include/tsearch/ts_locale.h + * + *------------------------------------------------------------------------- + */ +#ifndef __TSLOCALE_H__ +#define __TSLOCALE_H__ + +#include <ctype.h> +#include <limits.h> + +#include "lib/stringinfo.h" +#include "mb/pg_wchar.h" +#include "utils/pg_locale.h" + +/* + * towlower() and friends should be in <wctype.h>, but some pre-C99 systems + * declare them in <wchar.h>, so include that too. + */ +#include <wchar.h> +#ifdef HAVE_WCTYPE_H +#include <wctype.h> +#endif + +/* working state for tsearch_readline (should be a local var in caller) */ +typedef struct +{ + FILE *fp; + const char *filename; + int lineno; + StringInfoData buf; /* current input line, in UTF-8 */ + char *curline; /* current input line, in DB's encoding */ + /* curline may be NULL, or equal to buf.data, or a palloc'd string */ + ErrorContextCallback cb; +} tsearch_readline_state; + +#define TOUCHAR(x) (*((const unsigned char *) (x))) + +/* The second argument of t_iseq() must be a plain ASCII character */ +#define t_iseq(x,c) (TOUCHAR(x) == (unsigned char) (c)) + +#define COPYCHAR(d,s) memcpy(d, s, pg_mblen(s)) + +extern int t_isdigit(const char *ptr); +extern int t_isspace(const char *ptr); +extern int t_isalpha(const char *ptr); +extern int t_isprint(const char *ptr); + +extern char *lowerstr(const char *str); +extern char *lowerstr_with_len(const char *str, int len); + +extern bool tsearch_readline_begin(tsearch_readline_state *stp, + const char *filename); +extern char *tsearch_readline(tsearch_readline_state *stp); +extern void tsearch_readline_end(tsearch_readline_state *stp); + +#endif /* __TSLOCALE_H__ */ diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h new file mode 100644 index 0000000..fe2a167 --- /dev/null +++ b/src/include/tsearch/ts_public.h @@ -0,0 +1,159 @@ +/*------------------------------------------------------------------------- + * + * ts_public.h + * Public interface to various tsearch modules, such as + * parsers and dictionaries. + * + * Copyright (c) 1998-2022, PostgreSQL Global Development Group + * + * src/include/tsearch/ts_public.h + * + *------------------------------------------------------------------------- + */ +#ifndef _PG_TS_PUBLIC_H_ +#define _PG_TS_PUBLIC_H_ + +#include "tsearch/ts_type.h" + +/* + * Parser's framework + */ + +/* + * returning type for prslextype method of parser + */ +typedef struct +{ + int lexid; + char *alias; + char *descr; +} LexDescr; + +/* + * Interface to headline generator (tsparser's prsheadline function) + * + * HeadlineParsedText describes the text that is to be highlighted. + * Some fields are passed from the core code to the prsheadline function, + * while others are output from the prsheadline function. + * + * The principal data is words[], an array of HeadlineWordEntry, + * one entry per token, of length curwords. + * The fields of HeadlineWordEntry are: + * + * in, selected, replace, skip: these flags are initially zero + * and may be set by the prsheadline function. A consecutive group + * of tokens marked "in" form a "fragment" to be output. + * Such tokens may additionally be marked selected, replace, or skip + * to modify how they are shown. (If you set more than one of those + * bits, you get an unspecified one of those behaviors.) + * + * type, len, pos, word: filled by core code to describe the token. + * + * item: if the token matches any operand of the tsquery of interest, + * a pointer to such an operand. (If there are multiple matching + * operands, we generate extra copies of the HeadlineWordEntry to hold + * all the pointers. The extras are marked with repeated = 1 and should + * be ignored except for checking the item pointer.) + */ +typedef struct +{ + uint32 selected:1, /* token is to be highlighted */ + in:1, /* token is part of headline */ + replace:1, /* token is to be replaced with a space */ + repeated:1, /* duplicate entry to hold item pointer */ + skip:1, /* token is to be skipped (not output) */ + unused:3, /* available bits */ + type:8, /* parser's token category */ + len:16; /* length of token */ + WordEntryPos pos; /* position of token */ + char *word; /* text of token (not null-terminated) */ + QueryOperand *item; /* a matching query operand, or NULL if none */ +} HeadlineWordEntry; + +typedef struct +{ + /* Fields filled by core code before calling prsheadline function: */ + HeadlineWordEntry *words; + int32 lenwords; /* allocated length of words[] */ + int32 curwords; /* current number of valid entries */ + int32 vectorpos; /* used by ts_parse.c in filling pos fields */ + + /* The prsheadline function must fill these fields: */ + /* Strings for marking selected tokens and separating fragments: */ + char *startsel; /* palloc'd strings */ + char *stopsel; + char *fragdelim; + int16 startsellen; /* lengths of strings */ + int16 stopsellen; + int16 fragdelimlen; +} HeadlineParsedText; + +/* + * Common useful things for tsearch subsystem + */ +extern char *get_tsearch_config_filename(const char *basename, + const char *extension); + +/* + * Often useful stopword list management + */ +typedef struct +{ + int len; + char **stop; +} StopList; + +extern void readstoplist(const char *fname, StopList *s, + char *(*wordop) (const char *)); +extern bool searchstoplist(StopList *s, char *key); + +/* + * Interface with dictionaries + */ + +/* return struct for any lexize function */ +typedef struct +{ + /*---------- + * Number of current variant of split word. For example the Norwegian + * word 'fotballklubber' has two variants to split: ( fotball, klubb ) + * and ( fot, ball, klubb ). So, dictionary should return: + * + * nvariant lexeme + * 1 fotball + * 1 klubb + * 2 fot + * 2 ball + * 2 klubb + * + * In general, a TSLexeme will be considered to belong to the same split + * variant as the previous one if they have the same nvariant value. + * The exact values don't matter, only changes from one lexeme to next. + *---------- + */ + uint16 nvariant; + + uint16 flags; /* See flag bits below */ + + char *lexeme; /* C string */ +} TSLexeme; + +/* Flag bits that can appear in TSLexeme.flags */ +#define TSL_ADDPOS 0x01 +#define TSL_PREFIX 0x02 +#define TSL_FILTER 0x04 + +/* + * Struct for supporting complex dictionaries like thesaurus. + * 4th argument for dictlexize method is a pointer to this + */ +typedef struct +{ + bool isend; /* in: marks for lexize_info about text end is + * reached */ + bool getnext; /* out: dict wants next lexeme */ + void *private_state; /* internal dict state between calls with + * getnext == true */ +} DictSubState; + +#endif /* _PG_TS_PUBLIC_H_ */ diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h new file mode 100644 index 0000000..689b2d1 --- /dev/null +++ b/src/include/tsearch/ts_type.h @@ -0,0 +1,242 @@ +/*------------------------------------------------------------------------- + * + * ts_type.h + * Definitions for the tsvector and tsquery types + * + * Copyright (c) 1998-2022, PostgreSQL Global Development Group + * + * src/include/tsearch/ts_type.h + * + *------------------------------------------------------------------------- + */ +#ifndef _PG_TSTYPE_H_ +#define _PG_TSTYPE_H_ + +#include "fmgr.h" +#include "utils/memutils.h" + + +/* + * TSVector type. + * + * Structure of tsvector datatype: + * 1) standard varlena header + * 2) int32 size - number of lexemes (WordEntry array entries) + * 3) Array of WordEntry - one per lexeme; must be sorted according to + * tsCompareString() (ie, memcmp of lexeme strings). + * WordEntry->pos gives the number of bytes from end of WordEntry + * array to start of lexeme's string, which is of length len. + * 4) Per-lexeme data storage: + * lexeme string (not null-terminated) + * if haspos is true: + * padding byte if necessary to make the position data 2-byte aligned + * uint16 number of positions that follow + * WordEntryPos[] positions + * + * The positions for each lexeme must be sorted. + * + * Note, tsvectorsend/recv believe that sizeof(WordEntry) == 4 + */ + +typedef struct +{ + uint32 + haspos:1, + len:11, /* MAX 2Kb */ + pos:20; /* MAX 1Mb */ +} WordEntry; + +#define MAXSTRLEN ( (1<<11) - 1) +#define MAXSTRPOS ( (1<<20) - 1) + +extern int compareWordEntryPos(const void *a, const void *b); + +/* + * Equivalent to + * typedef struct { + * uint16 + * weight:2, + * pos:14; + * } + */ + +typedef uint16 WordEntryPos; + +typedef struct +{ + uint16 npos; + WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER]; +} WordEntryPosVector; + +/* WordEntryPosVector with exactly 1 entry */ +typedef struct +{ + uint16 npos; + WordEntryPos pos[1]; +} WordEntryPosVector1; + + +#define WEP_GETWEIGHT(x) ( (x) >> 14 ) +#define WEP_GETPOS(x) ( (x) & 0x3fff ) + +#define WEP_SETWEIGHT(x,v) ( (x) = ( (v) << 14 ) | ( (x) & 0x3fff ) ) +#define WEP_SETPOS(x,v) ( (x) = ( (x) & 0xc000 ) | ( (v) & 0x3fff ) ) + +#define MAXENTRYPOS (1<<14) +#define MAXNUMPOS (256) +#define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) ) + +/* This struct represents a complete tsvector datum */ +typedef struct +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int32 size; + WordEntry entries[FLEXIBLE_ARRAY_MEMBER]; + /* lexemes follow the entries[] array */ +} TSVectorData; + +typedef TSVectorData *TSVector; + +#define DATAHDRSIZE (offsetof(TSVectorData, entries)) +#define CALCDATASIZE(nentries, lenstr) (DATAHDRSIZE + (nentries) * sizeof(WordEntry) + (lenstr) ) + +/* pointer to start of a tsvector's WordEntry array */ +#define ARRPTR(x) ( (x)->entries ) + +/* pointer to start of a tsvector's lexeme storage */ +#define STRPTR(x) ( (char *) &(x)->entries[(x)->size] ) + +#define _POSVECPTR(x, e) ((WordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len))) +#define POSDATALEN(x,e) ( ( (e)->haspos ) ? (_POSVECPTR(x,e)->npos) : 0 ) +#define POSDATAPTR(x,e) (_POSVECPTR(x,e)->pos) + +/* + * fmgr interface macros + */ + +#define DatumGetTSVector(X) ((TSVector) PG_DETOAST_DATUM(X)) +#define DatumGetTSVectorCopy(X) ((TSVector) PG_DETOAST_DATUM_COPY(X)) +#define TSVectorGetDatum(X) PointerGetDatum(X) +#define PG_GETARG_TSVECTOR(n) DatumGetTSVector(PG_GETARG_DATUM(n)) +#define PG_GETARG_TSVECTOR_COPY(n) DatumGetTSVectorCopy(PG_GETARG_DATUM(n)) +#define PG_RETURN_TSVECTOR(x) return TSVectorGetDatum(x) + + +/* + * TSQuery + * + * + */ + +typedef int8 QueryItemType; + +/* Valid values for QueryItemType: */ +#define QI_VAL 1 +#define QI_OPR 2 +#define QI_VALSTOP 3 /* This is only used in an intermediate stack + * representation in parse_tsquery. It's not a + * legal type elsewhere. */ + +/* + * QueryItem is one node in tsquery - operator or operand. + */ +typedef struct +{ + QueryItemType type; /* operand or kind of operator (ts_tokentype) */ + uint8 weight; /* weights of operand to search. It's a + * bitmask of allowed weights. if it =0 then + * any weight are allowed. Weights and bit + * map: A: 1<<3 B: 1<<2 C: 1<<1 D: 1<<0 */ + bool prefix; /* true if it's a prefix search */ + int32 valcrc; /* XXX: pg_crc32 would be a more appropriate + * data type, but we use comparisons to signed + * integers in the code. They would need to be + * changed as well. */ + + /* pointer to text value of operand, must correlate with WordEntry */ + uint32 + length:12, + distance:20; +} QueryOperand; + + +/* + * Legal values for QueryOperator.operator. + */ +#define OP_NOT 1 +#define OP_AND 2 +#define OP_OR 3 +#define OP_PHRASE 4 /* highest code, tsquery_cleanup.c */ +#define OP_COUNT 4 + +extern PGDLLIMPORT const int tsearch_op_priority[OP_COUNT]; + +/* get operation priority by its code */ +#define OP_PRIORITY(x) ( tsearch_op_priority[(x) - 1] ) +/* get QueryOperator priority */ +#define QO_PRIORITY(x) OP_PRIORITY(((QueryOperator *) (x))->oper) + +typedef struct +{ + QueryItemType type; + int8 oper; /* see above */ + int16 distance; /* distance between agrs for OP_PHRASE */ + uint32 left; /* pointer to left operand. Right operand is + * item + 1, left operand is placed + * item+item->left */ +} QueryOperator; + +/* + * Note: TSQuery is 4-bytes aligned, so make sure there's no fields + * inside QueryItem requiring 8-byte alignment, like int64. + */ +typedef union +{ + QueryItemType type; + QueryOperator qoperator; + QueryOperand qoperand; +} QueryItem; + +/* + * Storage: + * (len)(size)(array of QueryItem)(operands as '\0'-terminated c-strings) + */ + +typedef struct +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int32 size; /* number of QueryItems */ + char data[FLEXIBLE_ARRAY_MEMBER]; /* data starts here */ +} TSQueryData; + +typedef TSQueryData *TSQuery; + +#define HDRSIZETQ ( VARHDRSZ + sizeof(int32) ) + +/* Computes the size of header and all QueryItems. size is the number of + * QueryItems, and lenofoperand is the total length of all operands + */ +#define COMPUTESIZE(size, lenofoperand) ( HDRSIZETQ + (size) * sizeof(QueryItem) + (lenofoperand) ) +#define TSQUERY_TOO_BIG(size, lenofoperand) \ + ((size) > (MaxAllocSize - HDRSIZETQ - (lenofoperand)) / sizeof(QueryItem)) + +/* Returns a pointer to the first QueryItem in a TSQuery */ +#define GETQUERY(x) ((QueryItem*)( (char*)(x)+HDRSIZETQ )) + +/* Returns a pointer to the beginning of operands in a TSQuery */ +#define GETOPERAND(x) ( (char*)GETQUERY(x) + ((TSQuery)(x))->size * sizeof(QueryItem) ) + +/* + * fmgr interface macros + * Note, TSQuery type marked as plain storage, so it can't be toasted + * but PG_DETOAST_DATUM_COPY is used for simplicity + */ + +#define DatumGetTSQuery(X) ((TSQuery) DatumGetPointer(X)) +#define DatumGetTSQueryCopy(X) ((TSQuery) PG_DETOAST_DATUM_COPY(X)) +#define TSQueryGetDatum(X) PointerGetDatum(X) +#define PG_GETARG_TSQUERY(n) DatumGetTSQuery(PG_GETARG_DATUM(n)) +#define PG_GETARG_TSQUERY_COPY(n) DatumGetTSQueryCopy(PG_GETARG_DATUM(n)) +#define PG_RETURN_TSQUERY(x) return TSQueryGetDatum(x) + +#endif /* _PG_TSTYPE_H_ */ diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h new file mode 100644 index 0000000..c36c711 --- /dev/null +++ b/src/include/tsearch/ts_utils.h @@ -0,0 +1,266 @@ +/*------------------------------------------------------------------------- + * + * ts_utils.h + * helper utilities for tsearch + * + * Copyright (c) 1998-2022, PostgreSQL Global Development Group + * + * src/include/tsearch/ts_utils.h + * + *------------------------------------------------------------------------- + */ +#ifndef _PG_TS_UTILS_H_ +#define _PG_TS_UTILS_H_ + +#include "nodes/pg_list.h" +#include "tsearch/ts_public.h" +#include "tsearch/ts_type.h" + +/* + * Common parse definitions for tsvector and tsquery + */ + +/* tsvector parser support. */ + +struct TSVectorParseStateData; /* opaque struct in tsvector_parser.c */ +typedef struct TSVectorParseStateData *TSVectorParseState; + +#define P_TSV_OPR_IS_DELIM (1 << 0) +#define P_TSV_IS_TSQUERY (1 << 1) +#define P_TSV_IS_WEB (1 << 2) + +extern TSVectorParseState init_tsvector_parser(char *input, int flags); +extern void reset_tsvector_parser(TSVectorParseState state, char *input); +extern bool gettoken_tsvector(TSVectorParseState state, + char **token, int *len, + WordEntryPos **pos, int *poslen, + char **endptr); +extern void close_tsvector_parser(TSVectorParseState state); + +/* phrase operator begins with '<' */ +#define ISOPERATOR(x) \ + ( pg_mblen(x) == 1 && ( *(x) == '!' || \ + *(x) == '&' || \ + *(x) == '|' || \ + *(x) == '(' || \ + *(x) == ')' || \ + *(x) == '<' \ + ) ) + +/* parse_tsquery */ + +struct TSQueryParserStateData; /* private in backend/utils/adt/tsquery.c */ +typedef struct TSQueryParserStateData *TSQueryParserState; + +typedef void (*PushFunction) (Datum opaque, TSQueryParserState state, + char *token, int tokenlen, + int16 tokenweights, /* bitmap as described in + * QueryOperand struct */ + bool prefix); + +#define P_TSQ_PLAIN (1 << 0) +#define P_TSQ_WEB (1 << 1) + +extern TSQuery parse_tsquery(char *buf, + PushFunction pushval, + Datum opaque, + int flags); + +/* Functions for use by PushFunction implementations */ +extern void pushValue(TSQueryParserState state, + char *strval, int lenval, int16 weight, bool prefix); +extern void pushStop(TSQueryParserState state); +extern void pushOperator(TSQueryParserState state, int8 oper, int16 distance); + +/* + * parse plain text and lexize words + */ +typedef struct +{ + uint16 len; + uint16 nvariant; + union + { + uint16 pos; + + /* + * When apos array is used, apos[0] is the number of elements in the + * array (excluding apos[0]), and alen is the allocated size of the + * array. + */ + uint16 *apos; + } pos; + uint16 flags; /* currently, only TSL_PREFIX */ + char *word; + uint32 alen; +} ParsedWord; + +typedef struct +{ + ParsedWord *words; + int32 lenwords; + int32 curwords; + int32 pos; +} ParsedText; + +extern void parsetext(Oid cfgId, ParsedText *prs, char *buf, int32 buflen); + +/* + * headline framework, flow in common to generate: + * 1 parse text with hlparsetext + * 2 parser-specific function to find part + * 3 generateHeadline to generate result text + */ + +extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, + char *buf, int32 buflen); +extern text *generateHeadline(HeadlineParsedText *prs); + +/* + * TSQuery execution support + * + * TS_execute() executes a tsquery against data that can be represented in + * various forms. The TSExecuteCallback callback function is called to check + * whether a given primitive tsquery value is matched in the data. + */ + +/* TS_execute requires ternary logic to handle NOT with phrase matches */ +typedef enum +{ + TS_NO, /* definitely no match */ + TS_YES, /* definitely does match */ + TS_MAYBE /* can't verify match for lack of pos data */ +} TSTernaryValue; + +/* + * struct ExecPhraseData is passed to a TSExecuteCallback function if we need + * lexeme position data (because of a phrase-match operator in the tsquery). + * The callback should fill in position data when it returns TS_YES (success). + * If it cannot return position data, it should leave "data" unchanged and + * return TS_MAYBE. The caller of TS_execute() must then arrange for a later + * recheck with position data available. + * + * The reported lexeme positions must be sorted and unique. Callers must only + * consult the position bits of the pos array, ie, WEP_GETPOS(data->pos[i]). + * This allows the returned "pos" to point directly to the WordEntryPos + * portion of a tsvector value. If "allocated" is true then the pos array + * is palloc'd workspace and caller may free it when done. + * + * "negate" means that the pos array contains positions where the query does + * not match, rather than positions where it does. "width" is positive when + * the match is wider than one lexeme. Neither of these fields normally need + * to be touched by TSExecuteCallback functions; they are used for + * phrase-search processing within TS_execute. + * + * All fields of the ExecPhraseData struct are initially zeroed by caller. + */ +typedef struct ExecPhraseData +{ + int npos; /* number of positions reported */ + bool allocated; /* pos points to palloc'd data? */ + bool negate; /* positions are where query is NOT matched */ + WordEntryPos *pos; /* ordered, non-duplicate lexeme positions */ + int width; /* width of match in lexemes, less 1 */ +} ExecPhraseData; + +/* + * Signature for TSQuery lexeme check functions + * + * arg: opaque value passed through from caller of TS_execute + * val: lexeme to test for presence of + * data: to be filled with lexeme positions; NULL if position data not needed + * + * Return TS_YES if lexeme is present in data, TS_MAYBE if it might be + * present, TS_NO if it definitely is not present. If data is not NULL, + * it must be filled with lexeme positions if available. If position data + * is not available, leave *data as zeroes and return TS_MAYBE, never TS_YES. + */ +typedef TSTernaryValue (*TSExecuteCallback) (void *arg, QueryOperand *val, + ExecPhraseData *data); + +/* + * Flag bits for TS_execute + */ +#define TS_EXEC_EMPTY (0x00) +/* + * If TS_EXEC_SKIP_NOT is set, then NOT sub-expressions are automatically + * evaluated to be true. This was formerly the default behavior. It's now + * deprecated because it tends to give silly answers, but some applications + * might still have a use for it. + */ +#define TS_EXEC_SKIP_NOT (0x01) +/* + * If TS_EXEC_PHRASE_NO_POS is set, allow OP_PHRASE to be executed lossily + * in the absence of position information: a true result indicates that the + * phrase might be present. Without this flag, OP_PHRASE always returns + * false if lexeme position information is not available. + */ +#define TS_EXEC_PHRASE_NO_POS (0x02) + +extern bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, + TSExecuteCallback chkcond); +extern TSTernaryValue TS_execute_ternary(QueryItem *curitem, void *arg, + uint32 flags, + TSExecuteCallback chkcond); +extern bool tsquery_requires_match(QueryItem *curitem); + +/* + * to_ts* - text transformation to tsvector, tsquery + */ +extern TSVector make_tsvector(ParsedText *prs); +extern int32 tsCompareString(char *a, int lena, char *b, int lenb, bool prefix); + +/* + * Possible strategy numbers for indexes + * TSearchStrategyNumber - (tsvector|text) @@ tsquery + * TSearchWithClassStrategyNumber - tsvector @@@ tsquery + */ +#define TSearchStrategyNumber 1 +#define TSearchWithClassStrategyNumber 2 + +/* + * TSQuery Utilities + */ +extern QueryItem *clean_NOT(QueryItem *ptr, int32 *len); +extern TSQuery cleanup_tsquery_stopwords(TSQuery in); + +typedef struct QTNode +{ + QueryItem *valnode; + uint32 flags; + int32 nchild; + char *word; + uint32 sign; + struct QTNode **child; +} QTNode; + +/* bits in QTNode.flags */ +#define QTN_NEEDFREE 0x01 +#define QTN_NOCHANGE 0x02 +#define QTN_WORDFREE 0x04 + +typedef uint64 TSQuerySign; + +#define TSQS_SIGLEN (sizeof(TSQuerySign)*BITS_PER_BYTE) + +#define TSQuerySignGetDatum(X) Int64GetDatum((int64) (X)) +#define DatumGetTSQuerySign(X) ((TSQuerySign) DatumGetInt64(X)) +#define PG_RETURN_TSQUERYSIGN(X) return TSQuerySignGetDatum(X) +#define PG_GETARG_TSQUERYSIGN(n) DatumGetTSQuerySign(PG_GETARG_DATUM(n)) + + +extern QTNode *QT2QTN(QueryItem *in, char *operand); +extern TSQuery QTN2QT(QTNode *in); +extern void QTNFree(QTNode *in); +extern void QTNSort(QTNode *in); +extern void QTNTernary(QTNode *in); +extern void QTNBinary(QTNode *in); +extern int QTNodeCompare(QTNode *an, QTNode *bn); +extern QTNode *QTNCopy(QTNode *in); +extern void QTNClearFlags(QTNode *in, uint32 flags); +extern bool QTNEq(QTNode *a, QTNode *b); +extern TSQuerySign makeTSQuerySign(TSQuery a); +extern QTNode *findsubquery(QTNode *root, QTNode *ex, QTNode *subs, + bool *isfind); + +#endif /* _PG_TS_UTILS_H_ */ |