diff options
Diffstat (limited to 'contrib/dict_xsyn')
-rw-r--r-- | contrib/dict_xsyn/.gitignore | 4 | ||||
-rw-r--r-- | contrib/dict_xsyn/Makefile | 24 | ||||
-rw-r--r-- | contrib/dict_xsyn/dict_xsyn--1.0.sql | 25 | ||||
-rw-r--r-- | contrib/dict_xsyn/dict_xsyn.c | 259 | ||||
-rw-r--r-- | contrib/dict_xsyn/dict_xsyn.control | 5 | ||||
-rw-r--r-- | contrib/dict_xsyn/expected/dict_xsyn.out | 142 | ||||
-rw-r--r-- | contrib/dict_xsyn/sql/dict_xsyn.sql | 45 | ||||
-rw-r--r-- | contrib/dict_xsyn/xsyn_sample.rules | 6 |
8 files changed, 510 insertions, 0 deletions
diff --git a/contrib/dict_xsyn/.gitignore b/contrib/dict_xsyn/.gitignore new file mode 100644 index 0000000..5dcb3ff --- /dev/null +++ b/contrib/dict_xsyn/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/contrib/dict_xsyn/Makefile b/contrib/dict_xsyn/Makefile new file mode 100644 index 0000000..b6bcfe6 --- /dev/null +++ b/contrib/dict_xsyn/Makefile @@ -0,0 +1,24 @@ +# contrib/dict_xsyn/Makefile + +MODULE_big = dict_xsyn +OBJS = \ + $(WIN32RES) \ + dict_xsyn.o + +EXTENSION = dict_xsyn +DATA = dict_xsyn--1.0.sql +DATA_TSEARCH = xsyn_sample.rules +PGFILEDESC = "dict_xsyn - add-on dictionary template for full-text search" + +REGRESS = dict_xsyn + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/dict_xsyn +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/dict_xsyn/dict_xsyn--1.0.sql b/contrib/dict_xsyn/dict_xsyn--1.0.sql new file mode 100644 index 0000000..3d6bb51 --- /dev/null +++ b/contrib/dict_xsyn/dict_xsyn--1.0.sql @@ -0,0 +1,25 @@ +/* contrib/dict_xsyn/dict_xsyn--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION dict_xsyn" to load this file. 
\quit
+
+CREATE FUNCTION dxsyn_init(internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE FUNCTION dxsyn_lexize(internal, internal, internal, internal)
+        RETURNS internal
+        AS 'MODULE_PATHNAME'
+        LANGUAGE C STRICT;
+
+CREATE TEXT SEARCH TEMPLATE xsyn_template (
+        LEXIZE = dxsyn_lexize,
+        INIT = dxsyn_init
+);
+
+CREATE TEXT SEARCH DICTIONARY xsyn (
+        TEMPLATE = xsyn_template
+);
+
+COMMENT ON TEXT SEARCH DICTIONARY xsyn IS 'eXtended synonym dictionary';
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
new file mode 100644
index 0000000..79c4f18
--- /dev/null
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -0,0 +1,316 @@
+/*-------------------------------------------------------------------------
+ *
+ * dict_xsyn.c
+ *	  Extended synonym dictionary
+ *
+ * Copyright (c) 2007-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/dict_xsyn/dict_xsyn.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "commands/defrem.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+PG_MODULE_MAGIC;
+
+/* One matchable word and the rule line it was taken from */
+typedef struct
+{
+	char	   *key;			/* Word */
+	char	   *value;			/* Unparsed list of synonyms, including the
+								 * word itself; one copy is shared by all
+								 * keys taken from the same rule line */
+} Syn;
+
+/* State of one dictionary: built by dxsyn_init, read by dxsyn_lexize */
+typedef struct
+{
+	int			len;			/* allocated size of syn[] while loading,
+								 * number of entries afterwards */
+	Syn		   *syn;			/* entries, sorted by key after loading */
+
+	bool		matchorig;		/* may the first word of a rule be matched? */
+	bool		keeporig;		/* is the first word of a rule output? */
+	bool		matchsynonyms;	/* may the other words be matched? */
+	bool		keepsynonyms;	/* are the other words output? */
+} DictSyn;
+
+
+PG_FUNCTION_INFO_V1(dxsyn_init);
+PG_FUNCTION_INFO_V1(dxsyn_lexize);
+
+/*
+ * Locate the next whitespace-delimited word in "in".
+ *
+ * Returns a pointer to the start of the word and sets *end to the first
+ * character after it.  Returns NULL, leaving *end NULL, if only whitespace
+ * is left or the next word begins with '#'; such a word therefore ends the
+ * scan of the line.
+ */
+static char *
+find_word(char *in, char **end)
+{
+	char	   *start;
+
+	*end = NULL;
+	while (*in && t_isspace(in))
+		in += pg_mblen(in);
+
+	if (!*in || *in == '#')
+		return NULL;
+	start = in;
+
+	while (*in && !t_isspace(in))
+		in += pg_mblen(in);
+
+	*end = in;
+
+	return start;
+}
+
+/* qsort/bsearch comparator: orders Syn entries by key using strcmp() */
+static int
+compare_syn(const void *a, const void *b)
+{
+	return strcmp(((const Syn *) a)->key, ((const Syn *) b)->key);
+}
+
+/*
+ * Load the rules file named by "filename" into d->syn.
+ *
+ * The name is resolved by get_tsearch_config_filename() with the "rules"
+ * extension.  Each line is lower-cased and split into words; every word that
+ * may be matched (per d->matchorig and d->matchsynonyms, which must already
+ * be set) becomes an entry whose value is the whole lower-cased line.  The
+ * entries are then sorted by key, as the bsearch() in dxsyn_lexize requires.
+ *
+ * Reports ERRCODE_CONFIG_FILE_ERROR if the file cannot be opened.
+ */
+static void
+read_dictionary(DictSyn *d, const char *filename)
+{
+	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
+	tsearch_readline_state trst;
+	char	   *line;
+	int			cur = 0;
+
+	if (!tsearch_readline_begin(&trst, real_filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open synonym file \"%s\": %m",
+						real_filename)));
+
+	while ((line = tsearch_readline(&trst)) != NULL)
+	{
+		char	   *value;
+		char	   *shared = NULL;	/* copy of value owned by the entries */
+		char	   *key;
+		char	   *pos;
+		char	   *end;
+
+		if (*line == '\0')
+		{
+			/* Release the skipped line, as is done for parsed ones */
+			pfree(line);
+			continue;
+		}
+
+		value = lowerstr(line);
+		pfree(line);
+
+		pos = value;
+		while ((key = find_word(pos, &end)) != NULL)
+		{
+			/* Enlarge syn structure if full */
+			if (cur == d->len)
+			{
+				d->len = (d->len > 0) ? 2 * d->len : 16;
+				if (d->syn)
+					d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
+				else
+					d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
+			}
+
+			/* Save first word only if we will match it */
+			if (pos != value || d->matchorig)
+			{
+				/*
+				 * All keys of a line point at one copy of it, made only once
+				 * a key is really saved, instead of one copy per key.
+				 */
+				if (shared == NULL)
+					shared = pstrdup(value);
+
+				d->syn[cur].key = pnstrdup(key, end - key);
+				d->syn[cur].value = shared;
+
+				cur++;
+			}
+
+			pos = end;
+
+			/* Don't bother scanning synonyms if we will not match them */
+			if (!d->matchsynonyms)
+				break;
+		}
+
+		pfree(value);
+	}
+
+	tsearch_readline_end(&trst);
+
+	/* From here on len is the number of entries, not the allocated size */
+	d->len = cur;
+	if (cur > 1)
+		qsort(d->syn, d->len, sizeof(Syn), compare_syn);
+
+	pfree(real_filename);
+}
+
+/*
+ * Dictionary init method: build a DictSyn from the option list in argument 0.
+ *
+ * Recognized options are the booleans matchorig, keeporig, matchsynonyms and
+ * keepsynonyms (defaults: true, true, false, true) and "rules", which names
+ * the rules file.  Any other option is reported as
+ * ERRCODE_INVALID_PARAMETER_VALUE.  Without a rules option no file is read
+ * and the dictionary stays empty.
+ */
+Datum
+dxsyn_init(PG_FUNCTION_ARGS)
+{
+	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
+	DictSyn    *d;
+	ListCell   *l;
+	char	   *filename = NULL;
+
+	d = (DictSyn *) palloc0(sizeof(DictSyn));
+	d->len = 0;
+	d->syn = NULL;
+	d->matchorig = true;
+	d->keeporig = true;
+	d->matchsynonyms = false;
+	d->keepsynonyms = true;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (strcmp(defel->defname, "matchorig") == 0)
+		{
+			d->matchorig = defGetBoolean(defel);
+		}
+		else if (strcmp(defel->defname, "keeporig") == 0)
+		{
+			d->keeporig = defGetBoolean(defel);
+		}
+		else if (strcmp(defel->defname, "matchsynonyms") == 0)
+		{
+			d->matchsynonyms = defGetBoolean(defel);
+		}
+		else if (strcmp(defel->defname, "keepsynonyms") == 0)
+		{
+			d->keepsynonyms = defGetBoolean(defel);
+		}
+		else if (strcmp(defel->defname, "rules") == 0)
+		{
+			/* we can't read the rules before parsing all options! */
+			filename = defGetString(defel);
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized xsyn parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	if (filename)
+		read_dictionary(d, filename);
+
+	PG_RETURN_POINTER(d);
+}
+
+/*
+ * Dictionary lexize method: look up the word given by argument 1 (text) and
+ * argument 2 (length in bytes) in the DictSyn passed as argument 0.
+ *
+ * Returns NULL if the word or the dictionary is empty, or if the lower-cased
+ * word is not a key.  Otherwise returns a palloc'd TSLexeme array, ended by
+ * an entry whose lexeme is NULL, with the words of the matching rule line as
+ * selected by keeporig and keepsynonyms.  If both are false the array holds
+ * the terminator only.
+ */
+Datum
+dxsyn_lexize(PG_FUNCTION_ARGS)
+{
+	DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
+	char	   *in = (char *) PG_GETARG_POINTER(1);
+	int			length = PG_GETARG_INT32(2);
+	Syn			word;
+	Syn		   *found;
+	TSLexeme   *res = NULL;
+
+	if (!length || d->len == 0)
+		PG_RETURN_POINTER(NULL);
+
+	/* Create search pattern */
+	{
+		char	   *temp = pnstrdup(in, length);
+
+		word.key = lowerstr(temp);
+		pfree(temp);
+		word.value = NULL;
+	}
+
+	/* Look for matching syn */
+	found = (Syn *) bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
+	pfree(word.key);
+
+	if (!found)
+		PG_RETURN_POINTER(NULL);
+
+	/* Parse string of synonyms and return array of words */
+	{
+		char	   *value = found->value;
+		char	   *syn;
+		char	   *pos;
+		char	   *end;
+		int			nsyns = 0;
+
+		res = palloc(sizeof(TSLexeme));
+
+		pos = value;
+		while ((syn = find_word(pos, &end)) != NULL)
+		{
+			/* Keep room for this word and for the terminating entry */
+			res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
+
+			/* The first word is output only if keeporig=true */
+			if (pos != value || d->keeporig)
+			{
+				res[nsyns].lexeme = pnstrdup(syn, end - syn);
+				res[nsyns].nvariant = 0;
+				res[nsyns].flags = 0;
+				nsyns++;
+			}
+
+			pos = end;
+
+			/* Stop if we are not to output the synonyms */
+			if (!d->keepsynonyms)
+				break;
+		}
+		res[nsyns].lexeme = NULL;
+	}
+
+	PG_RETURN_POINTER(res);
+}
diff --git a/contrib/dict_xsyn/dict_xsyn.control b/contrib/dict_xsyn/dict_xsyn.control
new file mode 100644
index 0000000..3fd465a
--- /dev/null
+++ b/contrib/dict_xsyn/dict_xsyn.control
@@ -0,0 +1,5 @@
+# dict_xsyn extension
+comment = 
'text search dictionary template for extended synonym processing' +default_version = '1.0' +module_pathname = '$libdir/dict_xsyn' +relocatable = true diff --git a/contrib/dict_xsyn/expected/dict_xsyn.out b/contrib/dict_xsyn/expected/dict_xsyn.out new file mode 100644 index 0000000..9b95e13 --- /dev/null +++ b/contrib/dict_xsyn/expected/dict_xsyn.out @@ -0,0 +1,142 @@ +CREATE EXTENSION dict_xsyn; +-- default configuration - match first word and return it among with all synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); +--lexize +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- the same, but return only synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word and return all words +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +-------------------------- + {supernova,sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word and return all words except first one +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize 
+---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +---------------- + {sn,sne,1987a} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any synonym but not first word, and return first word instead +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +------------- + {supernova} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- do not match or return anything +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + +-- match any word but return nothing +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); + ts_lexize +----------- + {} +(1 row) + +SELECT ts_lexize('xsyn', 'sn'); + ts_lexize +----------- + {} +(1 row) + +SELECT ts_lexize('xsyn', 'grb'); + ts_lexize +----------- + +(1 row) + diff --git a/contrib/dict_xsyn/sql/dict_xsyn.sql b/contrib/dict_xsyn/sql/dict_xsyn.sql new file mode 100644 index 0000000..4951106 --- /dev/null +++ b/contrib/dict_xsyn/sql/dict_xsyn.sql @@ -0,0 +1,45 @@ +CREATE EXTENSION dict_xsyn; + +-- default configuration - match first word and return it among with all synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); + +--lexize +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + 
+-- the same, but return only synonyms +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word and return all words +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word and return all words except first one +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=true, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any synonym but not first word, and return first word instead +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=true, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- do not match or return anything +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=false, KEEPSYNONYMS=false, MATCHSYNONYMS=false); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); + +-- match any word but return nothing +ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false, MATCHORIG=true, KEEPSYNONYMS=false, MATCHSYNONYMS=true); +SELECT ts_lexize('xsyn', 'supernova'); +SELECT ts_lexize('xsyn', 'sn'); +SELECT ts_lexize('xsyn', 'grb'); diff --git a/contrib/dict_xsyn/xsyn_sample.rules b/contrib/dict_xsyn/xsyn_sample.rules new file mode 100644 index 0000000..203bec7 --- /dev/null +++ b/contrib/dict_xsyn/xsyn_sample.rules @@ -0,0 +1,6 @@ +# Sample rules file for eXtended Synonym (xsyn) dictionary +# format is as 
follows: +# +# word synonym1 synonym2 ... +# +supernova sn sne 1987a |