diff options
Diffstat (limited to 'src/backend/tsearch/ts_utils.c')
-rw-r--r-- | src/backend/tsearch/ts_utils.c | 146 |
1 files changed, 146 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c new file mode 100644 index 0000000..ed16a2e --- /dev/null +++ b/src/backend/tsearch/ts_utils.c @@ -0,0 +1,146 @@ +/*------------------------------------------------------------------------- + * + * ts_utils.c + * various support functions + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/ts_utils.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <ctype.h> + +#include "miscadmin.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" + + +/* + * Given the base name and extension of a tsearch config file, return + * its full path name. The base name is assumed to be user-supplied, + * and is checked to prevent pathname attacks. The extension is assumed + * to be safe. + * + * The result is a palloc'd string. + */ +char * +get_tsearch_config_filename(const char *basename, + const char *extension) +{ + char sharepath[MAXPGPATH]; + char *result; + + /* + * We limit the basename to contain a-z, 0-9, and underscores. This may + * be overly restrictive, but we don't want to allow access to anything + * outside the tsearch_data directory, so for instance '/' *must* be + * rejected, and on some platforms '\' and ':' are risky as well. Allowing + * uppercase might result in incompatible behavior between case-sensitive + * and case-insensitive filesystems, and non-ASCII characters create other + * interesting risks, so on the whole a tight policy seems best. + */ + if (strspn(basename, "abcdefghijklmnopqrstuvwxyz0123456789_") != strlen(basename)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid text search configuration file name \"%s\"", + basename))); + + get_share_path(my_exec_path, sharepath); + result = palloc(MAXPGPATH); + snprintf(result, MAXPGPATH, "%s/tsearch_data/%s.%s", + sharepath, basename, extension); + + return result; +} + +/* + * Reads a stop-word file. Each word is run through 'wordop' + * function, if given. wordop may either modify the input in-place, + * or palloc a new version. + */ +void +readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *)) +{ + char **stop = NULL; + + s->len = 0; + if (fname && *fname) + { + char *filename = get_tsearch_config_filename(fname, "stop"); + tsearch_readline_state trst; + char *line; + int reallen = 0; + + if (!tsearch_readline_begin(&trst, filename)) + ereport(ERROR, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("could not open stop-word file \"%s\": %m", + filename))); + + while ((line = tsearch_readline(&trst)) != NULL) + { + char *pbuf = line; + + /* Trim trailing space */ + while (*pbuf && !t_isspace(pbuf)) + pbuf += pg_mblen(pbuf); + *pbuf = '\0'; + + /* Skip empty lines */ + if (*line == '\0') + { + pfree(line); + continue; + } + + if (s->len >= reallen) + { + if (reallen == 0) + { + reallen = 64; + stop = (char **) palloc(sizeof(char *) * reallen); + } + else + { + reallen *= 2; + stop = (char **) repalloc((void *) stop, + sizeof(char *) * reallen); + } + } + + if (wordop) + { + stop[s->len] = wordop(line); + if (stop[s->len] != line) + pfree(line); + } + else + stop[s->len] = line; + + (s->len)++; + } + + tsearch_readline_end(&trst); + pfree(filename); + } + + s->stop = stop; + + /* Sort to allow binary searching */ + if (s->stop && s->len > 0) + qsort(s->stop, s->len, sizeof(char *), pg_qsort_strcmp); +} + +bool +searchstoplist(StopList *s, char *key) +{ + return (s->stop && s->len > 0 && + bsearch(&key, s->stop, s->len, + sizeof(char *), pg_qsort_strcmp)) ? true : false; +} |