summary | refs | log | tree | commit | diff | stats
path: root/src/backend/tsearch/ts_utils.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/tsearch/ts_utils.c')
-rw-r--r-- src/backend/tsearch/ts_utils.c 146
1 files changed, 146 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c
new file mode 100644
index 0000000..ed16a2e
--- /dev/null
+++ b/src/backend/tsearch/ts_utils.c
@@ -0,0 +1,146 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_utils.c
+ * various support functions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/ts_utils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "miscadmin.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_utils.h"
+
+
+/*
+ * Build the full path name of a tsearch configuration file from its base
+ * name and extension.
+ *
+ * basename is treated as user-supplied and is validated so that it cannot
+ * be used for a pathname attack; extension is trusted as given.  The path
+ * has the form "<sharepath>/tsearch_data/<basename>.<extension>", where
+ * sharepath is what get_share_path() produces for my_exec_path.
+ *
+ * Returns a palloc'd buffer of MAXPGPATH bytes holding the path.  Reports
+ * an ERROR (ERRCODE_INVALID_PARAMETER_VALUE) if basename contains any
+ * character outside the permitted set.
+ */
+char *
+get_tsearch_config_filename(const char *basename,
+                            const char *extension)
+{
+    static const char allowed_chars[] = "abcdefghijklmnopqrstuvwxyz0123456789_";
+    char        share_dir[MAXPGPATH];
+    char       *path;
+
+    /*
+     * Only a-z, 0-9 and underscore are accepted in the base name.  That may
+     * be stricter than necessary, but nothing outside the tsearch_data
+     * directory must be reachable: '/' has to be rejected, and '\' and ':'
+     * are risky on some platforms.  Uppercase letters could behave
+     * differently on case-sensitive and case-insensitive filesystems, and
+     * non-ASCII characters bring further risks, so a tight policy is used.
+     *
+     * strspn() yields the length of the leading run of permitted
+     * characters; the name is valid only if that run reaches the
+     * terminating NUL.
+     */
+    if (basename[strspn(basename, allowed_chars)] != '\0')
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("invalid text search configuration file name \"%s\"",
+                        basename)));
+
+    get_share_path(my_exec_path, share_dir);
+
+    path = palloc(MAXPGPATH);
+    snprintf(path, MAXPGPATH, "%s/tsearch_data/%s.%s",
+             share_dir, basename, extension);
+
+    return path;
+}
+
+/*
+ * Load a stop-word file into *s.
+ *
+ * fname is the base name of the file; it is resolved through
+ * get_tsearch_config_filename() with the extension "stop".  If fname is
+ * NULL or empty, the list is left empty (s->len = 0, s->stop = NULL).
+ *
+ * Each non-empty word is passed through 'wordop' when one is supplied.
+ * wordop may either modify its input in place and return it, or return a
+ * newly palloc'd string; in the latter case the original line is freed
+ * here.
+ *
+ * On return s->stop holds the words sorted with pg_qsort_strcmp so that
+ * they can be binary-searched.  Reports an ERROR
+ * (ERRCODE_CONFIG_FILE_ERROR) if the file cannot be opened.
+ */
+void
+readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
+{
+    char       *path;
+    tsearch_readline_state reader;
+    char      **words = NULL;
+    char       *line;
+    int         capacity = 0;
+
+    s->len = 0;
+
+    /* Nothing to read: hand back an empty list. */
+    if (fname == NULL || *fname == '\0')
+    {
+        s->stop = NULL;
+        return;
+    }
+
+    path = get_tsearch_config_filename(fname, "stop");
+
+    if (!tsearch_readline_begin(&reader, path))
+        ereport(ERROR,
+                (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                 errmsg("could not open stop-word file \"%s\": %m",
+                        path)));
+
+    while ((line = tsearch_readline(&reader)) != NULL)
+    {
+        char       *end;
+        char       *word;
+
+        /*
+         * Cut the line at its first whitespace character, stepping one
+         * multibyte character at a time.  Everything from that point on is
+         * discarded.
+         */
+        for (end = line; *end != '\0' && !t_isspace(end); end += pg_mblen(end))
+            ;
+        *end = '\0';
+
+        /* A line with nothing left carries no word; drop it. */
+        if (*line == '\0')
+        {
+            pfree(line);
+            continue;
+        }
+
+        /* Make room: start with 64 slots, then double when full. */
+        if (s->len >= capacity)
+        {
+            if (capacity == 0)
+            {
+                capacity = 64;
+                words = (char **) palloc(sizeof(char *) * capacity);
+            }
+            else
+            {
+                capacity *= 2;
+                words = (char **) repalloc(words,
+                                           sizeof(char *) * capacity);
+            }
+        }
+
+        /*
+         * Apply the caller's transformation, if any.  A result that is a
+         * different pointer means a fresh copy was made, so the raw line
+         * is no longer needed.
+         */
+        word = line;
+        if (wordop)
+        {
+            word = wordop(line);
+            if (word != line)
+                pfree(line);
+        }
+
+        words[s->len] = word;
+        (s->len)++;
+    }
+
+    tsearch_readline_end(&reader);
+    pfree(path);
+
+    s->stop = words;
+
+    /* Sort to allow binary searching */
+    if (words != NULL && s->len > 0)
+        qsort(s->stop, s->len, sizeof(char *), pg_qsort_strcmp);
+}
+
+/*
+ * Report whether 'key' is present in the stop list *s.
+ *
+ * The lookup is a bsearch() over s->stop using pg_qsort_strcmp, so the
+ * array must already be ordered by that comparator.  An empty or unset
+ * list never matches.
+ */
+bool
+searchstoplist(StopList *s, char *key)
+{
+    if (s->stop == NULL || s->len <= 0)
+        return false;
+
+    /* The array elements are char *, so the search key is a pointer to key. */
+    return bsearch(&key, s->stop, s->len,
+                   sizeof(char *), pg_qsort_strcmp) != NULL;
+}