summary | refs | log | tree | commit | diff | stats
path: root/src/backend/tsearch/ts_locale.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/tsearch/ts_locale.c')
-rw-r--r-- src/backend/tsearch/ts_locale.c 320
1 files changed, 320 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
new file mode 100644
index 0000000..3a475a0
--- /dev/null
+++ b/src/backend/tsearch/ts_locale.c
@@ -0,0 +1,320 @@
+/*-------------------------------------------------------------------------
+ *
+ * ts_locale.c
+ * locale compatibility layer for tsearch
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/ts_locale.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "catalog/pg_collation.h"
+#include "common/string.h"
+#include "storage/fd.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+
+/* Error context callback that tsearch_readline_begin() installs */
+static void tsearch_readline_callback(void *arg);
+
+
+/*
+ * The reason these functions use a 3-wchar_t output buffer, not 2 as you
+ * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
+ * getting from char2wchar() is UTF16 not UTF32. A single input character
+ * may therefore produce a surrogate pair rather than just one wchar_t;
+ * we also need room for a trailing null. When we do get a surrogate pair,
+ * we pass just the first code to iswdigit() etc, so that these functions will
+ * always return false for characters outside the Basic Multilingual Plane.
+ *
+ * In other words: up to two wchar_t for the character itself, plus one for
+ * the terminator.
+ */
+#define WC_BUF_LEN 3
+
+/*
+ * t_isdigit --- is the (possibly multibyte) character at ptr a digit?
+ *
+ * A single-byte character, or any character when the database ctype is C,
+ * is classified by applying isdigit() to its first byte.  Otherwise the
+ * character is converted with char2wchar() and the first resulting wchar_t
+ * is handed to iswdigit().  The result follows the isdigit()/iswdigit()
+ * convention: nonzero means true.
+ */
+int
+t_isdigit(const char *ptr)
+{
+    int         nbytes = pg_mblen(ptr);
+    wchar_t     wbuf[WC_BUF_LEN];
+    pg_locale_t loc = 0;        /* TODO */
+
+    /* Multibyte character under a non-C ctype: classify its wide form */
+    if (nbytes != 1 && !database_ctype_is_c)
+    {
+        char2wchar(wbuf, WC_BUF_LEN, ptr, nbytes, loc);
+
+        return iswdigit((wint_t) wbuf[0]);
+    }
+
+    return isdigit(TOUCHAR(ptr));
+}
+
+/*
+ * t_isspace --- is the (possibly multibyte) character at ptr whitespace?
+ *
+ * A single-byte character, or any character when the database ctype is C,
+ * is classified by applying isspace() to its first byte.  Otherwise the
+ * character is converted with char2wchar() and the first resulting wchar_t
+ * is handed to iswspace().  The result follows the isspace()/iswspace()
+ * convention: nonzero means true.
+ */
+int
+t_isspace(const char *ptr)
+{
+    int         nbytes = pg_mblen(ptr);
+    wchar_t     wbuf[WC_BUF_LEN];
+    pg_locale_t loc = 0;        /* TODO */
+
+    /* Multibyte character under a non-C ctype: classify its wide form */
+    if (nbytes != 1 && !database_ctype_is_c)
+    {
+        char2wchar(wbuf, WC_BUF_LEN, ptr, nbytes, loc);
+
+        return iswspace((wint_t) wbuf[0]);
+    }
+
+    return isspace(TOUCHAR(ptr));
+}
+
+/*
+ * t_isalpha --- is the (possibly multibyte) character at ptr alphabetic?
+ *
+ * A single-byte character, or any character when the database ctype is C,
+ * is classified by applying isalpha() to its first byte.  Otherwise the
+ * character is converted with char2wchar() and the first resulting wchar_t
+ * is handed to iswalpha().  The result follows the isalpha()/iswalpha()
+ * convention: nonzero means true.
+ */
+int
+t_isalpha(const char *ptr)
+{
+    int         nbytes = pg_mblen(ptr);
+    wchar_t     wbuf[WC_BUF_LEN];
+    pg_locale_t loc = 0;        /* TODO */
+
+    /* Multibyte character under a non-C ctype: classify its wide form */
+    if (nbytes != 1 && !database_ctype_is_c)
+    {
+        char2wchar(wbuf, WC_BUF_LEN, ptr, nbytes, loc);
+
+        return iswalpha((wint_t) wbuf[0]);
+    }
+
+    return isalpha(TOUCHAR(ptr));
+}
+
+/*
+ * t_isprint --- is the (possibly multibyte) character at ptr printable?
+ *
+ * A single-byte character, or any character when the database ctype is C,
+ * is classified by applying isprint() to its first byte.  Otherwise the
+ * character is converted with char2wchar() and the first resulting wchar_t
+ * is handed to iswprint().  The result follows the isprint()/iswprint()
+ * convention: nonzero means true.
+ */
+int
+t_isprint(const char *ptr)
+{
+    int         nbytes = pg_mblen(ptr);
+    wchar_t     wbuf[WC_BUF_LEN];
+    pg_locale_t loc = 0;        /* TODO */
+
+    /* Multibyte character under a non-C ctype: classify its wide form */
+    if (nbytes != 1 && !database_ctype_is_c)
+    {
+        char2wchar(wbuf, WC_BUF_LEN, ptr, nbytes, loc);
+
+        return iswprint((wint_t) wbuf[0]);
+    }
+
+    return isprint(TOUCHAR(ptr));
+}
+
+
+/*
+ * Set up to read a file using tsearch_readline(). This facility is
+ * better than just reading the file directly because it provides error
+ * context pointing to the specific line where a problem is detected.
+ *
+ * Expected usage is:
+ *
+ * tsearch_readline_state trst;
+ *
+ * if (!tsearch_readline_begin(&trst, filename))
+ * ereport(ERROR,
+ * (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ * errmsg("could not open stop-word file \"%s\": %m",
+ * filename)));
+ * while ((line = tsearch_readline(&trst)) != NULL)
+ * process line;
+ * tsearch_readline_end(&trst);
+ *
+ * Note that the caller supplies the ereport() for file open failure;
+ * this is so that a custom message can be provided. The filename string
+ * passed to tsearch_readline_begin() must remain valid through
+ * tsearch_readline_end().
+ */
+bool
+tsearch_readline_begin(tsearch_readline_state *stp,
+                       const char *filename)
+{
+    /* Try to open the file; the caller reports the failure if we can't */
+    stp->fp = AllocateFile(filename, "r");
+    if (stp->fp == NULL)
+        return false;
+
+    /* Reset the per-file reading state */
+    stp->filename = filename;
+    stp->lineno = 0;
+    initStringInfo(&stp->buf);
+    stp->curline = NULL;
+
+    /*
+     * Link our callback in at the head of the error context stack, so that
+     * ereport() calls made while reading get file/line context attached.
+     */
+    stp->cb.previous = error_context_stack;
+    stp->cb.arg = (void *) stp;
+    stp->cb.callback = tsearch_readline_callback;
+    error_context_stack = &stp->cb;
+
+    return true;
+}
+
+/*
+ * Read the next line from a tsearch data file (expected to be in UTF-8), and
+ * convert it to database encoding if needed. The returned string is palloc'd.
+ * NULL return means EOF.
+ */
+char *
+tsearch_readline(tsearch_readline_state *stp)
+{
+    char       *converted;
+
+    /* Bump the line counter first, so error reports name the right line */
+    stp->lineno += 1;
+
+    /*
+     * Drop the previous line.  It needs freeing only when it is a separate
+     * conversion result rather than an alias for buf.data.
+     */
+    if (stp->curline != NULL && stp->curline != stp->buf.data)
+        pfree(stp->curline);
+    stp->curline = NULL;
+
+    /* Fetch the next line into buf; give up if there isn't one */
+    if (!pg_get_line_buf(stp->fp, &stp->buf))
+        return NULL;
+
+    /* Check the input as UTF-8 and convert to the DB encoding if needed */
+    converted = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
+
+    /*
+     * Remember the correctly-encoded text for the error context callback.
+     * This may simply be buf.data when no conversion took place.
+     */
+    stp->curline = converted;
+
+    /*
+     * Hand back a fresh pstrdup'd copy in every case.  When
+     * pg_any_to_server() returned buf.data that is plainly required, and it
+     * is still needed after a real conversion: the caller may pfree the
+     * result whenever it likes, and curline must not be left pointing at
+     * recycled storage in case an error is reported later.  Returning the
+     * copy rather than the conversion output also keeps the caller's chunk
+     * no larger than necessary, since conversion results tend to be
+     * over-allocated and callers may store the string in a long-lived
+     * dictionary structure.  The conversion result itself is reclaimed on
+     * the next call.
+     */
+    return pstrdup(converted);
+}
+
+/*
+ * Close down after reading a file with tsearch_readline()
+ */
+void
+tsearch_readline_end(tsearch_readline_state *stp)
+{
+    /*
+     * Forget the current line before doing anything else, so that an error
+     * reported further down cannot make the callback print it.  Free it only
+     * when it is not just an alias for buf.data.
+     */
+    if (stp->curline != NULL && stp->curline != stp->buf.data)
+        pfree(stp->curline);
+    stp->curline = NULL;
+
+    /* Give back the line buffer and close the file */
+    pfree(stp->buf.data);
+    FreeFile(stp->fp);
+
+    /* Unlink our callback from the error context stack */
+    error_context_stack = stp->cb.previous;
+}
+
+/*
+ * Error context callback for errors occurring while reading a tsearch
+ * configuration file.
+ */
+static void
+tsearch_readline_callback(void *arg)
+{
+    tsearch_readline_state *state = (tsearch_readline_state *) arg;
+
+    /*
+     * No current line is available for errors raised inside
+     * tsearch_readline() itself.  Those are mostly encoding violations, and
+     * badly-encoded data must not be echoed into an error message, so report
+     * only the position in that case.
+     */
+    if (state->curline == NULL)
+    {
+        errcontext("line %d of configuration file \"%s\"",
+                   state->lineno,
+                   state->filename);
+        return;
+    }
+
+    errcontext("line %d of configuration file \"%s\": \"%s\"",
+               state->lineno,
+               state->filename,
+               state->curline);
+}
+
+
+/*
+ * lowerstr --- fold null-terminated string to lower case
+ *
+ * Returned string is palloc'd
+ */
+char *
+lowerstr(const char *str)
+{
+    /* Measure the string once and let the length-aware variant do the work */
+    int         nbytes = (int) strlen(str);
+
+    return lowerstr_with_len(str, nbytes);
+}
+
+/*
+ * lowerstr_with_len --- fold string to lower case
+ *
+ * Input string need not be null-terminated.
+ *
+ * Returned string is palloc'd
+ */
+char *
+lowerstr_with_len(const char *str, int len)
+{
+    char       *out;
+    pg_locale_t mylocale = 0;   /* TODO */
+    wchar_t    *wide;
+    wchar_t    *wp;
+    int         nwide;
+    int         outsize;
+    int         nbytes;
+
+    if (len == 0)
+        return pstrdup("");
+
+    /*
+     * The wide-character code below is used only when the encoding's maximum
+     * character length exceeds 1 and the ctype is not C.  Some operating
+     * systems fail with multi-byte encodings and a C locale, and a C locale
+     * needs no multibyte processing anyway.  Everything else is folded one
+     * byte at a time here.
+     */
+    if (pg_database_encoding_max_length() <= 1 || database_ctype_is_c)
+    {
+        int         i;
+
+        out = (char *) palloc(sizeof(char) * (len + 1));
+
+        /* Stop at len bytes or at an embedded null, whichever comes first */
+        for (i = 0; i < len && str[i] != '\0'; i++)
+            out[i] = tolower(TOUCHAR(str + i));
+        out[i] = '\0';
+
+        return out;
+    }
+
+    /*
+     * Size the wide buffer for the worst case: len is a byte count, which is
+     * at least the character count.  One more wchar_t holds the terminating
+     * zero, because wchar2char wants a zero-terminated string.
+     */
+    wide = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
+
+    nwide = char2wchar(wide, len + 1, str, len, mylocale);
+    Assert(nwide <= len);
+
+    /* Fold each wide character in place, up to the terminator */
+    for (wp = wide; *wp; wp++)
+        *wp = towlower((wint_t) *wp);
+
+    /* Size the result for the worst case, plus '\0' */
+    outsize = pg_database_encoding_max_length() * nwide + 1;
+    out = (char *) palloc(outsize);
+
+    nbytes = wchar2char(out, wide, outsize, mylocale);
+
+    pfree(wide);
+
+    if (nbytes < 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                 errmsg("conversion from wchar_t to server encoding failed: %m")));
+    Assert(nbytes < outsize);
+
+    return out;
+}