diff options
Diffstat (limited to 'src/backend/tsearch/ts_locale.c')
-rw-r--r-- | src/backend/tsearch/ts_locale.c | 320 |
1 files changed, 320 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c new file mode 100644 index 0000000..3a475a0 --- /dev/null +++ b/src/backend/tsearch/ts_locale.c @@ -0,0 +1,320 @@ +/*------------------------------------------------------------------------- + * + * ts_locale.c + * locale compatibility layer for tsearch + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/tsearch/ts_locale.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/pg_collation.h" +#include "common/string.h" +#include "storage/fd.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_public.h" + +static void tsearch_readline_callback(void *arg); + + +/* + * The reason these functions use a 3-wchar_t output buffer, not 2 as you + * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be + * getting from char2wchar() is UTF16 not UTF32. A single input character + * may therefore produce a surrogate pair rather than just one wchar_t; + * we also need room for a trailing null. When we do get a surrogate pair, + * we pass just the first code to iswdigit() etc, so that these functions will + * always return false for characters outside the Basic Multilingual Plane. + */ +#define WC_BUF_LEN 3 + +int +t_isdigit(const char *ptr) +{ + int clen = pg_mblen(ptr); + wchar_t character[WC_BUF_LEN]; + pg_locale_t mylocale = 0; /* TODO */ + + if (clen == 1 || database_ctype_is_c) + return isdigit(TOUCHAR(ptr)); + + char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); + + return iswdigit((wint_t) character[0]); +} + +int +t_isspace(const char *ptr) +{ + int clen = pg_mblen(ptr); + wchar_t character[WC_BUF_LEN]; + pg_locale_t mylocale = 0; /* TODO */ + + if (clen == 1 || database_ctype_is_c) + return isspace(TOUCHAR(ptr)); + + char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); + + return iswspace((wint_t) character[0]); +} + +int +t_isalpha(const char *ptr) +{ + int clen = pg_mblen(ptr); + wchar_t character[WC_BUF_LEN]; + pg_locale_t mylocale = 0; /* TODO */ + + if (clen == 1 || database_ctype_is_c) + return isalpha(TOUCHAR(ptr)); + + char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); + + return iswalpha((wint_t) character[0]); +} + +int +t_isprint(const char *ptr) +{ + int clen = pg_mblen(ptr); + wchar_t character[WC_BUF_LEN]; + pg_locale_t mylocale = 0; /* TODO */ + + if (clen == 1 || database_ctype_is_c) + return isprint(TOUCHAR(ptr)); + + char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale); + + return iswprint((wint_t) character[0]); +} + + +/* + * Set up to read a file using tsearch_readline(). This facility is + * better than just reading the file directly because it provides error + * context pointing to the specific line where a problem is detected. + * + * Expected usage is: + * + * tsearch_readline_state trst; + * + * if (!tsearch_readline_begin(&trst, filename)) + * ereport(ERROR, + * (errcode(ERRCODE_CONFIG_FILE_ERROR), + * errmsg("could not open stop-word file \"%s\": %m", + * filename))); + * while ((line = tsearch_readline(&trst)) != NULL) + * process line; + * tsearch_readline_end(&trst); + * + * Note that the caller supplies the ereport() for file open failure; + * this is so that a custom message can be provided. The filename string + * passed to tsearch_readline_begin() must remain valid through + * tsearch_readline_end(). + */ +bool +tsearch_readline_begin(tsearch_readline_state *stp, + const char *filename) +{ + if ((stp->fp = AllocateFile(filename, "r")) == NULL) + return false; + stp->filename = filename; + stp->lineno = 0; + initStringInfo(&stp->buf); + stp->curline = NULL; + /* Setup error traceback support for ereport() */ + stp->cb.callback = tsearch_readline_callback; + stp->cb.arg = (void *) stp; + stp->cb.previous = error_context_stack; + error_context_stack = &stp->cb; + return true; +} + +/* + * Read the next line from a tsearch data file (expected to be in UTF-8), and + * convert it to database encoding if needed. The returned string is palloc'd. + * NULL return means EOF. + */ +char * +tsearch_readline(tsearch_readline_state *stp) +{ + char *recoded; + + /* Advance line number to use in error reports */ + stp->lineno++; + + /* Clear curline, it's no longer relevant */ + if (stp->curline) + { + if (stp->curline != stp->buf.data) + pfree(stp->curline); + stp->curline = NULL; + } + + /* Collect next line, if there is one */ + if (!pg_get_line_buf(stp->fp, &stp->buf)) + return NULL; + + /* Validate the input as UTF-8, then convert to DB encoding if needed */ + recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8); + + /* Save the correctly-encoded string for possible error reports */ + stp->curline = recoded; /* might be equal to buf.data */ + + /* + * We always return a freshly pstrdup'd string. This is clearly necessary + * if pg_any_to_server() returned buf.data, and we need a second copy even + * if encoding conversion did occur. The caller is entitled to pfree the + * returned string at any time, which would leave curline pointing to + * recycled storage, causing problems if an error occurs after that point. + * (It's preferable to return the result of pstrdup instead of the output + * of pg_any_to_server, because the conversion result tends to be + * over-allocated. Since callers might save the result string directly + * into a long-lived dictionary structure, we don't want it to be a larger + * palloc chunk than necessary. We'll reclaim the conversion result on + * the next call.) + */ + return pstrdup(recoded); +} + +/* + * Close down after reading a file with tsearch_readline() + */ +void +tsearch_readline_end(tsearch_readline_state *stp) +{ + /* Suppress use of curline in any error reported below */ + if (stp->curline) + { + if (stp->curline != stp->buf.data) + pfree(stp->curline); + stp->curline = NULL; + } + + /* Release other resources */ + pfree(stp->buf.data); + FreeFile(stp->fp); + + /* Pop the error context stack */ + error_context_stack = stp->cb.previous; +} + +/* + * Error context callback for errors occurring while reading a tsearch + * configuration file. + */ +static void +tsearch_readline_callback(void *arg) +{ + tsearch_readline_state *stp = (tsearch_readline_state *) arg; + + /* + * We can't include the text of the config line for errors that occur + * during tsearch_readline() itself. The major cause of such errors is + * encoding violations, and we daren't try to print error messages + * containing badly-encoded data. + */ + if (stp->curline) + errcontext("line %d of configuration file \"%s\": \"%s\"", + stp->lineno, + stp->filename, + stp->curline); + else + errcontext("line %d of configuration file \"%s\"", + stp->lineno, + stp->filename); +} + + +/* + * lowerstr --- fold null-terminated string to lower case + * + * Returned string is palloc'd + */ +char * +lowerstr(const char *str) +{ + return lowerstr_with_len(str, strlen(str)); +} + +/* + * lowerstr_with_len --- fold string to lower case + * + * Input string need not be null-terminated. + * + * Returned string is palloc'd + */ +char * +lowerstr_with_len(const char *str, int len) +{ + char *out; + pg_locale_t mylocale = 0; /* TODO */ + + if (len == 0) + return pstrdup(""); + + /* + * Use wide char code only when max encoding length > 1 and ctype != C. + * Some operating systems fail with multi-byte encodings and a C locale. + * Also, for a C locale there is no need to process as multibyte. From + * backend/utils/adt/oracle_compat.c Teodor + */ + if (pg_database_encoding_max_length() > 1 && !database_ctype_is_c) + { + wchar_t *wstr, + *wptr; + int wlen; + + /* + * alloc number of wchar_t for worst case, len contains number of + * bytes >= number of characters and alloc 1 wchar_t for 0, because + * wchar2char wants zero-terminated string + */ + wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1)); + + wlen = char2wchar(wstr, len + 1, str, len, mylocale); + Assert(wlen <= len); + + while (*wptr) + { + *wptr = towlower((wint_t) *wptr); + wptr++; + } + + /* + * Alloc result string for worst case + '\0' + */ + len = pg_database_encoding_max_length() * wlen + 1; + out = (char *) palloc(len); + + wlen = wchar2char(out, wstr, len, mylocale); + + pfree(wstr); + + if (wlen < 0) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("conversion from wchar_t to server encoding failed: %m"))); + Assert(wlen < len); + } + else + { + const char *ptr = str; + char *outptr; + + outptr = out = (char *) palloc(sizeof(char) * (len + 1)); + while ((ptr - str) < len && *ptr) + { + *outptr++ = tolower(TOUCHAR(ptr)); + ptr++; + } + *outptr = '\0'; + } + + return out; +} |