summaryrefslogtreecommitdiffstats
path: root/src/backend/utils/mb/conv.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r-- src/backend/utils/mb/conv.c | 838
1 files changed, 838 insertions, 0 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
new file mode 100644
index 0000000..82bc1ac
--- /dev/null
+++ b/src/backend/utils/mb/conv.c
@@ -0,0 +1,838 @@
+/*-------------------------------------------------------------------------
+ *
+ * Utility functions for conversion procs.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mb/conv.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "mb/pg_wchar.h"
+
+
+/*
+ * local2local: table-driven conversion between two single-byte encodings
+ * that are both supersets of ASCII.
+ *
+ * l              source bytes; 'len' of them are examined
+ * p              output area (caller must make it large enough, including
+ *                room for the terminating NUL stored here)
+ * len            number of source bytes
+ * src_encoding   PG identifier of the source encoding
+ * dest_encoding  PG identifier of the target encoding
+ * tab            lookup table indexed by (source byte - HIGHBIT), i.e. its
+ *                first entry belongs to byte 0x80; an entry holds the
+ *                target-encoding byte, or 0 if there is no equivalent
+ * noError        if true, stop quietly at the first problem byte instead
+ *                of reporting it
+ *
+ * Bytes without the high bit are copied unchanged; all other bytes go
+ * through 'tab'.  A zero byte in the input is treated as invalid.
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
+ */
+int
+local2local(const unsigned char *l,
+            unsigned char *p,
+            int len,
+            int src_encoding,
+            int dest_encoding,
+            const unsigned char *tab,
+            bool noError)
+{
+    const unsigned char *const begin = l;
+
+    for (; len > 0; l++, len--)
+    {
+        unsigned char src = *l;
+        unsigned char dst;
+
+        /* embedded NUL bytes are never acceptable input */
+        if (src == 0)
+        {
+            if (noError)
+                break;
+            report_invalid_encoding(src_encoding, (const char *) l, len);
+        }
+
+        if (IS_HIGHBIT_SET(src))
+        {
+            /* non-ASCII: translate through the table */
+            dst = tab[src - HIGHBIT];
+            if (dst == 0)
+            {
+                if (noError)
+                    break;
+                report_untranslatable_char(src_encoding, dest_encoding,
+                                           (const char *) l, len);
+            }
+            else
+                *p++ = dst;
+        }
+        else
+        {
+            /* ASCII passes straight through */
+            *p++ = src;
+        }
+    }
+    *p = '\0';
+
+    return l - begin;
+}
+
+/*
+ * LATINn ---> MIC, for charsets whose local codes are used unchanged as
+ * the second byte of the MIC representation.
+ *
+ * l         source bytes; 'len' of them are examined
+ * p         output area (caller must make it large enough, including room
+ *           for the terminating NUL stored here)
+ * len       number of source bytes
+ * lc        mule character set id for the local encoding; emitted in front
+ *           of every high-bit byte
+ * encoding  PG identifier for the local encoding
+ * noError   if true, stop quietly at the first problem byte instead of
+ *           reporting it
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
+ */
+int
+latin2mic(const unsigned char *l, unsigned char *p, int len,
+          int lc, int encoding, bool noError)
+{
+    const unsigned char *const begin = l;
+
+    for (; len > 0; l++, len--)
+    {
+        int         ch = *l;
+
+        /* embedded NUL bytes are never acceptable input */
+        if (ch == 0)
+        {
+            if (noError)
+                break;
+            report_invalid_encoding(encoding, (const char *) l, len);
+        }
+
+        if (!IS_HIGHBIT_SET(ch))
+        {
+            /* ASCII is represented by itself */
+            *p++ = ch;
+            continue;
+        }
+
+        /* high-bit byte: charset id first, then the byte itself */
+        p[0] = lc;
+        p[1] = ch;
+        p += 2;
+    }
+    *p = '\0';
+
+    return l - begin;
+}
+
+/*
+ * MIC ---> LATINn, for charsets whose local codes are used unchanged as
+ * the second byte of the MIC representation.
+ *
+ * mic       source bytes; 'len' of them are examined
+ * p         output area (caller must make it large enough, including room
+ *           for the terminating NUL stored here)
+ * len       number of source bytes
+ * lc        mule character set id for the local encoding; only two-byte
+ *           MIC characters carrying this id can be converted
+ * encoding  PG identifier for the local encoding
+ * noError   if true, stop quietly at the first problem character instead
+ *           of reporting it
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
+ */
+int
+mic2latin(const unsigned char *mic, unsigned char *p, int len,
+          int lc, int encoding, bool noError)
+{
+    const unsigned char *const begin = mic;
+
+    while (len > 0)
+    {
+        int         lead = *mic;
+        int         mblen;
+
+        /* embedded NUL bytes are never acceptable input */
+        if (lead == 0)
+        {
+            if (noError)
+                break;
+            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+        }
+
+        if (!IS_HIGHBIT_SET(lead))
+        {
+            /* ASCII: copy one byte and move on */
+            *p++ = lead;
+            mic++;
+            len--;
+            continue;
+        }
+
+        /* multibyte MIC character: make sure all of it is present */
+        mblen = pg_mule_mblen(mic);
+        if (mblen > len)
+        {
+            if (noError)
+                break;
+            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+                                    len);
+        }
+
+        /* it must be <lc, high-bit byte> to be convertible */
+        if (!(mblen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1])))
+        {
+            if (noError)
+                break;
+            report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+                                       (const char *) mic, len);
+        }
+
+        /* drop the charset id, keep the code byte */
+        *p++ = mic[1];
+        mic += 2;
+        len -= 2;
+    }
+    *p = '\0';
+
+    return mic - begin;
+}
+
+
+/*
+ * latin2mic_with_table: table-driven conversion from a single-byte local
+ * charset to the mule internal code.
+ *
+ * l         source bytes; 'len' of them are examined
+ * p         output area (caller must make it large enough, including room
+ *           for the terminating NUL stored here)
+ * len       number of source bytes
+ * lc        mule character set id for the local encoding; emitted in front
+ *           of every translated byte
+ * encoding  PG identifier for the local encoding
+ * tab       lookup table indexed by (source byte - HIGHBIT), i.e. its first
+ *           entry belongs to byte 0x80; an entry holds the corresponding
+ *           code for the mule encoding, or 0 if there is no equivalent
+ * noError   if true, stop quietly at the first problem byte instead of
+ *           reporting it
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
+ */
+int
+latin2mic_with_table(const unsigned char *l,
+                     unsigned char *p,
+                     int len,
+                     int lc,
+                     int encoding,
+                     const unsigned char *tab,
+                     bool noError)
+{
+    const unsigned char *const begin = l;
+
+    for (; len > 0; l++, len--)
+    {
+        unsigned char src = *l;
+        unsigned char mapped;
+
+        /* embedded NUL bytes are never acceptable input */
+        if (src == 0)
+        {
+            if (noError)
+                break;
+            report_invalid_encoding(encoding, (const char *) l, len);
+        }
+
+        if (IS_HIGHBIT_SET(src))
+        {
+            /* non-ASCII: translate, then prefix with the charset id */
+            mapped = tab[src - HIGHBIT];
+            if (mapped == 0)
+            {
+                if (noError)
+                    break;
+                report_untranslatable_char(encoding, PG_MULE_INTERNAL,
+                                           (const char *) l, len);
+            }
+            else
+            {
+                p[0] = lc;
+                p[1] = mapped;
+                p += 2;
+            }
+        }
+        else
+        {
+            /* ASCII is represented by itself */
+            *p++ = src;
+        }
+    }
+    *p = '\0';
+
+    return l - begin;
+}
+
+/*
+ * mic2latin_with_table: table-driven conversion from the mule internal
+ * code to a single-byte local charset.
+ *
+ * mic       source bytes; 'len' of them are examined
+ * p         output area (caller must make it large enough, including room
+ *           for the terminating NUL stored here)
+ * len       number of source bytes
+ * lc        mule character set id for the local encoding; only two-byte
+ *           MIC characters carrying this id can be converted
+ * encoding  PG identifier for the local encoding
+ * tab       lookup table indexed by (second MIC byte - HIGHBIT), i.e. its
+ *           first entry belongs to byte 0x80; an entry holds the
+ *           corresponding code for the local charset, or 0 if there is no
+ *           equivalent
+ * noError   if true, stop quietly at the first problem character instead
+ *           of reporting it
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
+ */
+int
+mic2latin_with_table(const unsigned char *mic,
+                     unsigned char *p,
+                     int len,
+                     int lc,
+                     int encoding,
+                     const unsigned char *tab,
+                     bool noError)
+{
+    const unsigned char *const begin = mic;
+
+    while (len > 0)
+    {
+        unsigned char lead = *mic;
+        unsigned char mapped;
+        int         mblen;
+
+        /* embedded NUL bytes are never acceptable input */
+        if (lead == 0)
+        {
+            if (noError)
+                break;
+            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+        }
+
+        if (!IS_HIGHBIT_SET(lead))
+        {
+            /* ASCII: copy one byte and move on */
+            *p++ = lead;
+            mic++;
+            len--;
+            continue;
+        }
+
+        /* multibyte MIC character: make sure all of it is present */
+        mblen = pg_mule_mblen(mic);
+        if (mblen > len)
+        {
+            if (noError)
+                break;
+            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+                                    len);
+        }
+
+        /*
+         * Only <lc, high-bit byte> pairs are looked up in the table; any
+         * other shape counts as having no mapping.
+         */
+        if (mblen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1]))
+            mapped = tab[mic[1] - HIGHBIT];
+        else
+            mapped = 0;
+
+        if (mapped == 0)
+        {
+            if (noError)
+                break;
+            report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+                                       (const char *) mic, len);
+            break;              /* keep compiler quiet */
+        }
+
+        *p++ = mapped;
+        mic += 2;
+        len -= 2;
+    }
+    *p = '\0';
+
+    return mic - begin;
+}
+
+/*
+ * bsearch() comparator for the combined UTF8 -> local code map.
+ *
+ * p1 is the search key: two consecutive uint32 values (first and second
+ * UTF8 character).  p2 is a pg_utf_to_local_combined entry.  Ordering is
+ * by utf1 first and by utf2 on ties.
+ *
+ * Returns 1, 0 or -1 as the key sorts after, equal to, or before the entry.
+ */
+static int
+compare3(const void *p1, const void *p2)
+{
+    const uint32 *key = (const uint32 *) p1;
+    const pg_utf_to_local_combined *entry = (const pg_utf_to_local_combined *) p2;
+    const uint32 key_first = key[0];
+    const uint32 key_second = key[1];
+    const uint32 entry_first = entry->utf1;
+    const uint32 entry_second = entry->utf2;
+
+    /* primary sort key */
+    if (key_first != entry_first)
+        return (key_first > entry_first) ? 1 : -1;
+
+    /* tie-break on the second character */
+    if (key_second != entry_second)
+        return (key_second > entry_second) ? 1 : -1;
+
+    return 0;
+}
+
+/*
+ * bsearch() comparator for the local code -> combined UTF8 map.
+ *
+ * p1 is the search key, a single uint32 local code.  p2 is a
+ * pg_local_to_utf_combined entry, compared by its 'code' field.
+ *
+ * Returns 1, 0 or -1 as the key sorts after, equal to, or before the entry.
+ */
+static int
+compare4(const void *p1, const void *p2)
+{
+    const uint32 wanted = *(const uint32 *) p1;
+    const uint32 have = ((const pg_local_to_utf_combined *) p2)->code;
+
+    if (wanted == have)
+        return 0;
+
+    return (wanted > have) ? 1 : -1;
+}
+
+/*
+ * Store a 32-bit character representation into a multibyte stream.
+ *
+ * The four bytes of 'code' are visited from most to least significant and
+ * every nonzero byte is appended at 'dest'; zero bytes are skipped.
+ *
+ * Returns the position just past the last byte written.
+ */
+static inline unsigned char *
+store_coded_char(unsigned char *dest, uint32 code)
+{
+    int         shift;
+
+    for (shift = 24; shift >= 0; shift -= 8)
+    {
+        unsigned char octet = (unsigned char) ((code >> shift) & 0xff);
+
+        if (octet != 0)
+            *dest++ = octet;
+    }
+
+    return dest;
+}
+
+/*
+ * Convert a character using a conversion radix tree.
+ *
+ * 'l' is the length of the input character in bytes, and b1-b4 are the
+ * input character's bytes, right-aligned: a 1-byte character is passed in
+ * b4, a 2-byte character in b3/b4, a 3-byte character in b2/b3/b4.
+ *
+ * Each byte is first checked against the per-level lower/upper bounds kept
+ * in the tree.  The lookup then walks one table level per byte, starting
+ * at the root offset for that length.  The table is rt->chars32 when that
+ * pointer is set, rt->chars16 otherwise.
+ *
+ * Returns the table entry found, or 0 if a byte is out of range or 'l' is
+ * not 1..4.
+ */
+static inline uint32
+pg_mb_radix_conv(const pg_mb_radix_tree *rt,
+                 int l,
+                 unsigned char b1,
+                 unsigned char b2,
+                 unsigned char b3,
+                 unsigned char b4)
+{
+    switch (l)
+    {
+        case 4:
+            /* 4-byte code: every byte must be inside its level's range */
+            if (!(b1 >= rt->b4_1_lower && b1 <= rt->b4_1_upper &&
+                  b2 >= rt->b4_2_lower && b2 <= rt->b4_2_upper &&
+                  b3 >= rt->b4_3_lower && b3 <= rt->b4_3_upper &&
+                  b4 >= rt->b4_4_lower && b4 <= rt->b4_4_upper))
+                return 0;
+
+            if (rt->chars32)
+            {
+                uint32      node = rt->b4root;
+
+                node = rt->chars32[b1 + node - rt->b4_1_lower];
+                node = rt->chars32[b2 + node - rt->b4_2_lower];
+                node = rt->chars32[b3 + node - rt->b4_3_lower];
+                return rt->chars32[b4 + node - rt->b4_4_lower];
+            }
+            else
+            {
+                uint16      node = rt->b4root;
+
+                node = rt->chars16[b1 + node - rt->b4_1_lower];
+                node = rt->chars16[b2 + node - rt->b4_2_lower];
+                node = rt->chars16[b3 + node - rt->b4_3_lower];
+                return rt->chars16[b4 + node - rt->b4_4_lower];
+            }
+
+        case 3:
+            /* 3-byte code, carried in b2..b4 */
+            if (!(b2 >= rt->b3_1_lower && b2 <= rt->b3_1_upper &&
+                  b3 >= rt->b3_2_lower && b3 <= rt->b3_2_upper &&
+                  b4 >= rt->b3_3_lower && b4 <= rt->b3_3_upper))
+                return 0;
+
+            if (rt->chars32)
+            {
+                uint32      node = rt->b3root;
+
+                node = rt->chars32[b2 + node - rt->b3_1_lower];
+                node = rt->chars32[b3 + node - rt->b3_2_lower];
+                return rt->chars32[b4 + node - rt->b3_3_lower];
+            }
+            else
+            {
+                uint16      node = rt->b3root;
+
+                node = rt->chars16[b2 + node - rt->b3_1_lower];
+                node = rt->chars16[b3 + node - rt->b3_2_lower];
+                return rt->chars16[b4 + node - rt->b3_3_lower];
+            }
+
+        case 2:
+            /* 2-byte code, carried in b3..b4 */
+            if (!(b3 >= rt->b2_1_lower && b3 <= rt->b2_1_upper &&
+                  b4 >= rt->b2_2_lower && b4 <= rt->b2_2_upper))
+                return 0;
+
+            if (rt->chars32)
+            {
+                uint32      node = rt->b2root;
+
+                node = rt->chars32[b3 + node - rt->b2_1_lower];
+                return rt->chars32[b4 + node - rt->b2_2_lower];
+            }
+            else
+            {
+                uint16      node = rt->b2root;
+
+                node = rt->chars16[b3 + node - rt->b2_1_lower];
+                return rt->chars16[b4 + node - rt->b2_2_lower];
+            }
+
+        case 1:
+            /* 1-byte code, carried in b4 */
+            if (!(b4 >= rt->b1_lower && b4 <= rt->b1_upper))
+                return 0;
+
+            if (rt->chars32)
+                return rt->chars32[b4 + rt->b1root - rt->b1_lower];
+            else
+                return rt->chars16[b4 + rt->b1root - rt->b1_lower];
+
+        default:
+            break;
+    }
+
+    return 0;                   /* shouldn't happen */
+}
+
+/*
+ * UTF8 ---> local code
+ *
+ * utf: input string in UTF8 encoding (need not be null-terminated)
+ * len: length of input string (in bytes)
+ * iso: pointer to the output area (must be large enough!)
+ *        (output string will be null-terminated)
+ * map: conversion map for single characters
+ * cmap: conversion map for combined characters
+ *        (optional, pass NULL if none)
+ * cmapsize: number of entries in the conversion map for combined characters
+ *        (optional, pass 0 if none)
+ * conv_func: algorithmic encoding conversion function
+ *        (optional, pass NULL if none)
+ * encoding: PG identifier for the local encoding
+ * noError: if true, stop at the first invalid or untranslatable character
+ *        instead of reporting it
+ *
+ * For each character, the cmap (if provided) is consulted first; if no match,
+ * the map is consulted next; if still no match, the conv_func (if provided)
+ * is applied.  An error is raised if no match is found.
+ *
+ * ASCII bytes are copied to the output unchanged without consulting any map.
+ *
+ * See pg_wchar.h for more details about the data structures used here.
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
+ */
+int
+UtfToLocal(const unsigned char *utf, int len,
+           unsigned char *iso,
+           const pg_mb_radix_tree *map,
+           const pg_utf_to_local_combined *cmap, int cmapsize,
+           utf_local_conversion_func conv_func,
+           int encoding, bool noError)
+{
+    uint32      iutf;
+    int         l;
+    const pg_utf_to_local_combined *cp;
+    const unsigned char *start = utf;
+
+    if (!PG_VALID_ENCODING(encoding))
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("invalid encoding number: %d", encoding)));
+
+    for (; len > 0; len -= l)
+    {
+        unsigned char b1 = 0;
+        unsigned char b2 = 0;
+        unsigned char b3 = 0;
+        unsigned char b4 = 0;
+
+        /* "break" cases all represent errors */
+        if (*utf == '\0')
+            break;
+
+        l = pg_utf_mblen(utf);
+        if (len < l)
+            break;
+
+        if (!pg_utf8_islegal(utf, l))
+            break;
+
+        if (l == 1)
+        {
+            /* ASCII case is easy, assume it's one-to-one conversion */
+            *iso++ = *utf++;
+            continue;
+        }
+
+        /* collect coded char of length l, right-aligned in b1..b4 */
+        if (l == 2)
+        {
+            b3 = *utf++;
+            b4 = *utf++;
+        }
+        else if (l == 3)
+        {
+            b2 = *utf++;
+            b3 = *utf++;
+            b4 = *utf++;
+        }
+        else if (l == 4)
+        {
+            b1 = *utf++;
+            b2 = *utf++;
+            b3 = *utf++;
+            b4 = *utf++;
+        }
+        else
+        {
+            elog(ERROR, "unsupported character length %d", l);
+        }
+
+        /*
+         * Widen each byte to uint32 before shifting.  Shifting the promoted
+         * (signed) int would overflow for a 4-byte lead byte, which always
+         * has its top bit set, and that is undefined behavior in C.
+         */
+        iutf = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
+            ((uint32) b3 << 8) | (uint32) b4;
+
+        /* First, try with combined map if possible */
+        if (cmap && len > l)
+        {
+            const unsigned char *utf_save = utf;
+            int         len_save = len;
+            int         l_save = l;
+
+            /* collect next character, same as above */
+            len -= l;
+
+            l = pg_utf_mblen(utf);
+            if (len < l)
+            {
+                /* need more data to decide if this is a combined char */
+                utf -= l_save;
+                break;
+            }
+
+            if (!pg_utf8_islegal(utf, l))
+            {
+                if (!noError)
+                    report_invalid_encoding(PG_UTF8, (const char *) utf, len);
+                utf -= l_save;
+                break;
+            }
+
+            /* We assume ASCII character cannot be in combined map */
+            if (l > 1)
+            {
+                uint32      iutf2;
+                uint32      cutf[2];
+
+                /* same widening as for iutf above */
+                if (l == 2)
+                {
+                    iutf2 = ((uint32) *utf++) << 8;
+                    iutf2 |= (uint32) *utf++;
+                }
+                else if (l == 3)
+                {
+                    iutf2 = ((uint32) *utf++) << 16;
+                    iutf2 |= ((uint32) *utf++) << 8;
+                    iutf2 |= (uint32) *utf++;
+                }
+                else if (l == 4)
+                {
+                    iutf2 = ((uint32) *utf++) << 24;
+                    iutf2 |= ((uint32) *utf++) << 16;
+                    iutf2 |= ((uint32) *utf++) << 8;
+                    iutf2 |= (uint32) *utf++;
+                }
+                else
+                {
+                    elog(ERROR, "unsupported character length %d", l);
+                    iutf2 = 0;  /* keep compiler quiet */
+                }
+
+                /* search key for compare3: first char, then second char */
+                cutf[0] = iutf;
+                cutf[1] = iutf2;
+
+                cp = bsearch(cutf, cmap, cmapsize,
+                             sizeof(pg_utf_to_local_combined), compare3);
+
+                if (cp)
+                {
+                    /*
+                     * Both characters are consumed: len was already reduced
+                     * by the first one above, and the loop step subtracts
+                     * the second one's length.
+                     */
+                    iso = store_coded_char(iso, cp->code);
+                    continue;
+                }
+            }
+
+            /* fail, so back up to reprocess second character next time */
+            utf = utf_save;
+            len = len_save;
+            l = l_save;
+        }
+
+        /* Now check ordinary map */
+        if (map)
+        {
+            uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
+
+            if (converted)
+            {
+                iso = store_coded_char(iso, converted);
+                continue;
+            }
+        }
+
+        /* if there's a conversion function, try that */
+        if (conv_func)
+        {
+            uint32      converted = (*conv_func) (iutf);
+
+            if (converted)
+            {
+                iso = store_coded_char(iso, converted);
+                continue;
+            }
+        }
+
+        /* failed to translate this character; point back at its start */
+        utf -= l;
+        if (noError)
+            break;
+        report_untranslatable_char(PG_UTF8, encoding,
+                                   (const char *) utf, len);
+    }
+
+    /* if we broke out of loop early, must be invalid input */
+    if (len > 0 && !noError)
+        report_invalid_encoding(PG_UTF8, (const char *) utf, len);
+
+    *iso = '\0';
+
+    return utf - start;
+}
+
+/*
+ * local code ---> UTF8
+ *
+ * iso: input string in local encoding (need not be null-terminated)
+ * len: length of input string (in bytes)
+ * utf: pointer to the output area (must be large enough!)
+ *        (output string will be null-terminated)
+ * map: conversion map for single characters
+ * cmap: conversion map for combined characters
+ *        (optional, pass NULL if none)
+ * cmapsize: number of entries in the conversion map for combined characters
+ *        (optional, pass 0 if none)
+ * conv_func: algorithmic encoding conversion function
+ *        (optional, pass NULL if none)
+ * encoding: PG identifier for the local encoding
+ * noError: if true, stop at the first invalid or untranslatable character
+ *        instead of reporting it
+ *
+ * For each character, the map is consulted first; if no match, the cmap
+ * (if provided) is consulted next; if still no match, the conv_func
+ * (if provided) is applied.  An error is raised if no match is found.
+ * Note that the cmap is only consulted when a map is provided as well.
+ *
+ * ASCII bytes are copied to the output unchanged without consulting any map.
+ *
+ * See pg_wchar.h for more details about the data structures used here.
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
+ */
+int
+LocalToUtf(const unsigned char *iso, int len,
+           unsigned char *utf,
+           const pg_mb_radix_tree *map,
+           const pg_local_to_utf_combined *cmap, int cmapsize,
+           utf_local_conversion_func conv_func,
+           int encoding,
+           bool noError)
+{
+    uint32      iiso;
+    int         l;
+    const pg_local_to_utf_combined *cp;
+    const unsigned char *start = iso;
+
+    if (!PG_VALID_ENCODING(encoding))
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("invalid encoding number: %d", encoding)));
+
+    for (; len > 0; len -= l)
+    {
+        unsigned char b1 = 0;
+        unsigned char b2 = 0;
+        unsigned char b3 = 0;
+        unsigned char b4 = 0;
+
+        /* "break" cases all represent errors */
+        if (*iso == '\0')
+            break;
+
+        if (!IS_HIGHBIT_SET(*iso))
+        {
+            /* ASCII case is easy, assume it's one-to-one conversion */
+            *utf++ = *iso++;
+            l = 1;
+            continue;
+        }
+
+        /* a negative result means the character failed verification */
+        l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
+        if (l < 0)
+            break;
+
+        /* collect coded char of length l, right-aligned in b1..b4 */
+        if (l == 1)
+            b4 = *iso++;
+        else if (l == 2)
+        {
+            b3 = *iso++;
+            b4 = *iso++;
+        }
+        else if (l == 3)
+        {
+            b2 = *iso++;
+            b3 = *iso++;
+            b4 = *iso++;
+        }
+        else if (l == 4)
+        {
+            b1 = *iso++;
+            b2 = *iso++;
+            b3 = *iso++;
+            b4 = *iso++;
+        }
+        else
+        {
+            elog(ERROR, "unsupported character length %d", l);
+        }
+
+        /*
+         * Widen each byte to uint32 before shifting.  Shifting the promoted
+         * (signed) int would overflow whenever the lead byte of a 4-byte
+         * character has its top bit set, and that is undefined behavior
+         * in C.
+         */
+        iiso = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
+            ((uint32) b3 << 8) | (uint32) b4;
+
+        if (map)
+        {
+            uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
+
+            if (converted)
+            {
+                utf = store_coded_char(utf, converted);
+                continue;
+            }
+
+            /* If there's a combined character map, try that */
+            if (cmap)
+            {
+                cp = bsearch(&iiso, cmap, cmapsize,
+                             sizeof(pg_local_to_utf_combined), compare4);
+
+                if (cp)
+                {
+                    /* one local code expands to two UTF8 characters */
+                    utf = store_coded_char(utf, cp->utf1);
+                    utf = store_coded_char(utf, cp->utf2);
+                    continue;
+                }
+            }
+        }
+
+        /* if there's a conversion function, try that */
+        if (conv_func)
+        {
+            uint32      converted = (*conv_func) (iiso);
+
+            if (converted)
+            {
+                utf = store_coded_char(utf, converted);
+                continue;
+            }
+        }
+
+        /* failed to translate this character; point back at its start */
+        iso -= l;
+        if (noError)
+            break;
+        report_untranslatable_char(encoding, PG_UTF8,
+                                   (const char *) iso, len);
+    }
+
+    /* if we broke out of loop early, must be invalid input */
+    if (len > 0 && !noError)
+        report_invalid_encoding(encoding, (const char *) iso, len);
+
+    *utf = '\0';
+
+    return iso - start;
+}