/*-------------------------------------------------------------------------
 *
 *	  Utility functions for conversion procs.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mb/conv.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include "mb/pg_wchar.h"


/*
 * local2local: table-driven conversion between two single-byte encodings
 * that are both supersets of ASCII.
 *
 * l is the source string, len bytes long
 * p is the output area (the caller must make it large enough!)
 * src_encoding is the PG identifier of the source encoding
 * dest_encoding is the PG identifier of the target encoding
 * tab is the conversion table for source bytes 128 (0x80) and above; each
 * entry is the matching code point in the target charset, or 0 when the
 * target charset has no equivalent.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
local2local(const unsigned char *l,
			unsigned char *p,
			int len,
			int src_encoding,
			int dest_encoding,
			const unsigned char *tab,
			bool noError)
{
	const unsigned char *start = l;

	for (; len > 0; l++, len--)
	{
		unsigned char in = *l;
		unsigned char out;

		/* a zero byte is rejected as invalid input */
		if (in == 0)
		{
			if (noError)
				break;
			report_invalid_encoding(src_encoding, (const char *) l, len);
		}

		if (IS_HIGHBIT_SET(in))
		{
			/* high-bit bytes go through the table; 0 means "no mapping" */
			out = tab[in - HIGHBIT];
			if (out == 0)
			{
				if (noError)
					break;
				report_untranslatable_char(src_encoding, dest_encoding,
										   (const char *) l, len);
			}
			else
				*p++ = out;
		}
		else
		{
			/* ASCII is copied through unchanged */
			*p++ = in;
		}
	}

	/* always terminate the output, even after an early exit */
	*p = '\0';

	return l - start;
}

/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * Returns the number of input bytes consumed.
If noError is true, this can * be less than 'len'. */ int latin2mic(const unsigned char *l, unsigned char *p, int len, int lc, int encoding, bool noError) { const unsigned char *start = l; int c1; while (len > 0) { c1 = *l; if (c1 == 0) { if (noError) break; report_invalid_encoding(encoding, (const char *) l, len); } if (IS_HIGHBIT_SET(c1)) *p++ = lc; *p++ = c1; l++; len--; } *p = '\0'; return l - start; } /* * MIC ---> LATINn when the charset's local codes map directly to MIC * * mic points to the source string of length len * p is the output area (must be large enough!) * lc is the mule character set id for the local encoding * encoding is the PG identifier for the local encoding * * Returns the number of input bytes consumed. If noError is true, this can * be less than 'len'. */ int mic2latin(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, bool noError) { const unsigned char *start = mic; int c1; while (len > 0) { c1 = *mic; if (c1 == 0) { if (noError) break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); } if (!IS_HIGHBIT_SET(c1)) { /* easy for ASCII */ *p++ = c1; mic++; len--; } else { int l = pg_mule_mblen(mic); if (len < l) { if (noError) break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); } if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1])) { if (noError) break; report_untranslatable_char(PG_MULE_INTERNAL, encoding, (const char *) mic, len); } *p++ = mic[1]; mic += 2; len -= 2; } } *p = '\0'; return mic - start; } /* * latin2mic_with_table: a generic single byte charset encoding * conversion from a local charset to the mule internal code. * * l points to the source string of length len * p is the output area (must be large enough!) * lc is the mule character set id for the local encoding * encoding is the PG identifier for the local encoding * tab holds conversion entries for the local charset * starting from 128 (0x80). 
each entry in the table holds the corresponding
 * code point for the mule encoding, or 0 if there is no equivalent code.
 *
 * Returns the number of input bytes consumed. If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic_with_table(const unsigned char *l,
					 unsigned char *p,
					 int len,
					 int lc,
					 int encoding,
					 const unsigned char *tab,
					 bool noError)
{
	const unsigned char *start = l;

	for (; len > 0; l++, len--)
	{
		unsigned char in = *l;
		unsigned char out;

		/* a zero byte is rejected as invalid input */
		if (in == 0)
		{
			if (noError)
				break;
			report_invalid_encoding(encoding, (const char *) l, len);
		}

		if (IS_HIGHBIT_SET(in))
		{
			/* high-bit bytes go through the table; 0 means "no mapping" */
			out = tab[in - HIGHBIT];
			if (out == 0)
			{
				if (noError)
					break;
				report_untranslatable_char(encoding, PG_MULE_INTERNAL,
										   (const char *) l, len);
			}
			else
			{
				/* emit the charset id followed by the translated code */
				*p++ = lc;
				*p++ = out;
			}
		}
		else
		{
			/* ASCII is copied through unchanged */
			*p++ = in;
		}
	}

	*p = '\0';

	return l - start;
}

/*
 * mic2latin_with_table: a generic single byte charset encoding
 * conversion from the mule internal code to a local charset.
 *
 * mic points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 * tab holds conversion entries for the mule internal code's second byte,
 * starting from 128 (0x80). each entry in the table holds the corresponding
 * code point for the local charset, or 0 if there is no equivalent code.
 *
 * Returns the number of input bytes consumed. If noError is true, this can
 * be less than 'len'.
*/ int mic2latin_with_table(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab, bool noError) { const unsigned char *start = mic; unsigned char c1, c2; while (len > 0) { c1 = *mic; if (c1 == 0) { if (noError) break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); } if (!IS_HIGHBIT_SET(c1)) { /* easy for ASCII */ *p++ = c1; mic++; len--; } else { int l = pg_mule_mblen(mic); if (len < l) { if (noError) break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); } if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || (c2 = tab[mic[1] - HIGHBIT]) == 0) { if (noError) break; report_untranslatable_char(PG_MULE_INTERNAL, encoding, (const char *) mic, len); break; /* keep compiler quiet */ } *p++ = c2; mic += 2; len -= 2; } } *p = '\0'; return mic - start; } /* * comparison routine for bsearch() * this routine is intended for combined UTF8 -> local code */ static int compare3(const void *p1, const void *p2) { uint32 s1, s2, d1, d2; s1 = *(const uint32 *) p1; s2 = *((const uint32 *) p1 + 1); d1 = ((const pg_utf_to_local_combined *) p2)->utf1; d2 = ((const pg_utf_to_local_combined *) p2)->utf2; return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1); } /* * comparison routine for bsearch() * this routine is intended for local code -> combined UTF8 */ static int compare4(const void *p1, const void *p2) { uint32 v1, v2; v1 = *(const uint32 *) p1; v2 = ((const pg_local_to_utf_combined *) p2)->code; return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); } /* * store 32bit character representation into multibyte stream */ static inline unsigned char * store_coded_char(unsigned char *dest, uint32 code) { if (code & 0xff000000) *dest++ = code >> 24; if (code & 0x00ff0000) *dest++ = code >> 16; if (code & 0x0000ff00) *dest++ = code >> 8; if (code & 0x000000ff) *dest++ = code; return dest; } /* * Convert a character using a conversion radix tree. 
* * 'l' is the length of the input character in bytes, and b1-b4 are * the input character's bytes. */ static inline uint32 pg_mb_radix_conv(const pg_mb_radix_tree *rt, int l, unsigned char b1, unsigned char b2, unsigned char b3, unsigned char b4) { if (l == 4) { /* 4-byte code */ /* check code validity */ if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper || b2 < rt->b4_2_lower || b2 > rt->b4_2_upper || b3 < rt->b4_3_lower || b3 > rt->b4_3_upper || b4 < rt->b4_4_lower || b4 > rt->b4_4_upper) return 0; /* perform lookup */ if (rt->chars32) { uint32 idx = rt->b4root; idx = rt->chars32[b1 + idx - rt->b4_1_lower]; idx = rt->chars32[b2 + idx - rt->b4_2_lower]; idx = rt->chars32[b3 + idx - rt->b4_3_lower]; return rt->chars32[b4 + idx - rt->b4_4_lower]; } else { uint16 idx = rt->b4root; idx = rt->chars16[b1 + idx - rt->b4_1_lower]; idx = rt->chars16[b2 + idx - rt->b4_2_lower]; idx = rt->chars16[b3 + idx - rt->b4_3_lower]; return rt->chars16[b4 + idx - rt->b4_4_lower]; } } else if (l == 3) { /* 3-byte code */ /* check code validity */ if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper || b3 < rt->b3_2_lower || b3 > rt->b3_2_upper || b4 < rt->b3_3_lower || b4 > rt->b3_3_upper) return 0; /* perform lookup */ if (rt->chars32) { uint32 idx = rt->b3root; idx = rt->chars32[b2 + idx - rt->b3_1_lower]; idx = rt->chars32[b3 + idx - rt->b3_2_lower]; return rt->chars32[b4 + idx - rt->b3_3_lower]; } else { uint16 idx = rt->b3root; idx = rt->chars16[b2 + idx - rt->b3_1_lower]; idx = rt->chars16[b3 + idx - rt->b3_2_lower]; return rt->chars16[b4 + idx - rt->b3_3_lower]; } } else if (l == 2) { /* 2-byte code */ /* check code validity - first byte */ if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper || b4 < rt->b2_2_lower || b4 > rt->b2_2_upper) return 0; /* perform lookup */ if (rt->chars32) { uint32 idx = rt->b2root; idx = rt->chars32[b3 + idx - rt->b2_1_lower]; return rt->chars32[b4 + idx - rt->b2_2_lower]; } else { uint16 idx = rt->b2root; idx = rt->chars16[b3 + idx - rt->b2_1_lower]; return 
rt->chars16[b4 + idx - rt->b2_2_lower]; } } else if (l == 1) { /* 1-byte code */ /* check code validity - first byte */ if (b4 < rt->b1_lower || b4 > rt->b1_upper) return 0; /* perform lookup */ if (rt->chars32) return rt->chars32[b4 + rt->b1root - rt->b1_lower]; else return rt->chars16[b4 + rt->b1root - rt->b1_lower]; } return 0; /* shouldn't happen */ } /* * UTF8 ---> local code * * utf: input string in UTF8 encoding (need not be null-terminated) * len: length of input string (in bytes) * iso: pointer to the output area (must be large enough!) (output string will be null-terminated) * map: conversion map for single characters * cmap: conversion map for combined characters * (optional, pass NULL if none) * cmapsize: number of entries in the conversion map for combined characters * (optional, pass 0 if none) * conv_func: algorithmic encoding conversion function * (optional, pass NULL if none) * encoding: PG identifier for the local encoding * * For each character, the cmap (if provided) is consulted first; if no match, * the map is consulted next; if still no match, the conv_func (if provided) * is applied. An error is raised if no match is found. * * See pg_wchar.h for more details about the data structures used here. * * Returns the number of input bytes consumed. If noError is true, this can * be less than 'len'. 
*/ int UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding, bool noError) { uint32 iutf; int l; const pg_utf_to_local_combined *cp; const unsigned char *start = utf; if (!PG_VALID_ENCODING(encoding)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid encoding number: %d", encoding))); for (; len > 0; len -= l) { unsigned char b1 = 0; unsigned char b2 = 0; unsigned char b3 = 0; unsigned char b4 = 0; /* "break" cases all represent errors */ if (*utf == '\0') break; l = pg_utf_mblen(utf); if (len < l) break; if (!pg_utf8_islegal(utf, l)) break; if (l == 1) { /* ASCII case is easy, assume it's one-to-one conversion */ *iso++ = *utf++; continue; } /* collect coded char of length l */ if (l == 2) { b3 = *utf++; b4 = *utf++; } else if (l == 3) { b2 = *utf++; b3 = *utf++; b4 = *utf++; } else if (l == 4) { b1 = *utf++; b2 = *utf++; b3 = *utf++; b4 = *utf++; } else { elog(ERROR, "unsupported character length %d", l); iutf = 0; /* keep compiler quiet */ } iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4); /* First, try with combined map if possible */ if (cmap && len > l) { const unsigned char *utf_save = utf; int len_save = len; int l_save = l; /* collect next character, same as above */ len -= l; l = pg_utf_mblen(utf); if (len < l) { /* need more data to decide if this is a combined char */ utf -= l_save; break; } if (!pg_utf8_islegal(utf, l)) { if (!noError) report_invalid_encoding(PG_UTF8, (const char *) utf, len); utf -= l_save; break; } /* We assume ASCII character cannot be in combined map */ if (l > 1) { uint32 iutf2; uint32 cutf[2]; if (l == 2) { iutf2 = *utf++ << 8; iutf2 |= *utf++; } else if (l == 3) { iutf2 = *utf++ << 16; iutf2 |= *utf++ << 8; iutf2 |= *utf++; } else if (l == 4) { iutf2 = *utf++ << 24; iutf2 |= *utf++ << 16; iutf2 |= *utf++ << 8; iutf2 |= *utf++; } else { elog(ERROR, "unsupported 
character length %d", l); iutf2 = 0; /* keep compiler quiet */ } cutf[0] = iutf; cutf[1] = iutf2; cp = bsearch(cutf, cmap, cmapsize, sizeof(pg_utf_to_local_combined), compare3); if (cp) { iso = store_coded_char(iso, cp->code); continue; } } /* fail, so back up to reprocess second character next time */ utf = utf_save; len = len_save; l = l_save; } /* Now check ordinary map */ if (map) { uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); if (converted) { iso = store_coded_char(iso, converted); continue; } } /* if there's a conversion function, try that */ if (conv_func) { uint32 converted = (*conv_func) (iutf); if (converted) { iso = store_coded_char(iso, converted); continue; } } /* failed to translate this character */ utf -= l; if (noError) break; report_untranslatable_char(PG_UTF8, encoding, (const char *) utf, len); } /* if we broke out of loop early, must be invalid input */ if (len > 0 && !noError) report_invalid_encoding(PG_UTF8, (const char *) utf, len); *iso = '\0'; return utf - start; } /* * local code ---> UTF8 * * iso: input string in local encoding (need not be null-terminated) * len: length of input string (in bytes) * utf: pointer to the output area (must be large enough!) (output string will be null-terminated) * map: conversion map for single characters * cmap: conversion map for combined characters * (optional, pass NULL if none) * cmapsize: number of entries in the conversion map for combined characters * (optional, pass 0 if none) * conv_func: algorithmic encoding conversion function * (optional, pass NULL if none) * encoding: PG identifier for the local encoding * * For each character, the map is consulted first; if no match, the cmap * (if provided) is consulted next; if still no match, the conv_func * (if provided) is applied. An error is raised if no match is found. * * See pg_wchar.h for more details about the data structures used here. * * Returns the number of input bytes consumed. 
If noError is true, this can
 * be less than 'len'.
 */
int
LocalToUtf(const unsigned char *iso, int len,
		   unsigned char *utf,
		   const pg_mb_radix_tree *map,
		   const pg_local_to_utf_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding, bool noError)
{
	uint32		iiso;
	int			l;
	const pg_local_to_utf_combined *cp;
	const unsigned char *start = iso;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	/*
	 * Each iteration handles one input character of 'l' bytes; 'len' is
	 * reduced by 'l' in the loop header, so every "continue" below must
	 * leave 'l' set to the length of the character just consumed.
	 */
	for (; len > 0; len -= l)
	{
		unsigned char b1 = 0;
		unsigned char b2 = 0;
		unsigned char b3 = 0;
		unsigned char b4 = 0;

		/* "break" cases all represent errors */
		if (*iso == '\0')
			break;

		if (!IS_HIGHBIT_SET(*iso))
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*utf++ = *iso++;
			l = 1;
			continue;
		}

		/* a negative result means the character is not valid in 'encoding' */
		l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
		if (l < 0)
			break;

		/* collect coded char of length l, right-aligned in b1..b4 */
		if (l == 1)
			b4 = *iso++;
		else if (l == 2)
		{
			b3 = *iso++;
			b4 = *iso++;
		}
		else if (l == 3)
		{
			b2 = *iso++;
			b3 = *iso++;
			b4 = *iso++;
		}
		else if (l == 4)
		{
			b1 = *iso++;
			b2 = *iso++;
			b3 = *iso++;
			b4 = *iso++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iiso = 0;			/* keep compiler quiet */
		}

		/*
		 * Widen to uint32 before shifting.  An unsigned char promotes to
		 * (signed) int, and shifting a value >= 0x80 left by 24 bits is not
		 * representable in int, which is undefined behavior.
		 */
		iiso = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
			((uint32) b3 << 8) | (uint32) b4;

		if (map)
		{
			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

			if (converted)
			{
				utf = store_coded_char(utf, converted);
				continue;
			}

			/* If there's a combined character map, try that */
			if (cmap)
			{
				cp = bsearch(&iiso, cmap, cmapsize,
							 sizeof(pg_local_to_utf_combined), compare4);

				if (cp)
				{
					/* one local code expands to two UTF8 characters */
					utf = store_coded_char(utf, cp->utf1);
					utf = store_coded_char(utf, cp->utf2);
					continue;
				}
			}
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iiso);

			if (converted)
			{
				utf = store_coded_char(utf, converted);
				continue;
			}
		}

		/* failed to translate this character; point back at its start */
		iso -= l;
		if (noError)
			break;
		report_untranslatable_char(encoding, PG_UTF8,
								   (const char *) iso, len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0 && !noError)
		report_invalid_encoding(encoding, (const char *) iso, len);

	*utf = '\0';

	return iso - start;
}