diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/adt/encode.c | |
parent | Initial commit. (diff) | |
download | postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip |
Adding upstream version 14.5. [refs: upstream/14.5, upstream]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/adt/encode.c')
-rw-r--r-- | src/backend/utils/adt/encode.c | 602 |
1 files changed, 602 insertions, 0 deletions
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
new file mode 100644
index 0000000..6dd93f9
--- /dev/null
+++ b/src/backend/utils/adt/encode.c
@@ -0,0 +1,602 @@
+/*-------------------------------------------------------------------------
+ *
+ * encode.c
+ *    Various data encoding/decoding things.
+ *
+ * Copyright (c) 2001-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *    src/backend/utils/adt/encode.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Encoding conversion API.
+ * encode_len() and decode_len() compute the amount of space needed, while
+ * encode() and decode() perform the actual conversions.  It is okay for
+ * the _len functions to return an overestimate, but not an underestimate.
+ * (Having said that, large overestimates could cause unnecessary errors,
+ * so it's better to get it right.)  The conversion routines write to the
+ * buffer at *res and return the true length of their output.
+ *
+ * All four callbacks receive the input as a pointer plus an explicit byte
+ * count (data, dlen); none of the implementations in this file looks for
+ * a terminating NUL.
+ */
+struct pg_encoding
+{
+    uint64      (*encode_len) (const char *data, size_t dlen);  /* size bound for encode() */
+    uint64      (*decode_len) (const char *data, size_t dlen);  /* size bound for decode() */
+    uint64      (*encode) (const char *data, size_t dlen, char *res);   /* returns bytes written */
+    uint64      (*decode) (const char *data, size_t dlen, char *res);   /* returns bytes written */
+};
+
+static const struct pg_encoding *pg_find_encoding(const char *name);
+
+/*
+ * SQL functions.
+ *
+ * binary_encode() takes a bytea value and an encoding name and returns
+ * text; binary_decode() takes text and an encoding name and returns bytea.
+ * Both resolve the name with pg_find_encoding() and report
+ * ERRCODE_INVALID_PARAMETER_VALUE when it is not listed in enclist[].
+ *
+ * The result buffer is allocated from the codec's *_len estimate before
+ * the conversion runs, which is why an estimate that turns out too small
+ * can only be noticed after the buffer has already been overrun.
+ */
+
+Datum
+binary_encode(PG_FUNCTION_ARGS)
+{
+    bytea      *data = PG_GETARG_BYTEA_PP(0);
+    Datum       name = PG_GETARG_DATUM(1);
+    text       *result;
+    char       *namebuf;
+    char       *dataptr;
+    size_t      datalen;
+    uint64      resultlen;
+    uint64      res;
+    const struct pg_encoding *enc;
+
+    namebuf = TextDatumGetCString(name);
+
+    enc = pg_find_encoding(namebuf);
+    if (enc == NULL)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("unrecognized encoding: \"%s\"", namebuf)));
+
+    dataptr = VARDATA_ANY(data);
+    datalen = VARSIZE_ANY_EXHDR(data);
+
+    resultlen = enc->encode_len(dataptr, datalen);  /* upper bound, not exact */
+
+    /*
+     * resultlen possibly overflows uint32, therefore on 32-bit machines it's
+     * unsafe to rely on palloc's internal check.
+     */
+    if (resultlen > MaxAllocSize - VARHDRSZ)
+        ereport(ERROR,
+                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                 errmsg("result of encoding conversion is too large")));
+
+    result = palloc(VARHDRSZ + resultlen);
+
+    res = enc->encode(dataptr, datalen, VARDATA(result));
+
+    /* Make this FATAL 'cause we've trodden on memory ... */
+    if (res > resultlen)
+        elog(FATAL, "overflow - encode estimate too small");
+
+    SET_VARSIZE(result, VARHDRSZ + res);    /* record the length actually produced */
+
+    PG_RETURN_TEXT_P(result);
+}
+
+Datum
+binary_decode(PG_FUNCTION_ARGS)
+{
+    text       *data = PG_GETARG_TEXT_PP(0);
+    Datum       name = PG_GETARG_DATUM(1);
+    bytea      *result;
+    char       *namebuf;
+    char       *dataptr;
+    size_t      datalen;
+    uint64      resultlen;
+    uint64      res;
+    const struct pg_encoding *enc;
+
+    namebuf = TextDatumGetCString(name);
+
+    enc = pg_find_encoding(namebuf);
+    if (enc == NULL)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("unrecognized encoding: \"%s\"", namebuf)));
+
+    dataptr = VARDATA_ANY(data);
+    datalen = VARSIZE_ANY_EXHDR(data);
+
+    resultlen = enc->decode_len(dataptr, datalen);  /* esc_dec_len may raise a syntax error here */
+
+    /*
+     * resultlen possibly overflows uint32, therefore on 32-bit machines it's
+     * unsafe to rely on palloc's internal check.
+     */
+    if (resultlen > MaxAllocSize - VARHDRSZ)
+        ereport(ERROR,
+                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                 errmsg("result of decoding conversion is too large")));
+
+    result = palloc(VARHDRSZ + resultlen);
+
+    res = enc->decode(dataptr, datalen, VARDATA(result));
+
+    /* Make this FATAL 'cause we've trodden on memory ... */
+    if (res > resultlen)
+        elog(FATAL, "overflow - decode estimate too small");
+
+    SET_VARSIZE(result, VARHDRSZ + res);    /* record the length actually produced */
+
+    PG_RETURN_BYTEA_P(result);
+}
+
+
+/*
+ * HEX
+ *
+ * hex_encode() writes two lowercase digits per input byte and returns
+ * 2 * len.  hex_decode() accepts upper- and lowercase digits and skips
+ * space, newline, tab and carriage return, but only in front of a digit
+ * pair: whitespace between the two digits of one byte is reported as an
+ * invalid digit.  An input that ends after the first digit of a pair is
+ * reported as having an odd number of digits.
+ *
+ * hex_encode() and hex_decode() are not static, unlike the other codec
+ * routines in this file (presumably they are declared in a header for use
+ * elsewhere -- confirm against utils/builtins.h).
+ */
+
+static const char hextbl[] = "0123456789abcdef";
+
+static const int8 hexlookup[128] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
+    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+uint64
+hex_encode(const char *src, size_t len, char *dst)
+{
+    const char *end = src + len;
+
+    while (src < end)
+    {
+        *dst++ = hextbl[(*src >> 4) & 0xF]; /* high nibble; the mask discards any sign extension */
+        *dst++ = hextbl[*src & 0xF];    /* low nibble */
+        src++;
+    }
+    return (uint64) len * 2;
+}
+
+static inline char
+get_hex(const char *cp)
+{
+    unsigned char c = (unsigned char) *cp;
+    int         res = -1;
+
+    if (c < 127)                /* index 127 is never looked up; its table slot is -1 anyway */
+        res = hexlookup[c];
+
+    if (res < 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("invalid hexadecimal digit: \"%.*s\"",
+                        pg_mblen(cp), cp)));    /* print the whole multibyte character */
+
+    return (char) res;          /* 0..15 */
+}
+
+uint64
+hex_decode(const char *src, size_t len, char *dst)
+{
+    const char *s,
+               *srcend;
+    char        v1,
+                v2,
+               *p;
+
+    srcend = src + len;
+    s = src;
+    p = dst;
+    while (s < srcend)
+    {
+        if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r')
+        {
+            s++;
+            continue;
+        }
+        v1 = get_hex(s) << 4;
+        s++;
+        if (s >= srcend)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("invalid hexadecimal data: odd number of digits")));
+
+        v2 = get_hex(s);        /* no whitespace skipping between the two digits */
+        s++;
+        *p++ = v1 | v2;
+    }
+
+    return p - dst;
+}
+
+static uint64
+hex_enc_len(const char *src, size_t srclen)
+{
+    return (uint64) srclen << 1;    /* exact: two digits per byte */
+}
+
+static uint64
+hex_dec_len(const char *src, size_t srclen)
+{
+    return (uint64) srclen >> 1;    /* upper bound: skipped whitespace only shrinks the output */
+}
+
+/*
+ * BASE64
+ *
+ * pg_base64_encode() packs three input bytes into a 24-bit group and
+ * emits four alphabet characters per group; a trailing group of one or two
+ * bytes is padded with '='.  A newline is written whenever the output
+ * reaches 76 characters past the start of the current line.
+ *
+ * pg_base64_decode() skips space, tab, newline and carriage return and
+ * rejects any other byte that is not in the alphabet.  The first '=' must
+ * be the third or fourth character of a group, and input that stops in
+ * the middle of a group is an error.
+ *
+ * NOTE(review): "end" is set by the first '=' and never reset, so the
+ * position check is skipped for every later '=', and groups that follow
+ * the padding are still decoded while emitting only as many bytes as the
+ * first padded group did -- confirm whether that leniency is intended.
+ */
+
+static const char _base64[] =
+"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static const int8 b64lookup[128] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
+    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
+    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
+    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
+};
+
+static uint64
+pg_base64_encode(const char *src, size_t len, char *dst)
+{
+    char       *p,
+               *lend = dst + 76;
+    const char *s,
+               *end = src + len;
+    int         pos = 2;        /* byte slot in the 24-bit group: 2, 1, 0 */
+    uint32      buf = 0;
+
+    s = src;
+    p = dst;
+
+    while (s < end)
+    {
+        buf |= (unsigned char) *s << (pos << 3);    /* shift by 16, 8 or 0 bits */
+        pos--;
+        s++;
+
+        /* write it out */
+        if (pos < 0)
+        {
+            *p++ = _base64[(buf >> 18) & 0x3f];
+            *p++ = _base64[(buf >> 12) & 0x3f];
+            *p++ = _base64[(buf >> 6) & 0x3f];
+            *p++ = _base64[buf & 0x3f];
+
+            pos = 2;
+            buf = 0;
+        }
+        if (p >= lend)
+        {
+            *p++ = '\n';
+            lend = p + 76;      /* next line break is 76 characters after this newline */
+        }
+    }
+    if (pos != 2)               /* one (pos == 1) or two (pos == 0) bytes left over */
+    {
+        *p++ = _base64[(buf >> 18) & 0x3f];
+        *p++ = _base64[(buf >> 12) & 0x3f];
+        *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
+        *p++ = '=';
+    }
+
+    return p - dst;
+}
+
+static uint64
+pg_base64_decode(const char *src, size_t len, char *dst)
+{
+    const char *srcend = src + len,
+               *s = src;
+    char       *p = dst;
+    char        c;
+    int         b = 0;
+    uint32      buf = 0;
+    int         pos = 0,        /* characters collected for the current group */
+                end = 0;        /* 0 = no padding seen, else data bytes in the padded group */
+
+    while (s < srcend)
+    {
+        c = *s++;
+
+        if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+            continue;
+
+        if (c == '=')
+        {
+            /* end sequence */
+            if (!end)
+            {
+                if (pos == 2)
+                    end = 1;
+                else if (pos == 3)
+                    end = 2;
+                else
+                    ereport(ERROR,
+                            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                             errmsg("unexpected \"=\" while decoding base64 sequence")));
+            }
+            b = 0;              /* padding contributes zero bits */
+        }
+        else
+        {
+            b = -1;
+            if (c > 0 && c < 127)   /* rejects NUL, DEL and high-bit bytes whether char is signed or not */
+                b = b64lookup[(unsigned char) c];
+            if (b < 0)
+                ereport(ERROR,
+                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                         errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
+                                pg_mblen(s - 1), s - 1)));
+        }
+        /* add it to buffer */
+        buf = (buf << 6) + b;
+        pos++;
+        if (pos == 4)
+        {
+            *p++ = (buf >> 16) & 255;
+            if (end == 0 || end > 1)
+                *p++ = (buf >> 8) & 255;
+            if (end == 0 || end > 2)    /* end is only ever 0, 1 or 2, so this means end == 0 */
+                *p++ = buf & 255;
+            buf = 0;
+            pos = 0;
+        }
+    }
+
+    if (pos != 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("invalid base64 end sequence"),
+                 errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
+
+    return p - dst;
+}
+
+
+static uint64
+pg_base64_enc_len(const char *src, size_t srclen)
+{
+    /* 3 bytes will be converted to 4, linefeed after 76 chars */
+    return ((uint64) srclen + 2) * 4 / 3 + (uint64) srclen / (76 * 3 / 4);
+}
+
+static uint64
+pg_base64_dec_len(const char *src, size_t srclen)
+{
+    return ((uint64) srclen * 3) >> 2;  /* at most 3 output bytes per 4 input characters */
+}
+
+/*
+ * Escape
+ *    Minimally escape bytea to text.
+ *    De-escape text to bytea.
+ *
+ * We must escape zero bytes and high-bit-set bytes to avoid generating
+ * text that might be invalid in the current encoding, or that might
+ * change to something else if passed through an encoding conversion
+ * (leading to failing to de-escape to the original bytea value).
+ * Also of course backslash itself has to be escaped.
+ *
+ * De-escaping processes \\ and any \### octal
+ *
+ * The first octal digit is limited to 0-3, so a decoded value always fits
+ * in one byte.  esc_dec_len() applies the same syntax checks as
+ * esc_decode(); when decoding goes through binary_decode(), which calls
+ * the length function first, malformed input is therefore rejected before
+ * esc_decode() runs.
+ */
+
+#define VAL(CH)         ((CH) - '0')
+#define DIG(VAL)        ((VAL) + '0')
+
+static uint64
+esc_encode(const char *src, size_t srclen, char *dst)
+{
+    const char *end = src + srclen;
+    char       *rp = dst;
+    uint64      len = 0;
+
+    while (src < end)
+    {
+        unsigned char c = (unsigned char) *src;
+
+        if (c == '\0' || IS_HIGHBIT_SET(c))
+        {
+            rp[0] = '\\';
+            rp[1] = DIG(c >> 6);    /* three octal digits, most significant first */
+            rp[2] = DIG((c >> 3) & 7);
+            rp[3] = DIG(c & 7);
+            rp += 4;
+            len += 4;
+        }
+        else if (c == '\\')
+        {
+            rp[0] = '\\';
+            rp[1] = '\\';
+            rp += 2;
+            len += 2;
+        }
+        else
+        {
+            *rp++ = c;
+            len++;
+        }
+
+        src++;
+    }
+
+    return len;
+}
+
+static uint64
+esc_decode(const char *src, size_t srclen, char *dst)
+{
+    const char *end = src + srclen;
+    char       *rp = dst;
+    uint64      len = 0;
+
+    while (src < end)
+    {
+        if (src[0] != '\\')
+            *rp++ = *src++;
+        else if (src + 3 < end &&   /* src[1..3] must all lie inside the input */
+                 (src[1] >= '0' && src[1] <= '3') &&
+                 (src[2] >= '0' && src[2] <= '7') &&
+                 (src[3] >= '0' && src[3] <= '7'))
+        {
+            int         val;
+
+            val = VAL(src[1]);
+            val <<= 3;
+            val += VAL(src[2]);
+            val <<= 3;
+            *rp++ = val + VAL(src[3]);
+            src += 4;
+        }
+        else if (src + 1 < end &&
+                 (src[1] == '\\'))
+        {
+            *rp++ = '\\';
+            src += 2;
+        }
+        else
+        {
+            /*
+             * One backslash, not followed by ### valid octal. Should never
+             * get here, since esc_dec_len does same check.
+             */
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                     errmsg("invalid input syntax for type %s", "bytea")));
+        }
+
+        len++;                  /* every accepted unit yields exactly one output byte */
+    }
+
+    return len;
+}
+
+static uint64
+esc_enc_len(const char *src, size_t srclen)
+{
+    const char *end = src + srclen;
+    uint64      len = 0;
+
+    while (src < end)
+    {
+        if (*src == '\0' || IS_HIGHBIT_SET(*src))
+            len += 4;           /* backslash plus three octal digits */
+        else if (*src == '\\')
+            len += 2;           /* doubled backslash */
+        else
+            len++;
+
+        src++;
+    }
+
+    return len;
+}
+
+static uint64
+esc_dec_len(const char *src, size_t srclen)
+{
+    const char *end = src + srclen;
+    uint64      len = 0;
+
+    while (src < end)
+    {
+        if (src[0] != '\\')
+            src++;
+        else if (src + 3 < end &&
+                 (src[1] >= '0' && src[1] <= '3') &&
+                 (src[2] >= '0' && src[2] <= '7') &&
+                 (src[3] >= '0' && src[3] <= '7'))
+        {
+            /*
+             * backslash + valid octal
+             */
+            src += 4;
+        }
+        else if (src + 1 < end &&
+                 (src[1] == '\\'))
+        {
+            /*
+             * two backslashes = backslash
+             */
+            src += 2;
+        }
+        else
+        {
+            /*
+             * one backslash, not followed by ### valid octal
+             */
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+                     errmsg("invalid input syntax for type %s", "bytea")));
+        }
+
+        len++;
+    }
+    return len;
+}
+
+/*
+ * Common
+ *
+ * enclist[] maps an encoding name to its four callbacks, listed in the
+ * member order of struct pg_encoding (encode_len, decode_len, encode,
+ * decode), and ends with an all-NULL sentinel entry.  pg_find_encoding()
+ * scans it with a case-insensitive comparison (pg_strcasecmp) and returns
+ * NULL when no name matches.
+ */
+
+static const struct
+{
+    const char *name;
+    struct pg_encoding enc;
+}           enclist[] =
+
+{
+    {
+        "hex",
+        {
+            hex_enc_len, hex_dec_len, hex_encode, hex_decode
+        }
+    },
+    {
+        "base64",
+        {
+            pg_base64_enc_len, pg_base64_dec_len, pg_base64_encode, pg_base64_decode
+        }
+    },
+    {
+        "escape",
+        {
+            esc_enc_len, esc_dec_len, esc_encode, esc_decode
+        }
+    },
+    {
+        NULL,
+        {
+            NULL, NULL, NULL, NULL
+        }
+    }
+};
+
+static const struct pg_encoding *
+pg_find_encoding(const char *name)
+{
+    int         i;
+
+    for (i = 0; enclist[i].name; i++)   /* stop at the NULL-name sentinel */
+        if (pg_strcasecmp(enclist[i].name, name) == 0)
+            return &enclist[i].enc;
+
+    return NULL;
+}
|