summaryrefslogtreecommitdiffstats
path: root/src/backend/utils/adt/encode.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/adt/encode.c
parentInitial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/adt/encode.c')
-rw-r--r--src/backend/utils/adt/encode.c602
1 files changed, 602 insertions, 0 deletions
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
new file mode 100644
index 0000000..6dd93f9
--- /dev/null
+++ b/src/backend/utils/adt/encode.c
@@ -0,0 +1,602 @@
+/*-------------------------------------------------------------------------
+ *
+ * encode.c
+ * Various data encoding/decoding things.
+ *
+ * Copyright (c) 2001-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/encode.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Encoding conversion API.
+ * encode_len() and decode_len() compute the amount of space needed, while
+ * encode() and decode() perform the actual conversions. It is okay for
+ * the _len functions to return an overestimate, but not an underestimate.
+ * (Having said that, large overestimates could cause unnecessary errors,
+ * so it's better to get it right.) The conversion routines write to the
+ * buffer at *res and return the true length of their output.
+ */
+struct pg_encoding
+{
+ uint64 (*encode_len) (const char *data, size_t dlen);
+ uint64 (*decode_len) (const char *data, size_t dlen);
+ uint64 (*encode) (const char *data, size_t dlen, char *res);
+ uint64 (*decode) (const char *data, size_t dlen, char *res);
+};
+
+static const struct pg_encoding *pg_find_encoding(const char *name);
+
+/*
+ * SQL functions.
+ */
+
+Datum
+binary_encode(PG_FUNCTION_ARGS)
+{
+ bytea *data = PG_GETARG_BYTEA_PP(0);
+ Datum name = PG_GETARG_DATUM(1);
+ text *result;
+ char *namebuf;
+ char *dataptr;
+ size_t datalen;
+ uint64 resultlen;
+ uint64 res;
+ const struct pg_encoding *enc;
+
+ namebuf = TextDatumGetCString(name);
+
+ enc = pg_find_encoding(namebuf);
+ if (enc == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized encoding: \"%s\"", namebuf)));
+
+ dataptr = VARDATA_ANY(data);
+ datalen = VARSIZE_ANY_EXHDR(data);
+
+ resultlen = enc->encode_len(dataptr, datalen);
+
+ /*
+ * resultlen possibly overflows uint32, therefore on 32-bit machines it's
+ * unsafe to rely on palloc's internal check.
+ */
+ if (resultlen > MaxAllocSize - VARHDRSZ)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("result of encoding conversion is too large")));
+
+ result = palloc(VARHDRSZ + resultlen);
+
+ res = enc->encode(dataptr, datalen, VARDATA(result));
+
+ /* Make this FATAL 'cause we've trodden on memory ... */
+ if (res > resultlen)
+ elog(FATAL, "overflow - encode estimate too small");
+
+ SET_VARSIZE(result, VARHDRSZ + res);
+
+ PG_RETURN_TEXT_P(result);
+}
+
+Datum
+binary_decode(PG_FUNCTION_ARGS)
+{
+ text *data = PG_GETARG_TEXT_PP(0);
+ Datum name = PG_GETARG_DATUM(1);
+ bytea *result;
+ char *namebuf;
+ char *dataptr;
+ size_t datalen;
+ uint64 resultlen;
+ uint64 res;
+ const struct pg_encoding *enc;
+
+ namebuf = TextDatumGetCString(name);
+
+ enc = pg_find_encoding(namebuf);
+ if (enc == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unrecognized encoding: \"%s\"", namebuf)));
+
+ dataptr = VARDATA_ANY(data);
+ datalen = VARSIZE_ANY_EXHDR(data);
+
+ resultlen = enc->decode_len(dataptr, datalen);
+
+ /*
+ * resultlen possibly overflows uint32, therefore on 32-bit machines it's
+ * unsafe to rely on palloc's internal check.
+ */
+ if (resultlen > MaxAllocSize - VARHDRSZ)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("result of decoding conversion is too large")));
+
+ result = palloc(VARHDRSZ + resultlen);
+
+ res = enc->decode(dataptr, datalen, VARDATA(result));
+
+ /* Make this FATAL 'cause we've trodden on memory ... */
+ if (res > resultlen)
+ elog(FATAL, "overflow - decode estimate too small");
+
+ SET_VARSIZE(result, VARHDRSZ + res);
+
+ PG_RETURN_BYTEA_P(result);
+}
+
+
+/*
+ * HEX
+ */
+
+static const char hextbl[] = "0123456789abcdef";
+
+static const int8 hexlookup[128] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+uint64
+hex_encode(const char *src, size_t len, char *dst)
+{
+ const char *end = src + len;
+
+ while (src < end)
+ {
+ *dst++ = hextbl[(*src >> 4) & 0xF];
+ *dst++ = hextbl[*src & 0xF];
+ src++;
+ }
+ return (uint64) len * 2;
+}
+
+static inline char
+get_hex(const char *cp)
+{
+ unsigned char c = (unsigned char) *cp;
+ int res = -1;
+
+ if (c < 127)
+ res = hexlookup[c];
+
+ if (res < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid hexadecimal digit: \"%.*s\"",
+ pg_mblen(cp), cp)));
+
+ return (char) res;
+}
+
+uint64
+hex_decode(const char *src, size_t len, char *dst)
+{
+ const char *s,
+ *srcend;
+ char v1,
+ v2,
+ *p;
+
+ srcend = src + len;
+ s = src;
+ p = dst;
+ while (s < srcend)
+ {
+ if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r')
+ {
+ s++;
+ continue;
+ }
+ v1 = get_hex(s) << 4;
+ s++;
+ if (s >= srcend)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid hexadecimal data: odd number of digits")));
+
+ v2 = get_hex(s);
+ s++;
+ *p++ = v1 | v2;
+ }
+
+ return p - dst;
+}
+
+static uint64
+hex_enc_len(const char *src, size_t srclen)
+{
+ return (uint64) srclen << 1;
+}
+
+static uint64
+hex_dec_len(const char *src, size_t srclen)
+{
+ return (uint64) srclen >> 1;
+}
+
+/*
+ * BASE64
+ */
+
+static const char _base64[] =
+"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static const int8 b64lookup[128] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
+ -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
+ -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
+};
+
+static uint64
+pg_base64_encode(const char *src, size_t len, char *dst)
+{
+ char *p,
+ *lend = dst + 76;
+ const char *s,
+ *end = src + len;
+ int pos = 2;
+ uint32 buf = 0;
+
+ s = src;
+ p = dst;
+
+ while (s < end)
+ {
+ buf |= (unsigned char) *s << (pos << 3);
+ pos--;
+ s++;
+
+ /* write it out */
+ if (pos < 0)
+ {
+ *p++ = _base64[(buf >> 18) & 0x3f];
+ *p++ = _base64[(buf >> 12) & 0x3f];
+ *p++ = _base64[(buf >> 6) & 0x3f];
+ *p++ = _base64[buf & 0x3f];
+
+ pos = 2;
+ buf = 0;
+ }
+ if (p >= lend)
+ {
+ *p++ = '\n';
+ lend = p + 76;
+ }
+ }
+ if (pos != 2)
+ {
+ *p++ = _base64[(buf >> 18) & 0x3f];
+ *p++ = _base64[(buf >> 12) & 0x3f];
+ *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
+ *p++ = '=';
+ }
+
+ return p - dst;
+}
+
+static uint64
+pg_base64_decode(const char *src, size_t len, char *dst)
+{
+ const char *srcend = src + len,
+ *s = src;
+ char *p = dst;
+ char c;
+ int b = 0;
+ uint32 buf = 0;
+ int pos = 0,
+ end = 0;
+
+ while (s < srcend)
+ {
+ c = *s++;
+
+ if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+ continue;
+
+ if (c == '=')
+ {
+ /* end sequence */
+ if (!end)
+ {
+ if (pos == 2)
+ end = 1;
+ else if (pos == 3)
+ end = 2;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("unexpected \"=\" while decoding base64 sequence")));
+ }
+ b = 0;
+ }
+ else
+ {
+ b = -1;
+ if (c > 0 && c < 127)
+ b = b64lookup[(unsigned char) c];
+ if (b < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
+ pg_mblen(s - 1), s - 1)));
+ }
+ /* add it to buffer */
+ buf = (buf << 6) + b;
+ pos++;
+ if (pos == 4)
+ {
+ *p++ = (buf >> 16) & 255;
+ if (end == 0 || end > 1)
+ *p++ = (buf >> 8) & 255;
+ if (end == 0 || end > 2)
+ *p++ = buf & 255;
+ buf = 0;
+ pos = 0;
+ }
+ }
+
+ if (pos != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid base64 end sequence"),
+ errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
+
+ return p - dst;
+}
+
+
+static uint64
+pg_base64_enc_len(const char *src, size_t srclen)
+{
+ /* 3 bytes will be converted to 4, linefeed after 76 chars */
+ return ((uint64) srclen + 2) * 4 / 3 + (uint64) srclen / (76 * 3 / 4);
+}
+
+static uint64
+pg_base64_dec_len(const char *src, size_t srclen)
+{
+ return ((uint64) srclen * 3) >> 2;
+}
+
+/*
+ * Escape
+ * Minimally escape bytea to text.
+ * De-escape text to bytea.
+ *
+ * We must escape zero bytes and high-bit-set bytes to avoid generating
+ * text that might be invalid in the current encoding, or that might
+ * change to something else if passed through an encoding conversion
+ * (leading to failing to de-escape to the original bytea value).
+ * Also of course backslash itself has to be escaped.
+ *
+ * De-escaping processes \\ and any \### octal
+ */
+
+#define VAL(CH) ((CH) - '0')
+#define DIG(VAL) ((VAL) + '0')
+
+static uint64
+esc_encode(const char *src, size_t srclen, char *dst)
+{
+ const char *end = src + srclen;
+ char *rp = dst;
+ uint64 len = 0;
+
+ while (src < end)
+ {
+ unsigned char c = (unsigned char) *src;
+
+ if (c == '\0' || IS_HIGHBIT_SET(c))
+ {
+ rp[0] = '\\';
+ rp[1] = DIG(c >> 6);
+ rp[2] = DIG((c >> 3) & 7);
+ rp[3] = DIG(c & 7);
+ rp += 4;
+ len += 4;
+ }
+ else if (c == '\\')
+ {
+ rp[0] = '\\';
+ rp[1] = '\\';
+ rp += 2;
+ len += 2;
+ }
+ else
+ {
+ *rp++ = c;
+ len++;
+ }
+
+ src++;
+ }
+
+ return len;
+}
+
+static uint64
+esc_decode(const char *src, size_t srclen, char *dst)
+{
+ const char *end = src + srclen;
+ char *rp = dst;
+ uint64 len = 0;
+
+ while (src < end)
+ {
+ if (src[0] != '\\')
+ *rp++ = *src++;
+ else if (src + 3 < end &&
+ (src[1] >= '0' && src[1] <= '3') &&
+ (src[2] >= '0' && src[2] <= '7') &&
+ (src[3] >= '0' && src[3] <= '7'))
+ {
+ int val;
+
+ val = VAL(src[1]);
+ val <<= 3;
+ val += VAL(src[2]);
+ val <<= 3;
+ *rp++ = val + VAL(src[3]);
+ src += 4;
+ }
+ else if (src + 1 < end &&
+ (src[1] == '\\'))
+ {
+ *rp++ = '\\';
+ src += 2;
+ }
+ else
+ {
+ /*
+ * One backslash, not followed by ### valid octal. Should never
+ * get here, since esc_dec_len does same check.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+ errmsg("invalid input syntax for type %s", "bytea")));
+ }
+
+ len++;
+ }
+
+ return len;
+}
+
+static uint64
+esc_enc_len(const char *src, size_t srclen)
+{
+ const char *end = src + srclen;
+ uint64 len = 0;
+
+ while (src < end)
+ {
+ if (*src == '\0' || IS_HIGHBIT_SET(*src))
+ len += 4;
+ else if (*src == '\\')
+ len += 2;
+ else
+ len++;
+
+ src++;
+ }
+
+ return len;
+}
+
+static uint64
+esc_dec_len(const char *src, size_t srclen)
+{
+ const char *end = src + srclen;
+ uint64 len = 0;
+
+ while (src < end)
+ {
+ if (src[0] != '\\')
+ src++;
+ else if (src + 3 < end &&
+ (src[1] >= '0' && src[1] <= '3') &&
+ (src[2] >= '0' && src[2] <= '7') &&
+ (src[3] >= '0' && src[3] <= '7'))
+ {
+ /*
+ * backslash + valid octal
+ */
+ src += 4;
+ }
+ else if (src + 1 < end &&
+ (src[1] == '\\'))
+ {
+ /*
+ * two backslashes = backslash
+ */
+ src += 2;
+ }
+ else
+ {
+ /*
+ * one backslash, not followed by ### valid octal
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+ errmsg("invalid input syntax for type %s", "bytea")));
+ }
+
+ len++;
+ }
+ return len;
+}
+
+/*
+ * Common
+ */
+
+static const struct
+{
+ const char *name;
+ struct pg_encoding enc;
+} enclist[] =
+
+{
+ {
+ "hex",
+ {
+ hex_enc_len, hex_dec_len, hex_encode, hex_decode
+ }
+ },
+ {
+ "base64",
+ {
+ pg_base64_enc_len, pg_base64_dec_len, pg_base64_encode, pg_base64_decode
+ }
+ },
+ {
+ "escape",
+ {
+ esc_enc_len, esc_dec_len, esc_encode, esc_decode
+ }
+ },
+ {
+ NULL,
+ {
+ NULL, NULL, NULL, NULL
+ }
+ }
+};
+
+static const struct pg_encoding *
+pg_find_encoding(const char *name)
+{
+ int i;
+
+ for (i = 0; enclist[i].name; i++)
+ if (pg_strcasecmp(enclist[i].name, name) == 0)
+ return &enclist[i].enc;
+
+ return NULL;
+}