Adding upstream version 14.5. (refs: upstream/14.5, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
commit: 46651ce6fe013220ed397add242004d764fc0153 (patch)
tree: 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/adt/encode.c
parent: Initial commit. (diff)
download: postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
1 files changed, 602 insertions, 0 deletions
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
new file mode 100644
index 0000000..6dd93f9
--- /dev/null
+++ b/src/backend/utils/adt/encode.c
@@ -0,0 +1,602 @@
+/*-------------------------------------------------------------------------
+ *
+ * encode.c
+ *	  Various data encoding/decoding things.
+ *
+ * Copyright (c) 2001-2021, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/encode.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Encoding conversion API.
+ * encode_len() and decode_len() compute the amount of space needed, while
+ * encode() and decode() perform the actual conversions.  It is okay for
+ * the _len functions to return an overestimate, but not an underestimate.
+ * (Having said that, large overestimates could cause unnecessary errors,
+ * so it's better to get it right.)  The conversion routines write to the
+ * buffer at *res and return the true length of their output.
+ */
+struct pg_encoding
+{
+	/* Space estimate for encode(); may overestimate, never underestimate. */
+	uint64		(*encode_len) (const char *data, size_t dlen);
+	/* Space estimate for decode(); same rule as encode_len. */
+	uint64		(*decode_len) (const char *data, size_t dlen);
+	/* Encode dlen bytes at data into res; returns the true output length. */
+	uint64		(*encode) (const char *data, size_t dlen, char *res);
+	/* Decode dlen bytes at data into res; returns the true output length. */
+	uint64		(*decode) (const char *data, size_t dlen, char *res);
+};
+
+static const struct pg_encoding *pg_find_encoding(const char *name);
+
+/*
+ * SQL functions.
+ */
+
+/*
+ * binary_encode
+ *		SQL-callable: convert a bytea value (argument 0) to text using the
+ *		encoding whose name is given in argument 1.
+ *
+ * Raises ERROR if the encoding name is not recognized or if the estimated
+ * output size cannot be allocated.  A FATAL is raised if the encoder wrote
+ * more than its own size estimate.
+ */
+Datum
+binary_encode(PG_FUNCTION_ARGS)
+{
+	bytea	   *input = PG_GETARG_BYTEA_PP(0);
+	Datum		encname_datum = PG_GETARG_DATUM(1);
+	char	   *encname = TextDatumGetCString(encname_datum);
+	const struct pg_encoding *codec = pg_find_encoding(encname);
+	char	   *inptr;
+	size_t		inlen;
+	uint64		estimate;
+	uint64		written;
+	text	   *out;
+
+	if (codec == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("unrecognized encoding: \"%s\"", encname)));
+
+	inptr = VARDATA_ANY(input);
+	inlen = VARSIZE_ANY_EXHDR(input);
+	estimate = codec->encode_len(inptr, inlen);
+
+	/*
+	 * The estimate is 64 bits wide and may exceed what a uint32 can hold, so
+	 * on 32-bit machines palloc's own size check cannot be relied upon;
+	 * reject oversized requests here instead.
+	 */
+	if (estimate > MaxAllocSize - VARHDRSZ)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("result of encoding conversion is too large")));
+
+	out = palloc(VARHDRSZ + estimate);
+	written = codec->encode(inptr, inlen, VARDATA(out));
+
+	/* Writing past the estimate means memory was overwritten: give up hard */
+	if (written > estimate)
+		elog(FATAL, "overflow - encode estimate too small");
+
+	SET_VARSIZE(out, VARHDRSZ + written);
+
+	PG_RETURN_TEXT_P(out);
+}
+
+/*
+ * binary_decode
+ *		SQL-callable: convert a text value (argument 0) to bytea using the
+ *		encoding whose name is given in argument 1.
+ *
+ * Raises ERROR if the encoding name is not recognized or if the estimated
+ * output size cannot be allocated.  A FATAL is raised if the decoder wrote
+ * more than its own size estimate.
+ */
+Datum
+binary_decode(PG_FUNCTION_ARGS)
+{
+	text	   *input = PG_GETARG_TEXT_PP(0);
+	Datum		encname_datum = PG_GETARG_DATUM(1);
+	char	   *encname = TextDatumGetCString(encname_datum);
+	const struct pg_encoding *codec = pg_find_encoding(encname);
+	char	   *inptr;
+	size_t		inlen;
+	uint64		estimate;
+	uint64		written;
+	bytea	   *out;
+
+	if (codec == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("unrecognized encoding: \"%s\"", encname)));
+
+	inptr = VARDATA_ANY(input);
+	inlen = VARSIZE_ANY_EXHDR(input);
+	estimate = codec->decode_len(inptr, inlen);
+
+	/*
+	 * The estimate is 64 bits wide and may exceed what a uint32 can hold, so
+	 * on 32-bit machines palloc's own size check cannot be relied upon;
+	 * reject oversized requests here instead.
+	 */
+	if (estimate > MaxAllocSize - VARHDRSZ)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("result of decoding conversion is too large")));
+
+	out = palloc(VARHDRSZ + estimate);
+	written = codec->decode(inptr, inlen, VARDATA(out));
+
+	/* Writing past the estimate means memory was overwritten: give up hard */
+	if (written > estimate)
+		elog(FATAL, "overflow - decode estimate too small");
+
+	SET_VARSIZE(out, VARHDRSZ + written);
+
+	PG_RETURN_BYTEA_P(out);
+}
+
+
+/*
+ * HEX
+ */
+
+/* Nibble value (0-15) -> lowercase hexadecimal digit. */
+static const char hextbl[] = "0123456789abcdef";
+
+/*
+ * ASCII code (0-127) -> nibble value.  '0'-'9' map to 0-9, and both 'A'-'F'
+ * and 'a'-'f' map to 10-15; every other entry is -1 (not a hex digit).
+ */
+static const int8 hexlookup[128] = {
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
+	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+
+/*
+ * hex_encode
+ *		Write each of the len bytes at src to dst as two lowercase hex digits,
+ *		high nibble first.
+ *
+ * dst must have room for 2 * len bytes; no terminator is written.  Returns
+ * the number of bytes written, which is always 2 * len.
+ */
+uint64
+hex_encode(const char *src, size_t len, char *dst)
+{
+	char	   *out = dst;
+	size_t		i;
+
+	for (i = 0; i < len; i++)
+	{
+		unsigned char byte = (unsigned char) src[i];
+
+		*out++ = hextbl[(byte >> 4) & 0xF];
+		*out++ = hextbl[byte & 0xF];
+	}
+
+	return (uint64) len * 2;
+}
+
+/*
+ * get_hex
+ *		Return the value (0-15) of the hexadecimal digit that cp points at.
+ *
+ * Raises ERROR if the byte is not a hex digit; the message quotes
+ * pg_mblen(cp) bytes starting at cp.
+ */
+static inline char
+get_hex(const char *cp)
+{
+	unsigned char ch = (unsigned char) *cp;
+	int			digit = (ch < 127) ? hexlookup[ch] : -1;
+
+	if (digit >= 0)
+		return (char) digit;
+
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+			 errmsg("invalid hexadecimal digit: \"%.*s\"",
+					pg_mblen(cp), cp)));
+
+	/* not reached when ereport(ERROR) does not return; keeps compilers quiet */
+	return (char) digit;
+}
+
+/*
+ * hex_decode
+ *		Convert pairs of hexadecimal digits at src into bytes at dst.
+ *
+ * Spaces, tabs, newlines and carriage returns are skipped wherever a new
+ * digit pair could start.  An invalid digit, or input that ends right after
+ * the first digit of a pair, raises ERROR.  Returns the number of bytes
+ * stored in dst.
+ */
+uint64
+hex_decode(const char *src, size_t len, char *dst)
+{
+	const char *in = src;
+	const char *const stop = src + len;
+	char	   *out = dst;
+
+	while (in < stop)
+	{
+		char		hi;
+		char		lo;
+
+		/* whitespace is tolerated between pairs, never inside one */
+		switch (*in)
+		{
+			case ' ':
+			case '\n':
+			case '\t':
+			case '\r':
+				in++;
+				continue;
+			default:
+				break;
+		}
+
+		hi = get_hex(in) << 4;
+		in++;
+
+		if (in >= stop)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("invalid hexadecimal data: odd number of digits")));
+
+		lo = get_hex(in);
+		in++;
+
+		*out++ = hi | lo;
+	}
+
+	return out - dst;
+}
+
+/*
+ * hex_enc_len
+ *		Output size of hex_encode: two digits per input byte.  The input
+ *		contents are not examined.
+ */
+static uint64
+hex_enc_len(const char *src, size_t srclen)
+{
+	uint64		nbytes = (uint64) srclen;
+
+	return nbytes * 2;
+}
+
+/*
+ * hex_dec_len
+ *		Upper bound on the output size of hex_decode: one byte per two input
+ *		characters.  The input contents are not examined, so any whitespace
+ *		that hex_decode skips makes this an overestimate.
+ */
+static uint64
+hex_dec_len(const char *src, size_t srclen)
+{
+	uint64		nchars = (uint64) srclen;
+
+	return nchars / 2;
+}
+
+/*
+ * BASE64
+ */
+
+/* 6-bit value (0-63) -> base64 alphabet character. */
+static const char _base64[] =
+"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/*
+ * ASCII code (0-127) -> 6-bit value.  'A'-'Z' map to 0-25, 'a'-'z' to 26-51,
+ * '0'-'9' to 52-61, '+' to 62 and '/' to 63; every other entry is -1.
+ */
+static const int8 b64lookup[128] = {
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
+	52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
+	-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+	15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
+	-1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+	41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
+};
+
+/*
+ * pg_base64_encode
+ *		Encode len bytes at src as base64 text at dst.
+ *
+ * Input bytes are packed three at a time into a 24-bit group that is written
+ * as four alphabet characters.  A '\n' is inserted each time 76 characters
+ * have been written since the start or since the previous newline.  A final
+ * group of one or two bytes is padded with '=' to four characters.  Returns
+ * the number of bytes written to dst; no terminator is added.
+ */
+static uint64
+pg_base64_encode(const char *src, size_t len, char *dst)
+{
+	char	   *p,
+			   *lend = dst + 76;
+	const char *s,
+			   *end = src + len;
+	int			pos = 2;
+	uint32		buf = 0;
+
+	s = src;
+	p = dst;
+
+	while (s < end)
+	{
+		/* pos is the byte slot (2, 1, 0) in the group; shift by pos * 8 bits */
+		buf |= (unsigned char) *s << (pos << 3);
+		pos--;
+		s++;
+
+		/*
+		 * write it out: pos went below zero, so three bytes are buffered and
+		 * a full group of four characters can be emitted
+		 */
+		if (pos < 0)
+		{
+			*p++ = _base64[(buf >> 18) & 0x3f];
+			*p++ = _base64[(buf >> 12) & 0x3f];
+			*p++ = _base64[(buf >> 6) & 0x3f];
+			*p++ = _base64[buf & 0x3f];
+
+			pos = 2;
+			buf = 0;
+		}
+		/* lend marks where the current output line reaches 76 characters */
+		if (p >= lend)
+		{
+			*p++ = '\n';
+			lend = p + 76;
+		}
+	}
+	/* pos == 1: one leftover byte ("xx=="); pos == 0: two leftover ("xxx=") */
+	if (pos != 2)
+	{
+		*p++ = _base64[(buf >> 18) & 0x3f];
+		*p++ = _base64[(buf >> 12) & 0x3f];
+		*p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
+		*p++ = '=';
+	}
+
+	return p - dst;
+}
+
+/*
+ * pg_base64_decode
+ *		Decode base64 text of length len at src into bytes at dst.
+ *
+ * Spaces, tabs, newlines and carriage returns are ignored.  Every other
+ * character must be a base64 alphabet character or '='; four of them form a
+ * group that yields up to three bytes.  Raises ERROR on an invalid symbol,
+ * on a '=' in the first or second position of a group, or when the input
+ * ends in the middle of a group.  Returns the number of bytes written.
+ */
+static uint64
+pg_base64_decode(const char *src, size_t len, char *dst)
+{
+	const char *srcend = src + len,
+			   *s = src;
+	char	   *p = dst;
+	char		c;
+	int			b = 0;
+	uint32		buf = 0;
+	int			pos = 0,
+				end = 0;
+
+	while (s < srcend)
+	{
+		c = *s++;
+
+		if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+			continue;
+
+		if (c == '=')
+		{
+			/*
+			 * end sequence: the first '=' seen fixes how many bytes the
+			 * group yields -- 1 if it is the third character of the group,
+			 * 2 if it is the fourth.
+			 *
+			 * NOTE(review): end is never reset, so input that continues
+			 * after a padded group is still accepted and later groups are
+			 * truncated the same way -- confirm this leniency is intended.
+			 */
+			if (!end)
+			{
+				if (pos == 2)
+					end = 1;
+				else if (pos == 3)
+					end = 2;
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+							 errmsg("unexpected \"=\" while decoding base64 sequence")));
+			}
+			/* padding contributes zero bits to the group */
+			b = 0;
+		}
+		else
+		{
+			b = -1;
+			/* only consult the lookup table for bytes in its ASCII range */
+			if (c > 0 && c < 127)
+				b = b64lookup[(unsigned char) c];
+			if (b < 0)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid symbol \"%.*s\" found while decoding base64 sequence",
+								pg_mblen(s - 1), s - 1)));
+		}
+		/* add it to buffer */
+		buf = (buf << 6) + b;
+		pos++;
+		/* four characters collected: emit 3 bytes, fewer if padding was seen */
+		if (pos == 4)
+		{
+			*p++ = (buf >> 16) & 255;
+			if (end == 0 || end > 1)
+				*p++ = (buf >> 8) & 255;
+			if (end == 0 || end > 2)
+				*p++ = buf & 255;
+			buf = 0;
+			pos = 0;
+		}
+	}
+
+	/* leftover characters mean the last group was incomplete */
+	if (pos != 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("invalid base64 end sequence"),
+				 errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
+
+	return p - dst;
+}
+
+
+/*
+ * pg_base64_enc_len
+ *		Space estimate for pg_base64_encode.  The input contents are not
+ *		examined; only srclen matters.
+ */
+static uint64
+pg_base64_enc_len(const char *src, size_t srclen)
+{
+	uint64		nbytes = (uint64) srclen;
+	uint64		encoded;
+	uint64		linefeeds;
+
+	/* every 3 input bytes become 4 characters, rounding the total up */
+	encoded = (nbytes + 2) * 4 / 3;
+
+	/* one linefeed per 76 output characters, i.e. per 76 * 3 / 4 input bytes */
+	linefeeds = nbytes / (76 * 3 / 4);
+
+	return encoded + linefeeds;
+}
+
+/*
+ * pg_base64_dec_len
+ *		Space estimate for pg_base64_decode: three output bytes for every
+ *		four input characters.  The input contents are not examined.
+ */
+static uint64
+pg_base64_dec_len(const char *src, size_t srclen)
+{
+	uint64		nchars = (uint64) srclen;
+
+	return (nchars * 3) / 4;
+}
+
+/*
+ * Escape
+ * Minimally escape bytea to text.
+ * De-escape text to bytea.
+ *
+ * We must escape zero bytes and high-bit-set bytes to avoid generating
+ * text that might be invalid in the current encoding, or that might
+ * change to something else if passed through an encoding conversion
+ * (leading to failing to de-escape to the original bytea value).
+ * Also of course backslash itself has to be escaped.
+ *
+ * De-escaping processes \\ and any \### octal
+ */
+
+/* VAL: numeric value of the digit character CH (subtracts '0'). */
+#define VAL(CH)			((CH) - '0')
+/* DIG: digit character for the small integer VAL (adds '0'). */
+#define DIG(VAL)		((VAL) + '0')
+
+/*
+ * esc_encode
+ *		Copy srclen bytes from src to dst, escaping where required.
+ *
+ * A backslash is written as two backslashes.  A zero byte or a byte with the
+ * high bit set is written as a backslash followed by three octal digits.
+ * All other bytes are copied unchanged.  Returns the number of bytes written
+ * to dst.
+ */
+static uint64
+esc_encode(const char *src, size_t srclen, char *dst)
+{
+	char	   *out = dst;
+	size_t		i;
+
+	for (i = 0; i < srclen; i++)
+	{
+		unsigned char byte = (unsigned char) src[i];
+
+		if (byte == '\\')
+		{
+			*out++ = '\\';
+			*out++ = '\\';
+		}
+		else if (byte == '\0' || IS_HIGHBIT_SET(byte))
+		{
+			*out++ = '\\';
+			*out++ = DIG(byte >> 6);
+			*out++ = DIG((byte >> 3) & 7);
+			*out++ = DIG(byte & 7);
+		}
+		else
+			*out++ = byte;
+	}
+
+	return out - dst;
+}
+
+/*
+ * esc_decode
+ *		Copy srclen bytes of escaped text at src to dst, undoing the escapes.
+ *
+ * A backslash followed by three octal digits (the first limited to 0-3)
+ * becomes the byte with that value, two backslashes become one, and any
+ * other byte is copied unchanged.  A backslash not followed by either form
+ * raises ERROR.  Returns the number of bytes written to dst.
+ */
+static uint64
+esc_decode(const char *src, size_t srclen, char *dst)
+{
+	const char *end = src + srclen;
+	char	   *rp = dst;
+	uint64		len = 0;
+
+	while (src < end)
+	{
+		if (src[0] != '\\')
+			*rp++ = *src++;
+		/* src + 3 < end: all three digits must lie inside the input */
+		else if (src + 3 < end &&
+				 (src[1] >= '0' && src[1] <= '3') &&
+				 (src[2] >= '0' && src[2] <= '7') &&
+				 (src[3] >= '0' && src[3] <= '7'))
+		{
+			int			val;
+
+			/* accumulate the three octal digits, most significant first */
+			val = VAL(src[1]);
+			val <<= 3;
+			val += VAL(src[2]);
+			val <<= 3;
+			*rp++ = val + VAL(src[3]);
+			src += 4;
+		}
+		else if (src + 1 < end &&
+				 (src[1] == '\\'))
+		{
+			*rp++ = '\\';
+			src += 2;
+		}
+		else
+		{
+			/*
+			 * One backslash, not followed by ### valid octal. Should never
+			 * get here, since esc_dec_len does same check.
+			 */
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type %s", "bytea")));
+		}
+
+		/* every branch that gets here stored exactly one output byte */
+		len++;
+	}
+
+	return len;
+}
+
+/*
+ * esc_enc_len
+ *		Count the bytes needed to escape-encode srclen bytes at src: 2 for a
+ *		backslash, 4 for a zero byte or a byte with the high bit set, and 1
+ *		for anything else.
+ */
+static uint64
+esc_enc_len(const char *src, size_t srclen)
+{
+	uint64		total = 0;
+	size_t		i;
+
+	for (i = 0; i < srclen; i++)
+	{
+		char		ch = src[i];
+
+		if (ch == '\\')
+			total += 2;
+		else if (ch == '\0' || IS_HIGHBIT_SET(ch))
+			total += 4;
+		else
+			total += 1;
+	}
+
+	return total;
+}
+
+/*
+ * esc_dec_len
+ *		Count the bytes that de-escaping srclen bytes at src will produce.
+ *
+ * Each plain byte, each backslash followed by three octal digits (the first
+ * limited to 0-3), and each pair of backslashes counts as one output byte.
+ * A backslash followed by anything else raises ERROR.
+ */
+static uint64
+esc_dec_len(const char *src, size_t srclen)
+{
+	const char *cur = src;
+	const char *const stop = src + srclen;
+	uint64		count = 0;
+
+	while (cur < stop)
+	{
+		size_t		remaining = stop - cur;
+
+		if (*cur != '\\')
+		{
+			/* ordinary byte */
+			cur += 1;
+		}
+		else if (remaining > 3 &&
+				 (cur[1] >= '0' && cur[1] <= '3') &&
+				 (cur[2] >= '0' && cur[2] <= '7') &&
+				 (cur[3] >= '0' && cur[3] <= '7'))
+		{
+			/* backslash followed by a valid three-digit octal value */
+			cur += 4;
+		}
+		else if (remaining > 1 && cur[1] == '\\')
+		{
+			/* doubled backslash stands for a single one */
+			cur += 2;
+		}
+		else
+		{
+			/* lone backslash without a valid continuation */
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type %s", "bytea")));
+		}
+
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * Common
+ */
+
+/*
+ * Table of supported encodings, searched by pg_find_encoding().  Each entry
+ * pairs a name with its function set, listed in struct pg_encoding field
+ * order: encode_len, decode_len, encode, decode.  The entry with a NULL name
+ * terminates the table.
+ */
+static const struct
+{
+	const char *name;
+	struct pg_encoding enc;
+}			enclist[] =
+
+{
+	{
+		"hex",
+		{
+			hex_enc_len, hex_dec_len, hex_encode, hex_decode
+		}
+	},
+	{
+		"base64",
+		{
+			pg_base64_enc_len, pg_base64_dec_len, pg_base64_encode, pg_base64_decode
+		}
+	},
+	{
+		"escape",
+		{
+			esc_enc_len, esc_dec_len, esc_encode, esc_decode
+		}
+	},
+	{
+		/* end-of-table marker */
+		NULL,
+		{
+			NULL, NULL, NULL, NULL
+		}
+	}
+};
+
+/*
+ * pg_find_encoding
+ *		Look up an encoding by name using pg_strcasecmp.
+ *
+ * Returns a pointer to the matching entry's function set inside enclist, or
+ * NULL when no entry has that name.
+ */
+static const struct pg_encoding *
+pg_find_encoding(const char *name)
+{
+	int			idx = 0;
+
+	/* the table ends at the first entry whose name is NULL */
+	while (enclist[idx].name != NULL)
+	{
+		if (pg_strcasecmp(enclist[idx].name, name) == 0)
+			return &enclist[idx].enc;
+		idx++;
+	}
+
+	return NULL;
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 12:15:05 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 12:15:05 +0000
commit	46651ce6fe013220ed397add242004d764fc0153 (patch)
tree	6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/adt/encode.c
parent	Initial commit. (diff)
download	postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip