From 5e45211a64149b3c659b90ff2de6fa982a5a93ed Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sat, 4 May 2024 14:17:33 +0200
Subject: Adding upstream version 15.5.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 src/fe_utils/mbprint.c | 405 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 405 insertions(+)
 create mode 100644 src/fe_utils/mbprint.c

(limited to 'src/fe_utils/mbprint.c')

diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c
new file mode 100644
index 0000000..067f281
--- /dev/null
+++ b/src/fe_utils/mbprint.c
@@ -0,0 +1,405 @@
+/*-------------------------------------------------------------------------
+ *
+ * Multibyte character printing support for frontend code
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/fe_utils/mbprint.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include "fe_utils/mbprint.h"
+
+#include "libpq-fe.h"
+
+
+/*
+ * To avoid version-skew problems, this file must not use declarations
+ * from pg_wchar.h: the encoding IDs we are dealing with are determined
+ * by the libpq.so we are linked with, and that might not match the
+ * numbers we see at compile time.  (If this file were inside libpq,
+ * the problem would go away...)
+ *
+ * Hence, we have our own definition of pg_wchar, and we get the values
+ * of any needed encoding IDs on-the-fly.
+ */
+
+typedef unsigned int pg_wchar;
+
+static int
+pg_get_utf8_id(void)
+{
+	static int	cached_id = -1;
+
+	/* resolve the ID at run time on first use; a negative result is retried */
+	return (cached_id >= 0) ? cached_id :
+		(cached_id = pg_char_to_encoding("utf8"));
+}
+
+#define PG_UTF8		pg_get_utf8_id()
+
+
+/*
+ * Convert a UTF-8 character to a Unicode code point.
+ * This is a one-character version of pg_utf2wchar_with_len.
+ *
+ * No error checks here, c must point to a long-enough string.
+ */
+static pg_wchar
+utf8_to_unicode(const unsigned char *c)
+{
+	const pg_wchar lead = c[0];
+
+	if (lead < 0x80)			/* 0xxxxxxx: plain ASCII */
+		return lead;
+	if ((lead >> 5) == 0x06)	/* 110xxxxx: two-byte sequence */
+		return ((lead & 0x1f) << 6) | (pg_wchar) (c[1] & 0x3f);
+	if ((lead >> 4) == 0x0e)	/* 1110xxxx: three-byte sequence */
+		return ((lead & 0x0f) << 12) |
+			((pg_wchar) (c[1] & 0x3f) << 6) |
+			(pg_wchar) (c[2] & 0x3f);
+	if ((lead >> 3) == 0x1e)	/* 11110xxx: four-byte sequence */
+		return ((lead & 0x07) << 18) |
+			((pg_wchar) (c[1] & 0x3f) << 12) |
+			((pg_wchar) (c[2] & 0x3f) << 6) |
+			(pg_wchar) (c[3] & 0x3f);
+	/* not a usable lead byte: hand back a deliberately invalid code */
+	return 0xffffffff;
+}
+
+
+/*
+ * Unicode 3.1 compliant validation: returns the byte length (1..4) of the
+ * UTF-8 character at c, or -1 if it is malformed, overlong, or encodes
+ * ucs > 0x10ffff, (ucs & 0xfffe) == 0xfffe, 0xfdd0 <= ucs <= 0xfdef, or a
+ * surrogate ((ucs & 0xf800) == 0xd800).  Trail bytes are tested in order.
+ */
+static int
+utf_charcheck(const unsigned char *c)
+{
+	if ((*c & 0x80) == 0)
+		return 1;
+	else if ((*c & 0xe0) == 0xc0)
+	{
+		/* two-byte char; lead bytes 0xc0 and 0xc1 would be overlong */
+		if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
+			return 2;
+		return -1;
+	}
+	else if ((*c & 0xf0) == 0xe0)
+	{
+		/* three-byte char; 0xe0 followed by 0x80..0x9f would be overlong */
+		if (((c[1] & 0xc0) == 0x80) &&
+			(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
+			((c[2] & 0xc0) == 0x80))
+		{
+			int			z = c[0] & 0x0f;	/* code point bits 15..12 */
+			int			yx = ((c[1] & 0x3f) << 6) | (c[2] & 0x3f);	/* bits 11..0 */
+			int			lx = yx & 0x7f;	/* bits 6..0 */
+
+			/* check 0xfffe/0xffff, 0xfdd0..0xfdef range, surrogates */
+			if (((z == 0x0f) &&
+				 (((yx & 0xffe) == 0xffe) ||
+				  (((yx & 0xf80) == 0xd80) && (lx >= 0x50) && (lx <= 0x6f)))) ||
+				((z == 0x0d) && ((yx & 0x800) == 0x800)))
+				return -1;
+			return 3;
+		}
+		return -1;
+	}
+	else if ((*c & 0xf8) == 0xf0)
+	{
+		int			u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);	/* bits 20..16 */
+
+		/* four-byte char; u must be 0x01..0x10 (0x10000..0x10ffff) */
+		if (((c[1] & 0xc0) == 0x80) &&
+			(u > 0x00) && (u <= 0x10) &&
+			((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
+		{
+			/* test for 0xzzfffe/0xzzffff (last two code points of a plane) */
+			if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
+				((c[3] & 0x3e) == 0x3e))
+				return -1;
+			return 4;
+		}
+		return -1;
+	}
+	return -1;
+}
+
+
+static void
+mb_utf_validate(unsigned char *pwcs)
+{
+	unsigned char *src = pwcs;	/* next byte to examine */
+	unsigned char *dst = pwcs;	/* where the next kept byte is stored */
+
+	/* Compact the string in place, keeping only characters that validate. */
+	while (*src != '\0')
+	{
+		int			nbytes = utf_charcheck(src);
+
+		if (nbytes <= 0)
+		{
+			/* not a valid character start: drop this one byte and rescan */
+			src++;
+			continue;
+		}
+
+		if (dst == src)
+		{
+			/* nothing dropped so far, so the bytes are already in place */
+			src += nbytes;
+			dst += nbytes;
+			continue;
+		}
+		while (nbytes-- > 0)
+			*dst++ = *src++;
+	}
+	if (dst != src)
+		*dst = '\0';
+}
+
+/*
+ * public functions: pg_wcswidth, pg_wcssize, pg_wcsformat and mbvalidate
+ */
+
+/*
+ * pg_wcswidth is the simple display-width function: it sums the positive
+ * display widths of the characters in the first len bytes of pwcs, as if
+ * they all sat on one line.  Easier to use than pg_wcssize when that holds.
+ */
+int
+pg_wcswidth(const char *pwcs, size_t len, int encoding)
+{
+	size_t		remaining = len;
+	int			total = 0;
+
+	for (;;)
+	{
+		int			nbytes;
+		int			cols;
+
+		if (remaining == 0)
+			break;
+		nbytes = PQmblen(pwcs, encoding);
+		if ((size_t) nbytes > remaining)
+			break;				/* character runs past the end: invalid string */
+		cols = PQdsplen(pwcs, encoding);
+		total += (cols > 0) ? cols : 0;
+		pwcs += nbytes;
+		remaining -= nbytes;
+	}
+	return total;
+}
+
+/*
+ * pg_wcssize takes the given string in the given encoding and returns three
+ * values:
+ *	  result_width: Width in display characters of the longest line in string
+ *	  result_height: Number of lines in display output
+ *	  result_format_size: Number of bytes required to store formatted
+ *		representation of string
+ *
+ * This MUST be kept in sync with pg_wcsformat!
+ */
+void
+pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
+		   int *result_width, int *result_height, int *result_format_size)
+{
+	int			w,				/* display width of the current character */
+				chlen = 0,		/* byte length of the current character */
+				linewidth = 0;	/* width of the line being measured */
+	int			width = 0;		/* widest line seen so far */
+	int			height = 1;		/* line count; even an empty string is one line */
+	int			format_size = 0;	/* bytes needed for the formatted text */
+
+	for (; *pwcs && len > 0; pwcs += chlen)
+	{
+		chlen = PQmblen((const char *) pwcs, encoding);
+		if (len < (size_t) chlen)
+			break;				/* character extends past len: stop here */
+		w = PQdsplen((const char *) pwcs, encoding);
+
+		if (chlen == 1)			/* single-byte char */
+		{
+			if (*pwcs == '\n')	/* Newline: close the current line */
+			{
+				if (linewidth > width)
+					width = linewidth;
+				linewidth = 0;
+				height += 1;
+				format_size += 1;	/* For the NUL that ends this line */
+			}
+			else if (*pwcs == '\r') /* Carriage return: two columns, two bytes */
+			{
+				linewidth += 2;
+				format_size += 2;
+			}
+			else if (*pwcs == '\t') /* Tab: advance to next multiple of 8 */
+			{
+				do
+				{
+					linewidth++;
+					format_size++;
+				} while (linewidth % 8 != 0);
+			}
+			else if (w < 0)		/* Other control char: four columns and bytes */
+			{
+				linewidth += 4;
+				format_size += 4;
+			}
+			else				/* Output it as-is */
+			{
+				linewidth += w;
+				format_size += 1;
+			}
+		}
+		else if (w < 0)			/* Non-ascii control char */
+		{
+			linewidth += 6;		/* six columns, e.g. \u0085 */
+			format_size += 6;
+		}
+		else					/* All other chars: bytes copied unchanged */
+		{
+			linewidth += w;
+			format_size += chlen;
+		}
+		len -= chlen;
+	}
+	if (linewidth > width)		/* account for the last (unterminated) line */
+		width = linewidth;
+	format_size += 1;			/* For NUL char */
+
+	/* Set results; each output pointer is optional and may be NULL */
+	if (result_width)
+		*result_width = width;
+	if (result_height)
+		*result_height = height;
+	if (result_format_size)
+		*result_format_size = format_size;
+}
+
+/*
+ *	Format a string into one or more "struct lineptr" lines, writing the text
+ *	into the buffer at lines[0].ptr.  lines[] needs count + 1 entries, since
+ *	lines[i].ptr == NULL marks the end of the array; exits if the string has
+ *	more than count lines.  This MUST be kept in sync with pg_wcssize!
+ */
+void
+pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
+			 struct lineptr *lines, int count)
+{
+	int			w,				/* display width of the current character */
+				chlen = 0;		/* byte length of the current character */
+	int			linewidth = 0;	/* display width of the line built so far */
+	unsigned char *ptr = lines->ptr;	/* Pointer to data area */
+
+	for (; *pwcs && len > 0; pwcs += chlen)
+	{
+		chlen = PQmblen((const char *) pwcs, encoding);
+		if (len < (size_t) chlen)
+			break;				/* character extends past len: stop here */
+		w = PQdsplen((const char *) pwcs, encoding);
+
+		if (chlen == 1)			/* single-byte char */
+		{
+			if (*pwcs == '\n')	/* Newline */
+			{
+				*ptr++ = '\0';
+				lines->width = linewidth;
+				linewidth = 0;
+				lines++;
+				count--;
+				if (count <= 0)
+					exit(1);	/* Screwup: more lines than the caller allowed */
+
+				/* make next line point to remaining memory */
+				lines->ptr = ptr;
+			}
+			else if (*pwcs == '\r') /* Carriage return */
+			{
+				strcpy((char *) ptr, "\\r");
+				linewidth += 2;
+				ptr += 2;
+			}
+			else if (*pwcs == '\t') /* Tab: pad with spaces to a multiple of 8 */
+			{
+				do
+				{
+					*ptr++ = ' ';
+					linewidth++;
+				} while (linewidth % 8 != 0);
+			}
+			else if (w < 0)		/* Other control char */
+			{
+				sprintf((char *) ptr, "\\x%02X", *pwcs);
+				linewidth += 4;
+				ptr += 4;
+			}
+			else				/* Output it as-is */
+			{
+				linewidth += w;
+				*ptr++ = *pwcs;
+			}
+		}
+		else if (w < 0)			/* Non-ascii control char */
+		{
+			if (encoding == PG_UTF8)
+				/* %04X is only a minimum width; cap at the 6 bytes budgeted */
+				snprintf((char *) ptr, 7, "\\u%04X", utf8_to_unicode(pwcs));
+			else
+			{
+				/*
+				 * Cannot happen today: only UTF-8 signals multibyte control
+				 * characters.  But we may need to support it at some stage.
+				 */
+				sprintf((char *) ptr, "\\u????");
+			}
+			ptr += 6;
+			linewidth += 6;
+		}
+		else					/* All other chars */
+		{
+			int			i;
+
+			for (i = 0; i < chlen; i++)
+				*ptr++ = pwcs[i];
+			linewidth += w;
+		}
+		len -= chlen;
+	}
+	lines->width = linewidth;
+	*ptr++ = '\0';				/* Terminate formatted string */
+
+	if (count <= 0)
+		exit(1);				/* Screwup */
+
+	(lines + 1)->ptr = NULL;	/* terminate line array */
+}
+
+
+/*
+ * Encoding validation: delete any unvalidatable characters from the string
+ *
+ * This seems redundant with existing functionality elsewhere?
+ */
+unsigned char *
+mbvalidate(unsigned char *pwcs, int encoding)
+{
+	/*
+	 * Only UTF-8 is checked at present; any other encoding that needs
+	 * validation should get its own routine and be dispatched from here.
+	 * Until then such strings are handed back untouched.
+	 */
+	if (encoding != PG_UTF8)
+		return pwcs;
+
+	/* edits the string in place, so the same pointer is returned */
+	mb_utf_validate(pwcs);
+	return pwcs;
+}
-- 
cgit v1.2.3