summaryrefslogtreecommitdiffstats
path: root/src/fe_utils/mbprint.c
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
commit 46651ce6fe013220ed397add242004d764fc0153 (patch)
tree 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/fe_utils/mbprint.c
parent Initial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/fe_utils/mbprint.c')
-rw-r--r-- src/fe_utils/mbprint.c 405
1 files changed, 405 insertions, 0 deletions
diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c
new file mode 100644
index 0000000..fe3faba
--- /dev/null
+++ b/src/fe_utils/mbprint.c
@@ -0,0 +1,405 @@
+/*-------------------------------------------------------------------------
+ *
+ * Multibyte character printing support for frontend code
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/fe_utils/mbprint.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include "fe_utils/mbprint.h"
+
+#include "libpq-fe.h"
+
+
+/*
+ * This file deliberately stays away from the declarations in pg_wchar.h.
+ * The encoding IDs we handle are defined by whichever libpq.so we end up
+ * linked with, and those need not match the numbers visible at compile
+ * time.  (Were this file part of libpq, the issue would not arise.)
+ *
+ * So we carry a private definition of pg_wchar and look up any encoding
+ * ID we need at run time.
+ */
+
+typedef unsigned int pg_wchar;
+
+/*
+ * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
+ * Once a non-negative ID has been obtained it is cached and returned
+ * directly on later calls; a negative result is returned as-is and the
+ * lookup is retried next time.
+ */
+static int
+pg_get_utf8_id(void)
+{
+    static int  cached_id = -1;
+
+    if (cached_id >= 0)
+        return cached_id;
+
+    cached_id = pg_char_to_encoding("utf8");
+    return cached_id;
+}
+
+#define PG_UTF8 pg_get_utf8_id()
+
+
+/*
+ * Decode the single UTF-8 sequence starting at c into a code point.
+ * This is a one-character version of pg_utf2wchar_with_len.
+ *
+ * Only the lead byte is inspected to pick the sequence length; the
+ * continuation bytes are used without validation, so the caller must
+ * guarantee that enough bytes are readable.  A lead byte matching none of
+ * the one- to four-byte patterns yields 0xffffffff, which is deliberately
+ * not a valid code point.
+ */
+static pg_wchar
+utf8_to_unicode(const unsigned char *c)
+{
+    const unsigned char lead = c[0];
+    pg_wchar    ucs;
+
+    if ((lead & 0x80) == 0)
+    {
+        /* 0xxxxxxx */
+        ucs = lead;
+    }
+    else if ((lead & 0xe0) == 0xc0)
+    {
+        /* 110xxxxx 10xxxxxx */
+        ucs = lead & 0x1f;
+        ucs = (ucs << 6) | (c[1] & 0x3f);
+    }
+    else if ((lead & 0xf0) == 0xe0)
+    {
+        /* 1110xxxx 10xxxxxx 10xxxxxx */
+        ucs = lead & 0x0f;
+        ucs = (ucs << 6) | (c[1] & 0x3f);
+        ucs = (ucs << 6) | (c[2] & 0x3f);
+    }
+    else if ((lead & 0xf8) == 0xf0)
+    {
+        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+        ucs = lead & 0x07;
+        ucs = (ucs << 6) | (c[1] & 0x3f);
+        ucs = (ucs << 6) | (c[2] & 0x3f);
+        ucs = (ucs << 6) | (c[3] & 0x3f);
+    }
+    else
+    {
+        /* that is an invalid code on purpose */
+        ucs = 0xffffffff;
+    }
+
+    return ucs;
+}
+
+
+/*
+ * Validate the UTF-8 sequence that starts at c.
+ *
+ * Returns the length in bytes (1..4) of a well-formed sequence, or -1 if
+ * the sequence is malformed or encodes a code point we refuse to accept:
+ *   - overlong (non-shortest-form) encodings
+ *   - surrogates, 0xd800..0xdfff
+ *   - the noncharacter block 0xfdd0..0xfdef
+ *   - any code point whose low 16 bits are 0xfffe or 0xffff
+ *   - code points above 0x10ffff
+ *
+ * c must point into a NUL-terminated string.  Every continuation byte is
+ * tested before the following one is read, and a NUL never passes that
+ * test, so a truncated sequence is rejected without reading beyond the
+ * terminator.
+ */
+static int
+utf_charcheck(const unsigned char *c)
+{
+    if ((c[0] & 0x80) == 0)
+        return 1;
+
+    if ((c[0] & 0xe0) == 0xc0)
+    {
+        /* two-byte char; a lead payload of 0 or 1 would be overlong */
+        if ((c[1] & 0xc0) == 0x80 && (c[0] & 0x1f) > 0x01)
+            return 2;
+        return -1;
+    }
+
+    if ((c[0] & 0xf0) == 0xe0)
+    {
+        unsigned int ucs;
+
+        /* three-byte char */
+        if ((c[1] & 0xc0) != 0x80 || (c[2] & 0xc0) != 0x80)
+            return -1;
+
+        /*
+         * Decode the full code point (using all three bytes) and test it
+         * against plain ranges rather than bit masks on partial values.
+         */
+        ucs = ((unsigned int) (c[0] & 0x0f) << 12) |
+            ((unsigned int) (c[1] & 0x3f) << 6) |
+            (unsigned int) (c[2] & 0x3f);
+
+        if (ucs < 0x0800)
+            return -1;          /* overlong encoding */
+        if (ucs >= 0xd800 && ucs <= 0xdfff)
+            return -1;          /* surrogates */
+        if (ucs >= 0xfdd0 && ucs <= 0xfdef)
+            return -1;          /* noncharacter block */
+        if ((ucs & 0xfffe) == 0xfffe)
+            return -1;          /* 0xfffe and 0xffff */
+        return 3;
+    }
+
+    if ((c[0] & 0xf8) == 0xf0)
+    {
+        int         plane;
+
+        /* four-byte char */
+        if ((c[1] & 0xc0) != 0x80 ||
+            (c[2] & 0xc0) != 0x80 ||
+            (c[3] & 0xc0) != 0x80)
+            return -1;
+
+        /* bits 16..20 of the code point: 0 is overlong, > 0x10 too large */
+        plane = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);
+        if (plane < 0x01 || plane > 0x10)
+            return -1;
+
+        /* test for 0xzzfffe/0xzzffff */
+        if ((c[1] & 0x0f) == 0x0f && (c[2] & 0x3f) == 0x3f &&
+            (c[3] & 0x3e) == 0x3e)
+            return -1;
+        return 4;
+    }
+
+    return -1;
+}
+
+
+/*
+ * Strip, in place, every byte of the NUL-terminated string pwcs that does
+ * not start a sequence accepted by utf_charcheck().  Accepted sequences
+ * are shifted down over the gaps left by rejected bytes.  When a sequence
+ * is rejected only its first byte is dropped; checking resumes at the
+ * very next byte.
+ *
+ * The string is written to only if something was actually removed.
+ */
+static void
+mb_utf_validate(unsigned char *pwcs)
+{
+    unsigned char *rd = pwcs;   /* next byte to examine */
+    unsigned char *wr = pwcs;   /* where the next kept byte goes */
+
+    while (*rd != '\0')
+    {
+        int         seqlen = utf_charcheck(rd);
+
+        if (seqlen <= 0)
+        {
+            /* we skip the char */
+            rd++;
+            continue;
+        }
+
+        if (wr == rd)
+        {
+            /* nothing dropped so far, the bytes are already in place */
+            rd += seqlen;
+            wr += seqlen;
+        }
+        else
+        {
+            while (seqlen-- > 0)
+                *wr++ = *rd++;
+        }
+    }
+
+    /* re-terminate only if the string got shorter */
+    if (wr != rd)
+        *wr = '\0';
+}
+
+/*
+ * public functions : wcswidth and mbvalidate
+ */
+
+/*
+ * pg_wcswidth is the simple display-width function: it adds up the
+ * display widths of the characters in the first len bytes of pwcs as if
+ * everything appeared on one line.  It is easier to use than pg_wcssize
+ * when that assumption holds.
+ *
+ * Characters for which PQdsplen() reports a width of zero or less add
+ * nothing to the total.  Scanning stops early if PQmblen() reports a
+ * character that would extend past the remaining len bytes.
+ */
+int
+pg_wcswidth(const char *pwcs, size_t len, int encoding)
+{
+    int         total = 0;
+
+    while (len != 0)
+    {
+        int         nbytes = PQmblen(pwcs, encoding);
+        int         cols;
+
+        if ((size_t) nbytes > len)
+            break;              /* Invalid string */
+
+        cols = PQdsplen(pwcs, encoding);
+        total += (cols > 0) ? cols : 0;
+
+        pwcs += nbytes;
+        len -= nbytes;
+    }
+
+    return total;
+}
+
+/*
+ * pg_wcssize measures the string pwcs (at most len bytes, stopping at a
+ * NUL byte) in the given encoding and reports up to three values; any of
+ * the result pointers may be NULL if the caller is not interested:
+ *  result_width: width in display characters of the longest line
+ *  result_height: number of lines in the display output
+ *  result_format_size: number of bytes needed to store the formatted
+ *      representation, counting one terminating NUL per line
+ *
+ * This MUST be kept in sync with pg_wcsformat!
+ */
+void
+pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
+           int *result_width, int *result_height, int *result_format_size)
+{
+    int         widest = 0;     /* widest finished line so far */
+    int         cur = 0;        /* width of the line being scanned */
+    int         nlines = 1;
+    int         nbytes = 0;     /* formatted size so far */
+
+    while (*pwcs != '\0' && len > 0)
+    {
+        int         chlen = PQmblen((const char *) pwcs, encoding);
+        int         w;
+
+        if ((size_t) chlen > len)
+            break;
+        w = PQdsplen((const char *) pwcs, encoding);
+
+        if (chlen != 1)
+        {
+            if (w < 0)
+            {
+                /* non-ASCII control char, shown as \u0000 */
+                cur += 6;
+                nbytes += 6;
+            }
+            else
+            {
+                /* ordinary multibyte char, copied verbatim */
+                cur += w;
+                nbytes += chlen;
+            }
+        }
+        else
+        {
+            switch (*pwcs)
+            {
+                case '\n':
+                    /* newline closes the current line */
+                    if (cur > widest)
+                        widest = cur;
+                    cur = 0;
+                    nlines++;
+                    nbytes++;   /* for the line's NUL char */
+                    break;
+
+                case '\r':
+                    /* carriage return takes two columns and two bytes */
+                    cur += 2;
+                    nbytes += 2;
+                    break;
+
+                case '\t':
+                    /* tab advances to the next multiple of 8 columns */
+                    do
+                    {
+                        cur++;
+                        nbytes++;
+                    } while (cur % 8 != 0);
+                    break;
+
+                default:
+                    if (w < 0)
+                    {
+                        /* other control char takes four */
+                        cur += 4;
+                        nbytes += 4;
+                    }
+                    else
+                    {
+                        /* output as-is */
+                        cur += w;
+                        nbytes += 1;
+                    }
+                    break;
+            }
+        }
+
+        pwcs += chlen;
+        len -= chlen;
+    }
+
+    /* account for the last (unterminated) line */
+    if (cur > widest)
+        widest = cur;
+    nbytes += 1;                /* for NUL char */
+
+    /* Set results */
+    if (result_width != NULL)
+        *result_width = widest;
+    if (result_height != NULL)
+        *result_height = nlines;
+    if (result_format_size != NULL)
+        *result_format_size = nbytes;
+}
+
+/*
+ * Format a string into one or more "struct lineptr" lines.
+ * lines[i].ptr == NULL indicates the end of the array.
+ *
+ * pwcs/len/encoding: the input; scanning stops at a NUL byte, after len
+ *      bytes, or at a character that would extend past len.
+ * lines: output array.  lines[0].ptr must already point at the data area;
+ *      every later line's ptr is set to the memory right after the
+ *      previous line's terminating NUL, and each line's width is filled in.
+ * count: number of lines that may be used; the process exits with status 1
+ *      if the input needs more.
+ *
+ * NOTE(review): the terminating NULL ptr is stored in the entry after the
+ * last line used, so the array presumably needs room for it -- confirm
+ * against callers.
+ *
+ * Every branch below writes exactly as many bytes as pg_wcssize budgets
+ * for it.  This MUST be kept in sync with pg_wcssize!
+ */
+void
+pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
+             struct lineptr *lines, int count)
+{
+    int         w,
+                chlen = 0;
+    int         linewidth = 0;
+    unsigned char *ptr = lines->ptr;    /* Pointer to data area */
+
+    for (; *pwcs && len > 0; pwcs += chlen)
+    {
+        chlen = PQmblen((const char *) pwcs, encoding);
+        if (len < (size_t) chlen)
+            break;
+        w = PQdsplen((const char *) pwcs, encoding);
+
+        if (chlen == 1)         /* single-byte char */
+        {
+            if (*pwcs == '\n')  /* Newline */
+            {
+                *ptr++ = '\0';
+                lines->width = linewidth;
+                linewidth = 0;
+                lines++;
+                count--;
+                if (count <= 0)
+                    exit(1);    /* Screwup */
+
+                /* make next line point to remaining memory */
+                lines->ptr = ptr;
+            }
+            else if (*pwcs == '\r') /* Carriage return */
+            {
+                strcpy((char *) ptr, "\\r");
+                linewidth += 2;
+                ptr += 2;
+            }
+            else if (*pwcs == '\t') /* Tab */
+            {
+                /* pad with spaces up to the next multiple of 8 columns */
+                do
+                {
+                    *ptr++ = ' ';
+                    linewidth++;
+                } while (linewidth % 8 != 0);
+            }
+            else if (w < 0)     /* Other control char */
+            {
+                sprintf((char *) ptr, "\\x%02X", *pwcs);
+                linewidth += 4;
+                ptr += 4;
+            }
+            else                /* Output it as-is */
+            {
+                linewidth += w;
+                *ptr++ = *pwcs;
+            }
+        }
+        else if (w < 0)         /* Non-ascii control char */
+        {
+            pg_wchar    ucs = 0xffffffff;
+
+            if (encoding == PG_UTF8)
+                ucs = utf8_to_unicode(pwcs);
+
+            /*
+             * Only six bytes are budgeted for this escape, here and in
+             * pg_wcssize.  %04X is a minimum width, so a value needing
+             * more than four hex digits would make sprintf write past
+             * that budget; show such values as unknown instead.
+             *
+             * The non-UTF-8 case cannot happen in the current code because
+             * only UTF-8 signals multibyte control characters.  But we may
+             * need to support it at some stage.
+             */
+            if (ucs <= 0xffff)
+                sprintf((char *) ptr, "\\u%04X", ucs);
+            else
+                sprintf((char *) ptr, "\\u????");
+            ptr += 6;
+            linewidth += 6;
+        }
+        else                    /* All other chars */
+        {
+            int         i;
+
+            for (i = 0; i < chlen; i++)
+                *ptr++ = pwcs[i];
+            linewidth += w;
+        }
+        len -= chlen;
+    }
+    lines->width = linewidth;
+    *ptr++ = '\0';              /* Terminate formatted string */
+
+    if (count <= 0)
+        exit(1);                /* Screwup */
+
+    (lines + 1)->ptr = NULL;    /* terminate line array */
+}
+
+
+/*
+ * Encoding validation: delete any unvalidatable characters from the
+ * string, in place, and return the same pointer that was passed in.
+ *
+ * Only UTF-8 has a validation routine so far; strings in any other
+ * encoding are returned untouched.  Other encodings needing validation
+ * should add their own routines here.
+ *
+ * This seems redundant with existing functionality elsewhere?
+ */
+unsigned char *
+mbvalidate(unsigned char *pwcs, int encoding)
+{
+    if (encoding != PG_UTF8)
+        return pwcs;
+
+    mb_utf_validate(pwcs);
+    return pwcs;
+}