diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/fe_utils/mbprint.c | |
parent | Initial commit. (diff) | |
download | postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip |
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/fe_utils/mbprint.c')
-rw-r--r-- | src/fe_utils/mbprint.c | 405 |
1 files changed, 405 insertions, 0 deletions
diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c new file mode 100644 index 0000000..fe3faba --- /dev/null +++ b/src/fe_utils/mbprint.c @@ -0,0 +1,405 @@ +/*------------------------------------------------------------------------- + * + * Multibyte character printing support for frontend code + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/fe_utils/mbprint.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include "fe_utils/mbprint.h" + +#include "libpq-fe.h" + + +/* + * To avoid version-skew problems, this file must not use declarations + * from pg_wchar.h: the encoding IDs we are dealing with are determined + * by the libpq.so we are linked with, and that might not match the + * numbers we see at compile time. (If this file were inside libpq, + * the problem would go away...) + * + * Hence, we have our own definition of pg_wchar, and we get the values + * of any needed encoding IDs on-the-fly. + */ + +typedef unsigned int pg_wchar; + +static int +pg_get_utf8_id(void) +{ + static int utf8_id = -1; + + if (utf8_id < 0) + utf8_id = pg_char_to_encoding("utf8"); + return utf8_id; +} + +#define PG_UTF8 pg_get_utf8_id() + + +/* + * Convert a UTF-8 character to a Unicode code point. + * This is a one-character version of pg_utf2wchar_with_len. + * + * No error checks here, c must point to a long-enough string. + */ +static pg_wchar +utf8_to_unicode(const unsigned char *c) +{ + if ((*c & 0x80) == 0) + return (pg_wchar) c[0]; + else if ((*c & 0xe0) == 0xc0) + return (pg_wchar) (((c[0] & 0x1f) << 6) | + (c[1] & 0x3f)); + else if ((*c & 0xf0) == 0xe0) + return (pg_wchar) (((c[0] & 0x0f) << 12) | + ((c[1] & 0x3f) << 6) | + (c[2] & 0x3f)); + else if ((*c & 0xf8) == 0xf0) + return (pg_wchar) (((c[0] & 0x07) << 18) | + ((c[1] & 0x3f) << 12) | + ((c[2] & 0x3f) << 6) | + (c[3] & 0x3f)); + else + /* that is an invalid code on purpose */ + return 0xffffffff; +} + + +/* + * Unicode 3.1 compliant validation : for each category, it checks the + * combination of each byte to make sure it maps to a valid range. It also + * returns -1 for the following UCS values: ucs > 0x10ffff ucs & 0xfffe = + * 0xfffe 0xfdd0 < ucs < 0xfdef ucs & 0xdb00 = 0xd800 (surrogates) + */ +static int +utf_charcheck(const unsigned char *c) +{ + if ((*c & 0x80) == 0) + return 1; + else if ((*c & 0xe0) == 0xc0) + { + /* two-byte char */ + if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01)) + return 2; + return -1; + } + else if ((*c & 0xf0) == 0xe0) + { + /* three-byte char */ + if (((c[1] & 0xc0) == 0x80) && + (((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) && + ((c[2] & 0xc0) == 0x80)) + { + int z = c[0] & 0x0f; + int yx = ((c[1] & 0x3f) << 6) | (c[0] & 0x3f); + int lx = yx & 0x7f; + + /* check 0xfffe/0xffff, 0xfdd0..0xfedf range, surrogates */ + if (((z == 0x0f) && + (((yx & 0xffe) == 0xffe) || + (((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) || + ((z == 0x0d) && ((yx & 0xb00) == 0x800))) + return -1; + return 3; + } + return -1; + } + else if ((*c & 0xf8) == 0xf0) + { + int u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4); + + /* four-byte char */ + if (((c[1] & 0xc0) == 0x80) && + (u > 0x00) && (u <= 0x10) && + ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80)) + { + /* test for 0xzzzzfffe/0xzzzzfffff */ + if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) && + ((c[3] & 0x3e) == 0x3e)) + return -1; + return 4; + } + return -1; + } + return -1; +} + + +static void +mb_utf_validate(unsigned char *pwcs) +{ + unsigned char *p = pwcs; + + while (*pwcs) + { + int len; + + if ((len = utf_charcheck(pwcs)) > 0) + { + if (p != pwcs) + { + int i; + + for (i = 0; i < len; i++) + *p++ = *pwcs++; + } + else + { + pwcs += len; + p += len; + } + } + else + /* we skip the char */ + pwcs++; + } + if (p != pwcs) + *p = '\0'; +} + +/* + * public functions : wcswidth and mbvalidate + */ + +/* + * pg_wcswidth is the dumb display-width function. + * It assumes that everything will appear on one line. + * OTOH it is easier to use than pg_wcssize if this applies to you. + */ +int +pg_wcswidth(const char *pwcs, size_t len, int encoding) +{ + int width = 0; + + while (len > 0) + { + int chlen, + chwidth; + + chlen = PQmblen(pwcs, encoding); + if (len < (size_t) chlen) + break; /* Invalid string */ + + chwidth = PQdsplen(pwcs, encoding); + if (chwidth > 0) + width += chwidth; + + pwcs += chlen; + len -= chlen; + } + return width; +} + +/* + * pg_wcssize takes the given string in the given encoding and returns three + * values: + * result_width: Width in display characters of the longest line in string + * result_height: Number of lines in display output + * result_format_size: Number of bytes required to store formatted + * representation of string + * + * This MUST be kept in sync with pg_wcsformat! + */ +void +pg_wcssize(const unsigned char *pwcs, size_t len, int encoding, + int *result_width, int *result_height, int *result_format_size) +{ + int w, + chlen = 0, + linewidth = 0; + int width = 0; + int height = 1; + int format_size = 0; + + for (; *pwcs && len > 0; pwcs += chlen) + { + chlen = PQmblen((const char *) pwcs, encoding); + if (len < (size_t) chlen) + break; + w = PQdsplen((const char *) pwcs, encoding); + + if (chlen == 1) /* single-byte char */ + { + if (*pwcs == '\n') /* Newline */ + { + if (linewidth > width) + width = linewidth; + linewidth = 0; + height += 1; + format_size += 1; /* For NUL char */ + } + else if (*pwcs == '\r') /* Linefeed */ + { + linewidth += 2; + format_size += 2; + } + else if (*pwcs == '\t') /* Tab */ + { + do + { + linewidth++; + format_size++; + } while (linewidth % 8 != 0); + } + else if (w < 0) /* Other control char */ + { + linewidth += 4; + format_size += 4; + } + else /* Output it as-is */ + { + linewidth += w; + format_size += 1; + } + } + else if (w < 0) /* Non-ascii control char */ + { + linewidth += 6; /* \u0000 */ + format_size += 6; + } + else /* All other chars */ + { + linewidth += w; + format_size += chlen; + } + len -= chlen; + } + if (linewidth > width) + width = linewidth; + format_size += 1; /* For NUL char */ + + /* Set results */ + if (result_width) + *result_width = width; + if (result_height) + *result_height = height; + if (result_format_size) + *result_format_size = format_size; +} + +/* + * Format a string into one or more "struct lineptr" lines. + * lines[i].ptr == NULL indicates the end of the array. + * + * This MUST be kept in sync with pg_wcssize! + */ +void +pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, + struct lineptr *lines, int count) +{ + int w, + chlen = 0; + int linewidth = 0; + unsigned char *ptr = lines->ptr; /* Pointer to data area */ + + for (; *pwcs && len > 0; pwcs += chlen) + { + chlen = PQmblen((const char *) pwcs, encoding); + if (len < (size_t) chlen) + break; + w = PQdsplen((const char *) pwcs, encoding); + + if (chlen == 1) /* single-byte char */ + { + if (*pwcs == '\n') /* Newline */ + { + *ptr++ = '\0'; + lines->width = linewidth; + linewidth = 0; + lines++; + count--; + if (count <= 0) + exit(1); /* Screwup */ + + /* make next line point to remaining memory */ + lines->ptr = ptr; + } + else if (*pwcs == '\r') /* Linefeed */ + { + strcpy((char *) ptr, "\\r"); + linewidth += 2; + ptr += 2; + } + else if (*pwcs == '\t') /* Tab */ + { + do + { + *ptr++ = ' '; + linewidth++; + } while (linewidth % 8 != 0); + } + else if (w < 0) /* Other control char */ + { + sprintf((char *) ptr, "\\x%02X", *pwcs); + linewidth += 4; + ptr += 4; + } + else /* Output it as-is */ + { + linewidth += w; + *ptr++ = *pwcs; + } + } + else if (w < 0) /* Non-ascii control char */ + { + if (encoding == PG_UTF8) + sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs)); + else + { + /* + * This case cannot happen in the current code because only + * UTF-8 signals multibyte control characters. But we may need + * to support it at some stage + */ + sprintf((char *) ptr, "\\u????"); + } + ptr += 6; + linewidth += 6; + } + else /* All other chars */ + { + int i; + + for (i = 0; i < chlen; i++) + *ptr++ = pwcs[i]; + linewidth += w; + } + len -= chlen; + } + lines->width = linewidth; + *ptr++ = '\0'; /* Terminate formatted string */ + + if (count <= 0) + exit(1); /* Screwup */ + + (lines + 1)->ptr = NULL; /* terminate line array */ +} + + +/* + * Encoding validation: delete any unvalidatable characters from the string + * + * This seems redundant with existing functionality elsewhere? + */ +unsigned char * +mbvalidate(unsigned char *pwcs, int encoding) +{ + if (encoding == PG_UTF8) + mb_utf_validate(pwcs); + else + { + /* + * other encodings needing validation should add their own routines + * here + */ + } + + return pwcs; +} |