From c294c2bf54a3ec56271a78c284f84e75a8119731 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 05:34:56 +0200 Subject: Merging upstream version 3.4. Signed-off-by: Daniel Baumann --- utf8.c | 239 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 224 insertions(+), 15 deletions(-) (limited to 'utf8.c') diff --git a/utf8.c b/utf8.c index df75a76..5053e45 100644 --- a/utf8.c +++ b/utf8.c @@ -26,6 +26,171 @@ #include "tmux.h" +static const wchar_t utf8_force_wide[] = { + 0x0261D, + 0x026F9, + 0x0270A, + 0x0270B, + 0x0270C, + 0x0270D, + 0x1F1E6, + 0x1F1E7, + 0x1F1E8, + 0x1F1E9, + 0x1F1EA, + 0x1F1EB, + 0x1F1EC, + 0x1F1ED, + 0x1F1EE, + 0x1F1EF, + 0x1F1F0, + 0x1F1F1, + 0x1F1F2, + 0x1F1F3, + 0x1F1F4, + 0x1F1F5, + 0x1F1F6, + 0x1F1F7, + 0x1F1F8, + 0x1F1F9, + 0x1F1FA, + 0x1F1FB, + 0x1F1FC, + 0x1F1FD, + 0x1F1FE, + 0x1F1FF, + 0x1F385, + 0x1F3C2, + 0x1F3C3, + 0x1F3C4, + 0x1F3C7, + 0x1F3CA, + 0x1F3CB, + 0x1F3CC, + 0x1F3FB, + 0x1F3FC, + 0x1F3FD, + 0x1F3FE, + 0x1F3FF, + 0x1F442, + 0x1F443, + 0x1F446, + 0x1F447, + 0x1F448, + 0x1F449, + 0x1F44A, + 0x1F44B, + 0x1F44C, + 0x1F44D, + 0x1F44E, + 0x1F44F, + 0x1F450, + 0x1F466, + 0x1F467, + 0x1F468, + 0x1F469, + 0x1F46B, + 0x1F46C, + 0x1F46D, + 0x1F46E, + 0x1F470, + 0x1F471, + 0x1F472, + 0x1F473, + 0x1F474, + 0x1F475, + 0x1F476, + 0x1F477, + 0x1F478, + 0x1F47C, + 0x1F481, + 0x1F482, + 0x1F483, + 0x1F485, + 0x1F486, + 0x1F487, + 0x1F48F, + 0x1F491, + 0x1F4AA, + 0x1F574, + 0x1F575, + 0x1F57A, + 0x1F590, + 0x1F595, + 0x1F596, + 0x1F645, + 0x1F646, + 0x1F647, + 0x1F64B, + 0x1F64C, + 0x1F64D, + 0x1F64E, + 0x1F64F, + 0x1F6A3, + 0x1F6B4, + 0x1F6B5, + 0x1F6B6, + 0x1F6C0, + 0x1F6CC, + 0x1F90C, + 0x1F90F, + 0x1F918, + 0x1F919, + 0x1F91A, + 0x1F91B, + 0x1F91C, + 0x1F91D, + 0x1F91E, + 0x1F91F, + 0x1F926, + 0x1F930, + 0x1F931, + 0x1F932, + 0x1F933, + 0x1F934, + 0x1F935, + 0x1F936, + 0x1F937, + 0x1F938, + 0x1F939, + 0x1F93D, + 0x1F93E, + 0x1F977, + 0x1F9B5, + 0x1F9B6, + 0x1F9B8, + 0x1F9B9, + 0x1F9BB, + 0x1F9CD, + 0x1F9CE, + 0x1F9CF, + 0x1F9D1, + 0x1F9D2, + 0x1F9D3, + 0x1F9D4, + 0x1F9D5, + 0x1F9D6, + 0x1F9D7, + 0x1F9D8, + 0x1F9D9, + 0x1F9DA, + 0x1F9DB, + 0x1F9DC, + 0x1F9DD, + 0x1FAC3, + 0x1FAC4, + 0x1FAC5, + 0x1FAF0, + 0x1FAF1, + 0x1FAF2, + 0x1FAF3, + 0x1FAF4, + 0x1FAF5, + 0x1FAF6, + 0x1FAF7, + 0x1FAF8 +}; + struct utf8_item { RB_ENTRY(utf8_item) index_entry; u_int index; @@ -71,7 +236,7 @@ static u_int utf8_next_index; /* Get a UTF-8 item from data. */ static struct utf8_item * -utf8_item_by_data(const char *data, size_t size) +utf8_item_by_data(const u_char *data, size_t size) { struct utf8_item ui; @@ -94,7 +259,7 @@ utf8_item_by_index(u_int index) /* Add a UTF-8 item. */ static int -utf8_put_item(const char *data, size_t size, u_int *index) +utf8_put_item(const u_char *data, size_t size, u_int *index) { struct utf8_item *ui; @@ -122,6 +287,28 @@ utf8_put_item(const char *data, size_t size, u_int *index) return (0); } +static int +utf8_table_cmp(const void *vp1, const void *vp2) +{ + const wchar_t *wc1 = vp1, *wc2 = vp2; + + if (*wc1 < *wc2) + return (-1); + if (*wc1 > *wc2) + return (1); + return (0); +} + +/* Check if character in table. */ +int +utf8_in_table(wchar_t find, const wchar_t *table, u_int count) +{ + wchar_t *found; + + found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp); + return (found != NULL); +} + /* Get UTF-8 character from data. */ enum utf8_state utf8_from_data(const struct utf8_data *ud, utf8_char *uc) @@ -135,8 +322,8 @@ utf8_from_data(const struct utf8_data *ud, utf8_char *uc) goto fail; if (ud->size <= 3) { index = (((utf8_char)ud->data[2] << 16)| - ((utf8_char)ud->data[1] << 8)| - ((utf8_char)ud->data[0])); + ((utf8_char)ud->data[1] << 8)| + ((utf8_char)ud->data[0])); } else if (utf8_put_item(ud->data, ud->size, &index) != 0) goto fail; *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; @@ -216,10 +403,39 @@ utf8_width(struct utf8_data *ud, int *width) { wchar_t wc; + if (utf8_towc(ud, &wc) != UTF8_DONE) + return (UTF8_ERROR); + if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) { + *width = 2; + return (UTF8_DONE); + } +#ifdef HAVE_UTF8PROC + *width = utf8proc_wcwidth(wc); + log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width); +#else + *width = wcwidth(wc); + log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width); + if (*width < 0) { + /* + * C1 control characters are nonprintable, so they are always + * zero width. + */ + *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1; + } +#endif + if (*width >= 0 && *width <= 0xff) + return (UTF8_DONE); + return (UTF8_ERROR); +} + +/* Convert UTF-8 character to wide character. */ +enum utf8_state +utf8_towc(const struct utf8_data *ud, wchar_t *wc) +{ #ifdef HAVE_UTF8PROC - switch (utf8proc_mbtowc(&wc, ud->data, ud->size)) { + switch (utf8proc_mbtowc(wc, ud->data, ud->size)) { #else - switch (mbtowc(&wc, ud->data, ud->size)) { + switch (mbtowc(wc, ud->data, ud->size)) { #endif case -1: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, @@ -229,15 +445,8 @@ utf8_width(struct utf8_data *ud, int *width) case 0: return (UTF8_ERROR); } -#ifdef HAVE_UTF8PROC - *width = utf8proc_wcwidth(wc); -#else - *width = wcwidth(wc); -#endif - if (*width >= 0 && *width <= 0xff) - return (UTF8_DONE); - log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data, *width); - return (UTF8_ERROR); + log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc); + return (UTF8_DONE); } /* -- cgit v1.2.3