summary refs log tree commit diff stats
path: root/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- utf8.c 239
1 files changed, 224 insertions, 15 deletions
diff --git a/utf8.c b/utf8.c
index df75a76..5053e45 100644
--- a/utf8.c
+++ b/utf8.c
@@ -26,6 +26,171 @@
#include "tmux.h"
+static const wchar_t utf8_force_wide[] = { /* Always width 2 in utf8_width(). */
+ 0x0261D, /* Keep entries ascending: the table is searched with bsearch(). */
+ 0x026F9,
+ 0x0270A,
+ 0x0270B,
+ 0x0270C,
+ 0x0270D,
+ 0x1F1E6,
+ 0x1F1E7,
+ 0x1F1E8,
+ 0x1F1E9,
+ 0x1F1EA,
+ 0x1F1EB,
+ 0x1F1EC,
+ 0x1F1ED,
+ 0x1F1EE,
+ 0x1F1EF,
+ 0x1F1F0,
+ 0x1F1F1,
+ 0x1F1F2,
+ 0x1F1F3,
+ 0x1F1F4,
+ 0x1F1F5,
+ 0x1F1F6,
+ 0x1F1F7,
+ 0x1F1F8,
+ 0x1F1F9,
+ 0x1F1FA,
+ 0x1F1FB,
+ 0x1F1FC,
+ 0x1F1FD,
+ 0x1F1FE,
+ 0x1F1FF,
+ 0x1F385,
+ 0x1F3C2,
+ 0x1F3C3,
+ 0x1F3C4,
+ 0x1F3C7,
+ 0x1F3CA,
+ 0x1F3CB,
+ 0x1F3CC,
+ 0x1F3FB,
+ 0x1F3FC,
+ 0x1F3FD,
+ 0x1F3FE,
+ 0x1F3FF,
+ 0x1F442,
+ 0x1F443,
+ 0x1F446,
+ 0x1F447,
+ 0x1F448,
+ 0x1F449,
+ 0x1F44A,
+ 0x1F44B,
+ 0x1F44C,
+ 0x1F44D,
+ 0x1F44E,
+ 0x1F44F,
+ 0x1F450,
+ 0x1F466,
+ 0x1F467,
+ 0x1F468,
+ 0x1F469,
+ 0x1F46B,
+ 0x1F46C,
+ 0x1F46D,
+ 0x1F46E,
+ 0x1F470,
+ 0x1F471,
+ 0x1F472,
+ 0x1F473,
+ 0x1F474,
+ 0x1F475,
+ 0x1F476,
+ 0x1F477,
+ 0x1F478,
+ 0x1F47C,
+ 0x1F481,
+ 0x1F482,
+ 0x1F483,
+ 0x1F485,
+ 0x1F486,
+ 0x1F487,
+ 0x1F48F,
+ 0x1F491,
+ 0x1F4AA,
+ 0x1F574,
+ 0x1F575,
+ 0x1F57A,
+ 0x1F590,
+ 0x1F595,
+ 0x1F596,
+ 0x1F645,
+ 0x1F646,
+ 0x1F647,
+ 0x1F64B,
+ 0x1F64C,
+ 0x1F64D,
+ 0x1F64E,
+ 0x1F64F,
+ 0x1F6A3,
+ 0x1F6B4,
+ 0x1F6B5,
+ 0x1F6B6,
+ 0x1F6C0,
+ 0x1F6CC,
+ 0x1F90C,
+ 0x1F90F,
+ 0x1F918,
+ 0x1F919,
+ 0x1F91A,
+ 0x1F91B,
+ 0x1F91C,
+ 0x1F91D,
+ 0x1F91E,
+ 0x1F91F,
+ 0x1F926,
+ 0x1F930,
+ 0x1F931,
+ 0x1F932,
+ 0x1F933,
+ 0x1F934,
+ 0x1F935,
+ 0x1F936,
+ 0x1F937,
+ 0x1F938,
+ 0x1F939,
+ 0x1F93D,
+ 0x1F93E,
+ 0x1F977,
+ 0x1F9B5,
+ 0x1F9B6,
+ 0x1F9B8,
+ 0x1F9B9,
+ 0x1F9BB,
+ 0x1F9CD,
+ 0x1F9CE,
+ 0x1F9CF,
+ 0x1F9D1,
+ 0x1F9D2,
+ 0x1F9D3,
+ 0x1F9D4,
+ 0x1F9D5,
+ 0x1F9D6,
+ 0x1F9D7,
+ 0x1F9D8,
+ 0x1F9D9,
+ 0x1F9DA,
+ 0x1F9DB,
+ 0x1F9DC,
+ 0x1F9DD,
+ 0x1FAC3,
+ 0x1FAC4,
+ 0x1FAC5,
+ 0x1FAF0,
+ 0x1FAF1,
+ 0x1FAF2,
+ 0x1FAF3,
+ 0x1FAF4,
+ 0x1FAF5,
+ 0x1FAF6,
+ 0x1FAF7,
+ 0x1FAF8
+};
+
struct utf8_item {
RB_ENTRY(utf8_item) index_entry;
u_int index;
@@ -71,7 +236,7 @@ static u_int utf8_next_index;
/* Get a UTF-8 item from data. */
static struct utf8_item *
-utf8_item_by_data(const char *data, size_t size)
+utf8_item_by_data(const u_char *data, size_t size)
{
struct utf8_item ui;
@@ -94,7 +259,7 @@ utf8_item_by_index(u_int index)
/* Add a UTF-8 item. */
static int
-utf8_put_item(const char *data, size_t size, u_int *index)
+utf8_put_item(const u_char *data, size_t size, u_int *index)
{
struct utf8_item *ui;
@@ -122,6 +287,28 @@ utf8_put_item(const char *data, size_t size, u_int *index)
return (0);
}
+static int /* Comparison callback given to bsearch() by utf8_in_table(). */
+utf8_table_cmp(const void *vp1, const void *vp2)
+{
+ const wchar_t *wc1 = vp1, *wc2 = vp2; /* both arguments point at wchar_t */
+
+ if (*wc1 < *wc2)
+ return (-1); /* first value is smaller */
+ if (*wc1 > *wc2)
+ return (1); /* first value is larger */
+ return (0); /* values are equal */
+}
+
+/* Check if character in table; table must be sorted ascending for bsearch(). */
+int
+utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
+{
+ wchar_t *found; /* match returned by bsearch(), or NULL */
+
+ found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
+ return (found != NULL); /* 1 if find is in table, 0 otherwise */
+}
+
/* Get UTF-8 character from data. */
enum utf8_state
utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
@@ -135,8 +322,8 @@ utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
goto fail;
if (ud->size <= 3) {
index = (((utf8_char)ud->data[2] << 16)|
- ((utf8_char)ud->data[1] << 8)|
- ((utf8_char)ud->data[0]));
+ ((utf8_char)ud->data[1] << 8)|
+ ((utf8_char)ud->data[0]));
} else if (utf8_put_item(ud->data, ud->size, &index) != 0)
goto fail;
*uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
@@ -216,10 +403,39 @@ utf8_width(struct utf8_data *ud, int *width)
{
wchar_t wc;
+ if (utf8_towc(ud, &wc) != UTF8_DONE)
+ return (UTF8_ERROR);
+ if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
+ *width = 2;
+ return (UTF8_DONE);
+ }
+#ifdef HAVE_UTF8PROC
+ *width = utf8proc_wcwidth(wc);
+ log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
+#else
+ *width = wcwidth(wc);
+ log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
+ if (*width < 0) {
+ /*
+ * C1 control characters are nonprintable, so they are always
+ * zero width.
+ */
+ *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
+ }
+#endif
+ if (*width >= 0 && *width <= 0xff)
+ return (UTF8_DONE);
+ return (UTF8_ERROR);
+}
+
+/* Convert UTF-8 character to wide character. */
+enum utf8_state
+utf8_towc(const struct utf8_data *ud, wchar_t *wc)
+{
#ifdef HAVE_UTF8PROC
- switch (utf8proc_mbtowc(&wc, ud->data, ud->size)) {
+ switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
#else
- switch (mbtowc(&wc, ud->data, ud->size)) {
+ switch (mbtowc(wc, ud->data, ud->size)) {
#endif
case -1:
log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
@@ -229,15 +445,8 @@ utf8_width(struct utf8_data *ud, int *width)
case 0:
return (UTF8_ERROR);
}
-#ifdef HAVE_UTF8PROC
- *width = utf8proc_wcwidth(wc);
-#else
- *width = wcwidth(wc);
-#endif
- if (*width >= 0 && *width <= 0xff)
- return (UTF8_DONE);
- log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data, *width);
- return (UTF8_ERROR);
+ log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
+ return (UTF8_DONE);
}
/*