1 files changed, 224 insertions, 15 deletions
diff --git a/utf8.c b/utf8.c
index df75a76..5053e45 100644
--- a/utf8.c
+++ b/utf8.c
@@ -26,6 +26,171 @@
 
 #include "tmux.h"
 
+static const wchar_t utf8_force_wide[] = { /* forced to width 2; keep sorted */
+	0x0261D,
+	0x026F9,
+	0x0270A,
+	0x0270B,
+	0x0270C,
+	0x0270D,
+	0x1F1E6,
+	0x1F1E7,
+	0x1F1E8,
+	0x1F1E9,
+	0x1F1EA,
+	0x1F1EB,
+	0x1F1EC,
+	0x1F1ED,
+	0x1F1EE,
+	0x1F1EF,
+	0x1F1F0,
+	0x1F1F1,
+	0x1F1F2,
+	0x1F1F3,
+	0x1F1F4,
+	0x1F1F5,
+	0x1F1F6,
+	0x1F1F7,
+	0x1F1F8,
+	0x1F1F9,
+	0x1F1FA,
+	0x1F1FB,
+	0x1F1FC,
+	0x1F1FD,
+	0x1F1FE,
+	0x1F1FF,
+	0x1F385,
+	0x1F3C2,
+	0x1F3C3,
+	0x1F3C4,
+	0x1F3C7,
+	0x1F3CA,
+	0x1F3CB,
+	0x1F3CC,
+	0x1F3FB,
+	0x1F3FC,
+	0x1F3FD,
+	0x1F3FE,
+	0x1F3FF,
+	0x1F442,
+	0x1F443,
+	0x1F446,
+	0x1F447,
+	0x1F448,
+	0x1F449,
+	0x1F44A,
+	0x1F44B,
+	0x1F44C,
+	0x1F44D,
+	0x1F44E,
+	0x1F44F,
+	0x1F450,
+	0x1F466,
+	0x1F467,
+	0x1F468,
+	0x1F469,
+	0x1F46B,
+	0x1F46C,
+	0x1F46D,
+	0x1F46E,
+	0x1F470,
+	0x1F471,
+	0x1F472,
+	0x1F473,
+	0x1F474,
+	0x1F475,
+	0x1F476,
+	0x1F477,
+	0x1F478,
+	0x1F47C,
+	0x1F481,
+	0x1F482,
+	0x1F483,
+	0x1F485,
+	0x1F486,
+	0x1F487,
+	0x1F48F,
+	0x1F491,
+	0x1F4AA,
+	0x1F574,
+	0x1F575,
+	0x1F57A,
+	0x1F590,
+	0x1F595,
+	0x1F596,
+	0x1F645,
+	0x1F646,
+	0x1F647,
+	0x1F64B,
+	0x1F64C,
+	0x1F64D,
+	0x1F64E,
+	0x1F64F,
+	0x1F6A3,
+	0x1F6B4,
+	0x1F6B5,
+	0x1F6B6,
+	0x1F6C0,
+	0x1F6CC,
+	0x1F90C,
+	0x1F90F,
+	0x1F918,
+	0x1F919,
+	0x1F91A,
+	0x1F91B,
+	0x1F91C,
+	0x1F91D,
+	0x1F91E,
+	0x1F91F,
+	0x1F926,
+	0x1F930,
+	0x1F931,
+	0x1F932,
+	0x1F933,
+	0x1F934,
+	0x1F935,
+	0x1F936,
+	0x1F937,
+	0x1F938,
+	0x1F939,
+	0x1F93D,
+	0x1F93E,
+	0x1F977,
+	0x1F9B5,
+	0x1F9B6,
+	0x1F9B8,
+	0x1F9B9,
+	0x1F9BB,
+	0x1F9CD,
+	0x1F9CE,
+	0x1F9CF,
+	0x1F9D1,
+	0x1F9D2,
+	0x1F9D3,
+	0x1F9D4,
+	0x1F9D5,
+	0x1F9D6,
+	0x1F9D7,
+	0x1F9D8,
+	0x1F9D9,
+	0x1F9DA,
+	0x1F9DB,
+	0x1F9DC,
+	0x1F9DD,
+	0x1FAC3,
+	0x1FAC4,
+	0x1FAC5,
+	0x1FAF0,
+	0x1FAF1,
+	0x1FAF2,
+	0x1FAF3,
+	0x1FAF4,
+	0x1FAF5,
+	0x1FAF6,
+	0x1FAF7,
+	0x1FAF8
+};
+
 struct utf8_item {
 	RB_ENTRY(utf8_item)	index_entry;
 	u_int			index;
@@ -71,7 +236,7 @@ static u_int utf8_next_index;
 
 /* Get a UTF-8 item from data. */
 static struct utf8_item *
-utf8_item_by_data(const char *data, size_t size)
+utf8_item_by_data(const u_char *data, size_t size)
 {
 	struct utf8_item	ui;
 
@@ -94,7 +259,7 @@ utf8_item_by_index(u_int index)
 
 /* Add a UTF-8 item. */
 static int
-utf8_put_item(const char *data, size_t size, u_int *index)
+utf8_put_item(const u_char *data, size_t size, u_int *index)
 {
 	struct utf8_item	*ui;
 
@@ -122,6 +287,28 @@ utf8_put_item(const char *data, size_t size, u_int *index)
 	return (0);
 }
 
+static int	/* bsearch() comparator used by utf8_in_table(). */
+utf8_table_cmp(const void *vp1, const void *vp2)
+{
+	const wchar_t	*wc1 = vp1, *wc2 = vp2;	/* key, table entry */
+
+	if (*wc1 < *wc2)
+		return (-1);
+	if (*wc1 > *wc2)
+		return (1);
+	return (0);	/* same value */
+}
+
+/* Return 1 if find is one of the count entries of sorted table, else 0. */
+int
+utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
+{
+	wchar_t	*found;
+
+	found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
+	return (found != NULL);	/* NULL means no match */
+}
+
 /* Get UTF-8 character from data. */
 enum utf8_state
 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
@@ -135,8 +322,8 @@ utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 		goto fail;
 	if (ud->size <= 3) {
 		index = (((utf8_char)ud->data[2] << 16)|
-		          ((utf8_char)ud->data[1] << 8)|
-		          ((utf8_char)ud->data[0]));
+			  ((utf8_char)ud->data[1] << 8)|
+			  ((utf8_char)ud->data[0]));
 	} else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 		goto fail;
 	*uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
@@ -216,10 +403,39 @@ utf8_width(struct utf8_data *ud, int *width)
 {
 	wchar_t	wc;
 
+	if (utf8_towc(ud, &wc) != UTF8_DONE)
+		return (UTF8_ERROR);
+	if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
+		*width = 2;	/* listed characters are always width 2 */
+		return (UTF8_DONE);
+	}
+#ifdef HAVE_UTF8PROC
+	*width = utf8proc_wcwidth(wc);
+	log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
+#else
+	*width = wcwidth(wc);
+	log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
+	if (*width < 0) {
+		/*
+		 * wcwidth() gave a negative width. C1 control characters
+		 * (0x80 to 0x9f) are nonprintable so use zero, otherwise one.
+		 */
+		*width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
+	}
+#endif
+	if (*width >= 0 && *width <= 0xff)	/* only 0 to 255 is accepted */
+		return (UTF8_DONE);
+	return (UTF8_ERROR);
+}
+
+/* Convert UTF-8 character to wide character. */
+enum utf8_state
+utf8_towc(const struct utf8_data *ud, wchar_t *wc)
+{
 #ifdef HAVE_UTF8PROC
-	switch (utf8proc_mbtowc(&wc, ud->data, ud->size)) {
+	switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
 #else
-	switch (mbtowc(&wc, ud->data, ud->size)) {
+	switch (mbtowc(wc, ud->data, ud->size)) {
 #endif
 	case -1:
 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
@@ -229,15 +445,8 @@ utf8_width(struct utf8_data *ud, int *width)
 	case 0:
 		return (UTF8_ERROR);
 	}
-#ifdef HAVE_UTF8PROC
-	*width = utf8proc_wcwidth(wc);
-#else
-	*width = wcwidth(wc);
-#endif
-	if (*width >= 0 && *width <= 0xff)
-		return (UTF8_DONE);
-	log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data, *width);
-	return (UTF8_ERROR);
+	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
+	return (UTF8_DONE);
 }
 
 /*