summaryrefslogtreecommitdiffstats
path: root/lib/uniwidth/width.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/uniwidth/width.c')
-rw-r--r--lib/uniwidth/width.c95
1 files changed, 95 insertions, 0 deletions
diff --git a/lib/uniwidth/width.c b/lib/uniwidth/width.c
new file mode 100644
index 0000000..c9d4d1d
--- /dev/null
+++ b/lib/uniwidth/width.c
@@ -0,0 +1,95 @@
+/* Determine display width of Unicode character.
+ Copyright (C) 2001-2002, 2006-2023 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2002.
+
+ This file is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation; either version 2.1 of the
+ License, or (at your option) any later version.
+
+ This file is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+/* Specification. */
+#include "uniwidth.h"
+
+#include "cjk.h"
+
+/* The non-spacing attribute table consists of:
+ * Non-spacing characters; generated from PropList.txt or
+ "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
+ * Format control characters; generated from
+ "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
+ * Zero width characters; generated from
+ "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
+ * Hangul Jamo characters that have conjoining behaviour:
+ - jungseong = syllable-middle vowels
+ - jongseong = syllable-final consonants
+ Rationale:
+ 1) These characters act like combining characters. They have no
+ equivalent in legacy character sets. Therefore the EastAsianWidth.txt
+ file does not really matter for them; UAX #11 East Asian Width
+ <https://www.unicode.org/reports/tr11/> makes it clear that it focus
+ is on compatibility with traditional Japanese layout.
+ By contrast, the same glyphs without conjoining behaviour are available
+ in the U+3130..U+318F block, and these characters are mapped to legacy
+ character sets, and traditional Japanese layout matters for them.
+ 2) glibc does the same thing, see
+ <https://sourceware.org/bugzilla/show_bug.cgi?id=21750>
+ <https://sourceware.org/bugzilla/show_bug.cgi?id=26120>
+ */
+#include "uniwidth/width0.h"
+
+#include "uniwidth/width2.h"
+#include "unictype/bitmap.h"
+
+#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
+
+
+/* Determine number of column positions required for UC. */
+int
+uc_width (ucs4_t uc, const char *encoding)
+{
+ /* Test for non-spacing or control character. */
+ if ((uc >> 9) < SIZEOF (nonspacing_table_ind))
+ {
+ int ind = nonspacing_table_ind[uc >> 9];
+ if (ind >= 0)
+ if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
+ {
+ if (uc > 0 && uc < 0xa0)
+ return -1;
+ else
+ return 0;
+ }
+ }
+ else if ((uc >> 9) == (0xe0000 >> 9))
+ {
+ if (uc >= 0xe0100)
+ {
+ if (uc <= 0xe01ef)
+ return 0;
+ }
+ else
+ {
+ if (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
+ return 0;
+ }
+ }
+ /* Test for double-width character. */
+ if (bitmap_lookup (&u_width2, uc))
+ return 2;
+ /* In ancient CJK encodings, Cyrillic and most other characters are
+ double-width as well. */
+ if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
+ && is_cjk_encoding (encoding))
+ return 2;
+ return 1;
+}