summaryrefslogtreecommitdiffstats
path: root/lib/sh/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sh/utf8.c')
-rw-r--r--lib/sh/utf8.c196
1 files changed, 196 insertions, 0 deletions
diff --git a/lib/sh/utf8.c b/lib/sh/utf8.c
new file mode 100644
index 0000000..fed2522
--- /dev/null
+++ b/lib/sh/utf8.c
@@ -0,0 +1,196 @@
+/* utf8.c - UTF-8 character handling functions */
+
+/* Copyright (C) 2018 Free Software Foundation, Inc.
+
+ This file is part of GNU Bash, the Bourne Again SHell.
+
+ Bash is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Bash is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Bash. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#ifdef HAVE_STDLIB_H
+# include <stdlib.h>
+#endif
+
+#include "bashansi.h"
+#include "shmbutil.h"
+
+extern int locale_mb_cur_max;
+extern int locale_utf8locale;
+
+#if defined (HANDLE_MULTIBYTE)
+
+char *
+utf8_mbschr (s, c)
+ const char *s;
+ int c;
+{
+ return strchr (s, c); /* for now */
+}
+
+int
+utf8_mbscmp (s1, s2)
+ const char *s1, *s2;
+{
+ /* Use the fact that the UTF-8 encoding preserves lexicographic order. */
+ return strcmp (s1, s2);
+}
+
+char *
+utf8_mbsmbchar (str)
+ const char *str;
+{
+ register char *s;
+
+ for (s = (char *)str; *s; s++)
+ if ((*s & 0xc0) == 0x80)
+ return s;
+ return (0);
+}
+
+int
+utf8_mbsnlen(src, srclen, maxlen)
+ const char *src;
+ size_t srclen;
+ int maxlen;
+{
+ register int sind, count;
+
+ for (sind = count = 0; src[sind] && sind <= maxlen; sind++)
+ {
+ if ((src[sind] & 0xc0) != 0x80)
+ count++;
+ }
+ return (count);
+}
+
+/* Adapted from GNU gnulib. Handles UTF-8 characters up to 4 bytes long */
+int
+utf8_mblen (s, n)
+ const char *s;
+ size_t n;
+{
+ unsigned char c, c1, c2, c3;
+
+ if (s == 0)
+ return (0); /* no shift states */
+ if (n <= 0)
+ return (-1);
+
+ c = (unsigned char)*s;
+ if (c < 0x80)
+ return (c != 0);
+ if (c >= 0xc2)
+ {
+ c1 = (unsigned char)s[1];
+ if (c < 0xe0)
+ {
+ if (n == 1)
+ return -2;
+
+ /*
+ * c c1
+ *
+ * U+0080..U+07FF C2..DF 80..BF
+ */
+
+ if (n >= 2 && (c1 ^ 0x80) < 0x40) /* 0x80..0xbf */
+ return 2;
+ }
+ else if (c < 0xf0)
+ {
+ if (n == 1)
+ return -2;
+
+ /*
+ * c c1 c2
+ *
+ * U+0800..U+0FFF E0 A0..BF 80..BF
+ * U+1000..U+CFFF E1..EC 80..BF 80..BF
+ * U+D000..U+D7FF ED 80..9F 80..BF
+ * U+E000..U+FFFF EE..EF 80..BF 80..BF
+ */
+
+ if ((c1 ^ 0x80) < 0x40
+ && (c >= 0xe1 || c1 >= 0xa0)
+ && (c != 0xed || c1 < 0xa0))
+ {
+ if (n == 2)
+ return -2; /* incomplete */
+
+ c2 = (unsigned char)s[2];
+ if ((c2 ^ 0x80) < 0x40)
+ return 3;
+ }
+ }
+ else if (c <= 0xf4)
+ {
+ if (n == 1)
+ return -2;
+
+ /*
+ * c c1 c2 c3
+ *
+ * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
+ * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
+ * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+ */
+ if (((c1 ^ 0x80) < 0x40)
+ && (c >= 0xf1 || c1 >= 0x90)
+ && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
+ {
+ if (n == 2)
+ return -2; /* incomplete */
+
+ c2 = (unsigned char)s[2];
+ if ((c2 ^ 0x80) < 0x40)
+ {
+ if (n == 3)
+ return -2;
+
+ c3 = (unsigned char)s[3];
+ if ((c3 ^ 0x80) < 0x40)
+ return 4;
+ }
+ }
+ }
+ }
+ /* invalid or incomplete multibyte character */
+ return -1;
+}
+
+/* We can optimize this if we know the locale is UTF-8, but needs to handle
+ malformed byte sequences. */
+size_t
+utf8_mbstrlen(s)
+ const char *s;
+{
+ size_t clen, nc;
+ int mb_cur_max;
+
+ nc = 0;
+ mb_cur_max = MB_CUR_MAX;
+ while (*s && (clen = (size_t)utf8_mblen(s, mb_cur_max)) != 0)
+ {
+ if (MB_INVALIDCH(clen))
+ clen = 1; /* assume single byte */
+
+ s += clen;
+ nc++;
+ }
+ return nc;
+}
+
+#endif