summaryrefslogtreecommitdiffstats
path: root/src/util/valid_utf8_string.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/util/valid_utf8_string.c')
-rw-r--r--src/util/valid_utf8_string.c139
1 files changed, 139 insertions, 0 deletions
diff --git a/src/util/valid_utf8_string.c b/src/util/valid_utf8_string.c
new file mode 100644
index 0000000..96b5b4d
--- /dev/null
+++ b/src/util/valid_utf8_string.c
@@ -0,0 +1,139 @@
+/*++
+/* NAME
+/* valid_utf8_string 3
+/* SUMMARY
+/* predicate if string is valid UTF-8
+/* SYNOPSIS
+/* #include <stringops.h>
+/*
+/* int valid_utf8_string(str, len)
+/* const char *str;
+/* ssize_t len;
+/* DESCRIPTION
+/* valid_utf8_string() determines if a string satisfies the UTF-8
+/* definition in RFC 3629. That is, it contains proper encodings
+/* of code points U+0000..U+10FFFF, excluding over-long encodings
+/* and excluding U+D800..U+DFFF surrogates.
+/*
+/* A zero-length string is considered valid.
+/* DIAGNOSTICS
+/* The result value is zero when the caller specifies a negative
+/* length, or a string that violates RFC 3629, for example a
+/* string that is truncated in the middle of a multi-byte
+/* sequence.
+/* BUGS
+/* But wait, there is more. Code points in the range U+FDD0..U+FDEF
+/* and ending in FFFE or FFFF are non-characters in UNICODE. This
+/* function does not block these.
+/* SEE ALSO
+/* RFC 3629
+/* LICENSE
+/* .ad
+/* .fi
+/* The Secure Mailer license must be distributed with this software.
+/* AUTHOR(S)
+/* Wietse Venema
+/* IBM T.J. Watson Research
+/* P.O. Box 704
+/* Yorktown Heights, NY 10598, USA
+/*--*/
+
+/* System library. */
+
+#include <sys_defs.h>
+
+/* Utility library. */
+
+#include <stringops.h>
+
+/* valid_utf8_string - validate string according to RFC 3629 */
+
+int valid_utf8_string(const char *str, ssize_t len)
+{
+ const unsigned char *end = (const unsigned char *) str + len;
+ const unsigned char *cp;
+ unsigned char c0, ch;
+
+ if (len < 0)
+ return (0);
+ if (len <= 0)
+ return (1);
+
+ /*
+ * Optimized for correct input, time, space, and for CPUs that have a
+ * decent number of registers.
+ */
+ for (cp = (const unsigned char *) str; cp < end; cp++) {
+ /* Single-byte encodings. */
+ if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
+ /* void */ ;
+ }
+ /* Two-byte encodings. */
+ else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
+ /* Exclude over-long encodings. */
+ if (UNEXPECTED(c0 < 0xc2)
+ || UNEXPECTED(cp + 1 >= end)
+ /* Require UTF-8 tail byte. */
+ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+ return (0);
+ }
+ /* Three-byte encodings. */
+ else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
+ if (UNEXPECTED(cp + 2 >= end)
+ /* Exclude over-long encodings. */
+ || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
+ /* Exclude U+D800..U+DFFF. */
+ || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
+ /* Require UTF-8 tail byte. */
+ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+ return (0);
+ }
+ /* Four-byte encodings. */
+ else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
+ if (UNEXPECTED(cp + 3 >= end)
+ /* Exclude over-long encodings. */
+ || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
+ /* Exclude code points above U+10FFFF. */
+ || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
+ /* Require UTF-8 tail byte. */
+ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)
+ /* Require UTF-8 tail byte. */
+ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+ return (0);
+ }
+ /* Invalid: c0 >= 0xf5 */
+ else {
+ return (0);
+ }
+ }
+ return (1);
+}
+
+ /*
+ * Stand-alone test program. Each string is a line without line terminator.
+ */
+#ifdef TEST
+#include <stdlib.h>
+#include <vstream.h>
+#include <vstring.h>
+#include <vstring_vstream.h>
+
+#define STR(x) vstring_str(x)
+#define LEN(x) VSTRING_LEN(x)
+
+int main(void)
+{
+ VSTRING *buf = vstring_alloc(1);
+
+ while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) {
+ vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ?
+ '!' : ' ');
+ vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf));
+ vstream_printf("\n");
+ }
+ vstream_fflush(VSTREAM_OUT);
+ vstring_free(buf);
+ exit(0);
+}
+
+#endif