1 files changed, 139 insertions, 0 deletions
diff --git a/src/util/valid_utf8_string.c b/src/util/valid_utf8_string.c
new file mode 100644
index 0000000..96b5b4d
--- /dev/null
+++ b/src/util/valid_utf8_string.c
@@ -0,0 +1,139 @@
+/*++
+/* NAME
+/*	valid_utf8_string 3
+/* SUMMARY
+/*	test whether a string is valid UTF-8
+/* SYNOPSIS
+/*	#include <stringops.h>
+/*
+/*	int	valid_utf8_string(str, len)
+/*	const char *str;
+/*	ssize_t	len;
+/* DESCRIPTION
+/*	valid_utf8_string() determines if a string satisfies the UTF-8
+/*	definition in RFC 3629. That is, it contains proper encodings
+/*	of code points U+0000..U+10FFFF, excluding over-long encodings
+/*	and excluding U+D800..U+DFFF surrogates.
+/*
+/*	A zero-length string is considered valid.
+/* DIAGNOSTICS
+/*	The result value is zero when the caller specifies a negative
+/*	length, or a string that violates RFC 3629, for example a
+/*	string that is truncated in the middle of a multi-byte
+/*	sequence.
+/* BUGS
+/*	Code points in the range U+FDD0..U+FDEF, and code points whose
+/*	low 16 bits are FFFE or FFFF, are non-characters in Unicode.
+/*	This function does not reject these.
+/* SEE ALSO
+/*	RFC 3629
+/* LICENSE
+/* .ad
+/* .fi
+/*	The Secure Mailer license must be distributed with this software.
+/* AUTHOR(S)
+/*	Wietse Venema
+/*	IBM T.J. Watson Research
+/*	P.O. Box 704
+/*	Yorktown Heights, NY 10598, USA
+/*--*/
+
+/* System library. */
+
+#include <sys_defs.h>
+
+/* Utility library. */
+
+#include <stringops.h>
+
+/* valid_utf8_string - validate string according to RFC 3629 */
+
+int     valid_utf8_string(const char *str, ssize_t len)
+{
+    const unsigned char *end, *cp;
+    unsigned char c0, ch;
+
+    if (len < 0)
+	return (0);
+    if (len == 0)
+	return (1);
+    end = (const unsigned char *) str + len;	/* only now: len > 0 */
+
+    /*
+     * Optimized for correct input, time, space, and for CPUs that have a
+     * decent number of registers.
+     */
+    for (cp = (const unsigned char *) str; cp < end; cp++) {
+	/* Single-byte encodings. */
+	if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
+	     /* void */ ;
+	}
+	/* Two-byte encodings. */
+	else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
+	    /* Exclude stray tail bytes and over-long encodings (c0 < 0xc2). */
+	    if (UNEXPECTED(c0 < 0xc2)
+		|| UNEXPECTED(cp + 1 >= end)	/* truncated sequence */
+	    /* Require UTF-8 tail byte. */
+		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+		return (0);
+	}
+	/* Three-byte encodings. */
+	else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
+	    if (UNEXPECTED(cp + 2 >= end)	/* truncated sequence */
+	    /* Exclude over-long encodings. */
+		|| UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
+	    /* Exclude U+D800..U+DFFF. */
+		|| UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
+	    /* Require UTF-8 tail byte. */
+		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+		return (0);
+	}
+	/* Four-byte encodings. */
+	else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
+	    if (UNEXPECTED(cp + 3 >= end)	/* truncated sequence */
+	    /* Exclude over-long encodings. */
+		|| UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
+	    /* Exclude code points above U+10FFFF. */
+		|| UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
+	    /* Require UTF-8 tail byte. */
+		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)
+	    /* Require UTF-8 tail byte. */
+		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+		return (0);
+	}
+	/* Invalid: c0 >= 0xf5 */
+	else {
+	    return (0);
+	}
+    }
+    return (1);
+}
+
+ /*
+  * Stand-alone test program. Each string is a line without line terminator.
+  */
+#ifdef TEST
+#include <stdlib.h>
+#include <vstream.h>
+#include <vstring.h>
+#include <vstring_vstream.h>
+
+#define STR(x) vstring_str(x)
+#define LEN(x) VSTRING_LEN(x)
+
+int     main(void)
+{
+    VSTRING *line = vstring_alloc(1);	/* reusable input line buffer */
+
+    while (vstring_get_nonl(line, VSTREAM_IN) != VSTREAM_EOF) {
+	int     ok = LEN(line) == 0 || valid_utf8_string(STR(line), LEN(line));
+	vstream_printf("%c", ok ? ' ' : '!');	/* '!' marks an invalid line */
+	vstream_fwrite(VSTREAM_OUT, STR(line), LEN(line));
+	vstream_printf("\n");
+    }
+    vstream_fflush(VSTREAM_OUT);
+    vstring_free(line);
+    exit(0);
+}
+
+#endif