1 files changed, 174 insertions, 73 deletions
diff --git a/src/util/valid_utf8_string.c b/src/util/valid_utf8_string.c
index 96b5b4d..f5b4ff4 100644
--- a/src/util/valid_utf8_string.c
+++ b/src/util/valid_utf8_string.c
@@ -9,24 +9,24 @@
 /*	int	valid_utf8_string(str, len)
 /*	const char *str;
 /*	ssize_t	len;
+/*
+/*	int	valid_utf8_stringz(str)
+/*	const char *str;
+/*	ssize_t	len;
 /* DESCRIPTION
-/*	valid_utf8_string() determines if a string satisfies the UTF-8
-/*	definition in RFC 3629. That is, it contains proper encodings
-/*	of code points U+0000..U+10FFFF, excluding over-long encodings
-/*	and excluding U+D800..U+DFFF surrogates.
+/*	valid_utf8_string() determines if all bytes in a string
+/*	satisfy parse_utf8_char(3h) checks. See there for any
+/*	implementation limitations.
+/*
+/*	valid_utf8_stringz() determines the same for zero-terminated
+/*	strings.
 /*
 /*	A zero-length string is considered valid.
 /* DIAGNOSTICS
 /*	The result value is zero when the caller specifies a negative
-/*	length, or a string that violates RFC 3629, for example a
-/*	string that is truncated in the middle of a multi-byte
-/*	sequence.
-/* BUGS
-/*	But wait, there is more. Code points in the range U+FDD0..U+FDEF
-/*	and ending in FFFE or FFFF are non-characters in UNICODE. This
-/*	function does not block these.
+/*	length, or a string that does not pass parse_utf8_char(3h) checks.
 /* SEE ALSO
-/*	RFC 3629
+/*	parse_utf8_char(3h), parse one UTF-8 multibyte character
 /* LICENSE
 /* .ad
 /* .fi
@@ -36,6 +36,10 @@
 /*	IBM T.J. Watson Research
 /*	P.O. Box 704
 /*	Yorktown Heights, NY 10598, USA
+/*
+/*	Wietse Venema
+/*	porcupine.org
+/*	Amawalk, NY 10501, USA
 /*--*/
 
 /* System library. */
@@ -45,66 +49,50 @@
 /* Utility library. */
 
 #include <stringops.h>
+#include <parse_utf8_char.h>
 
 /* valid_utf8_string - validate string according to RFC 3629 */
 
 int     valid_utf8_string(const char *str, ssize_t len)
 {
-    const unsigned char *end = (const unsigned char *) str + len;
-    const unsigned char *cp;
-    unsigned char c0, ch;
+    const char *ep = str + len;
+    const char *cp;
+    const char *last;
 
     if (len < 0)
 	return (0);
-    if (len <= 0)
+    if (len == 0)
 	return (1);
 
     /*
-     * Optimized for correct input, time, space, and for CPUs that have a
-     * decent number of registers.
+     * Ideally, the compiler will inline parse_utf8_char().
      */
-    for (cp = (const unsigned char *) str; cp < end; cp++) {
-	/* Single-byte encodings. */
-	if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
-	     /* void */ ;
-	}
-	/* Two-byte encodings. */
-	else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
-	    /* Exclude over-long encodings. */
-	    if (UNEXPECTED(c0 < 0xc2)
-		|| UNEXPECTED(cp + 1 >= end)
-	    /* Require UTF-8 tail byte. */
-		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
-		return (0);
-	}
-	/* Three-byte encodings. */
-	else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
-	    if (UNEXPECTED(cp + 2 >= end)
-	    /* Exclude over-long encodings. */
-		|| UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
-	    /* Exclude U+D800..U+DFFF. */
-		|| UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
-	    /* Require UTF-8 tail byte. */
-		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
-		return (0);
-	}
-	/* Four-byte encodings. */
-	else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
-	    if (UNEXPECTED(cp + 3 >= end)
-	    /* Exclude over-long encodings. */
-		|| UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
-	    /* Exclude code points above U+10FFFF. */
-		|| UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
-	    /* Require UTF-8 tail byte. */
-		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)
-	    /* Require UTF-8 tail byte. */
-		|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
-		return (0);
-	}
-	/* Invalid: c0 >= 0xf5 */
-	else {
+    for (cp = str; cp < ep; cp++) {
+	if ((last = parse_utf8_char(cp, ep)) != 0)
+	    cp = last;
+	else
+	    return (0);
+    }
+    return (1);
+}
+
+/* valid_utf8_stringz - validate string according to RFC 3629 */
+
+int     valid_utf8_stringz(const char *str)
+{
+    const char *cp;
+    const char *last;
+
+    /*
+     * Ideally, the compiler will inline parse_utf8_char(), propagate the
+     * null pointer constant value, and eliminate code branches that test
+     * whether 0 != 0.
+     */
+    for (cp = str; *cp; cp++) {
+	if ((last = parse_utf8_char(cp, 0)) != 0)
+	    cp = last;
+	else
 	    return (0);
-	}
     }
     return (1);
 }
@@ -114,26 +102,139 @@ int     valid_utf8_string(const char *str, ssize_t len)
   */
 #ifdef TEST
 #include <stdlib.h>
+#include <string.h>
+#include <msg.h>
 #include <vstream.h>
-#include <vstring.h>
-#include <vstring_vstream.h>
+#include <msg_vstream.h>
+
+ /*
+  * Test cases for 1-, 2-, and 3-byte encodings. See printable.c for UTF8
+  * parser resychronization tests.
+  * 
+  * XXX Need a test for 4-byte encodings, preferably with strings that can be
+  * displayed.
+  * 
+  * XXX Need tests with hand-crafted over-long encodings and surrogates.
+  */
+struct testcase {
+    const char *name;
+    const char *input;
+    int     expected;
+};
 
-#define STR(x) vstring_str(x)
-#define LEN(x) VSTRING_LEN(x)
+#define T_VALID		(1)
+#define T_INVALID	(0)
+#define valid_to_str(v)	((v) ? "VALID" : "INVALID")
 
-int     main(void)
+static const struct testcase testcases[] = {
+    {"Printable ASCII",
+	"printable", T_VALID,
+    },
+    {"Latin script, accented, no error",
+	"na\303\257ve", T_VALID,
+    },
+    {"Latin script, accented, missing non-leading byte",
+	"na\303ve", T_INVALID,
+    },
+    {"Latin script, accented, missing leading byte",
+	"na\257ve", T_INVALID,
+    },
+    {"Viktor, Cyrillic, no error",
+	"\320\262\320\270\320\272\321\202\320\276\321\200", T_VALID,
+    },
+    {"Viktor, Cyrillic, missing non-leading byte",
+	"\320\262\320\320\272\321\202\320\276\321\200", T_INVALID,
+    },
+    {"Viktor, Cyrillic, missing leading byte",
+	"\320\262\270\320\272\321\202\320\276\321\200", T_INVALID,
+    },
+    {"Viktor, Cyrillic, truncated",
+	"\320\262\320\270\320\272\321\202\320\276\321", T_INVALID,
+    },
+    {"Viktor, Hebrew, no error",
+	"\327\225\327\231\327\247\327\230\327\225\326\274\327\250", T_VALID,
+    },
+    {"Viktor, Hebrew, missing leading byte",
+	"\327\225\231\327\247\327\230\327\225\326\274\327\250", T_INVALID,
+    },
+    {"Chinese (Simplified), no error",
+	"\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245\345\221\212", T_VALID,
+    },
+    {"Chinese (Simplified), missing leading byte",
+	"\344\270\255\345\233\275\344\272\222\350\201\224\275\221\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245\345\221\212", T_INVALID,
+    },
+    {"Chinese (Simplified), missing first non-leading byte",
+	"\344\270\255\345\233\275\344\272\222\350\201\224\347\221\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245\345\221\212", T_INVALID,
+    },
+    {"Chinese (Simplified), missing second non-leading byte",
+	"\344\270\255\345\233\275\344\272\222\350\201\224\347\275\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245\345\221\212", T_INVALID,
+    },
+    {"Chinese (Simplified), truncated",
+	"\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245\345", T_INVALID,
+    },
+};
+
+int     main(int argc, char **argv)
 {
-    VSTRING *buf = vstring_alloc(1);
+    const struct testcase *tp;
+    int     pass;
+    int     fail;
+
+#define NUM_TESTS       sizeof(testcases)/sizeof(testcases[0])
+
+    msg_vstream_init(basename(argv[0]), VSTREAM_ERR);
+    util_utf8_enable = 1;
+
+    for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) {
+	int     actual_l;
+	int     actual_z;
+	int     ok = 0;
 
-    while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) {
-	vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ?
-		       '!' : ' ');
-	vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf));
-	vstream_printf("\n");
+	/*
+	 * Notes:
+	 * 
+	 * - The msg(3) functions use printable() which interferes when logging
+	 * inputs and outputs. Use vstream_fprintf() instead.
+	 */
+	vstream_fprintf(VSTREAM_ERR, "RUN  %s\n", tp->name);
+	actual_l = valid_utf8_string(tp->input, strlen(tp->input));
+	actual_z = valid_utf8_stringz(tp->input);
+
+	if (actual_l != tp->expected) {
+	    vstream_fprintf(VSTREAM_ERR,
+			  "input: >%s<, 'actual_l' got: >%s<, want: >%s<\n",
+			    tp->input, valid_to_str(actual_l),
+			    valid_to_str(tp->expected));
+	} else if (actual_z != tp->expected) {
+	    vstream_fprintf(VSTREAM_ERR,
+			  "input: >%s<, 'actual_z' got: >%s<, want: >%s<\n",
+			    tp->input, valid_to_str(actual_z),
+			    valid_to_str(tp->expected));
+	} else {
+	    vstream_fprintf(VSTREAM_ERR, "input: >%s<, got and want: >%s<\n",
+			    tp->input, valid_to_str(actual_l));
+	    ok = 1;
+	}
+	if (ok) {
+	    vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name);
+	    pass++;
+	} else {
+	    vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name);
+	    fail++;
+	}
     }
-    vstream_fflush(VSTREAM_OUT);
-    vstring_free(buf);
-    exit(0);
+    msg_info("PASS=%d FAIL=%d", pass, fail);
+    return (fail > 0);
 }
 
 #endif