1 files changed, 145 insertions, 17 deletions
diff --git a/src/util/printable.c b/src/util/printable.c
index 6c148fd..0e1ae19 100644
--- a/src/util/printable.c
+++ b/src/util/printable.c
@@ -45,6 +45,10 @@
 /*	Google, Inc.
 /*	111 8th Avenue
 /*	New York, NY 10011, USA
+/*
+/*	Wietse Venema
+/*	porcupine.org
+/*	Amawalk, NY 10501, USA
 /*--*/
 
 /* System library. */
@@ -56,8 +60,9 @@
 /* Utility library. */
 
 #include "stringops.h"
+#include "parse_utf8_char.h"
 
-int util_utf8_enable = 0;
+int     util_utf8_enable = 0;
 
 /* printable -  binary compatibility */
 
@@ -74,27 +79,150 @@ char   *printable(char *string, int replacement)
 
 char   *printable_except(char *string, int replacement, const char *except)
 {
-    unsigned char *cp;
+    char   *cp;				/* byte currently being examined */
+    char   *last;			/* result of parse_utf8_char() */
     int     ch;
 
     /*
-     * XXX Replace invalid UTF8 sequences (too short, over-long encodings,
-     * out-of-range code points, etc). See valid_utf8_string.c.
+     * In case of a non-UTF8 sequence (bad leader byte, bad non-leader byte,
+     * over-long encodings, out-of-range code points, etc), replace the first
+     * byte, and try to resynchronize at the next byte.
      */
-    cp = (unsigned char *) string;
-    while ((ch = *cp) != 0) {
-	if (ISASCII(ch) && (ISPRINT(ch) || (except && strchr(except, ch)))) {
-	    /* ok */
-	} else if (util_utf8_enable && ch >= 194 && ch <= 254
-		   && cp[1] >= 128 && cp[1] < 192) {
-	    /* UTF8; skip the rest of the bytes in the character. */
-	    while (cp[1] >= 128 && cp[1] < 192)
-		cp++;
-	} else {
-	    /* Not ASCII and not UTF8. */
-	    *cp = replacement;
+#define PRINT_OR_EXCEPT(ch) (ISPRINT(ch) || (except && strchr(except, ch)))
+
+    for (cp = string; (ch = *(unsigned char *) cp) != 0; cp++) {
+	if (util_utf8_enable == 0) {		/* ASCII-only mode */
+	    if (ISASCII(ch) && PRINT_OR_EXCEPT(ch))
+		continue;			/* keep this byte */
+	} else if ((last = parse_utf8_char(cp, 0)) == cp) {	/* ASCII */
+	    if (PRINT_OR_EXCEPT(ch))
+		continue;			/* keep this byte */
+	} else if (last != 0) {			/* Other UTF8 */
+	    cp = last;		/* presumably the character's final byte; cp++ steps past it */
+	    continue;
 	}
-	cp++;
+	*cp = replacement;	/* reached only when no branch above kept the byte */
     }
     return (string);
 }
+
+#ifdef TEST
+
+#include <stdlib.h>
+#include <string.h>
+#include <msg.h>
+#include <msg_vstream.h>
+#include <mymalloc.h>
+#include <vstream.h>
+
+ /*
+  * Test cases for 1-, 2-, and 3-byte encodings. Originally contributed by
+  * Viktor Dukhovni, and annotated using translate.google.com.
+  * 
+  * See valid_utf8_string.c for single-error tests.
+  * 
+  * XXX Need a test for 4-byte encodings, preferably with strings that can be
+  * displayed.
+  */
+struct testcase {
+    const char *name;			/* label shown in the RUN/PASS/FAIL lines */
+    const char *input;			/* text that is copied, then given to printable() */
+    const char *expected;		/* required content of the copy afterwards */
+};
+static const struct testcase testcases[] = {
+    {"Printable ASCII",			/* nothing to replace */
+	"printable", "printable"
+    },
+    {"ASCII with control character",	/* the backspace byte is replaced */
+	"non\bn-printable", "non?n-printable"
+    },
+    {"Latin accented text, no error",	/* \303\257 encodes U+00EF */
+	"na\303\257ve", "na\303\257ve"
+    },
+    {"Latin text, with error",		/* \303 lacks its second byte */
+	"na\303ve", "na?ve"
+    },
+    {"Viktor, Cyrillic, no error",
+	"\320\262\320\270\320\272\321\202\320\276\321\200",
+	"\320\262\320\270\320\272\321\202\320\276\321\200"
+    },
+    {"Viktor, Cyrillic, two errors",	/* \320 without second byte; stray \272 */
+	"\320\262\320\320\272\272\321\202\320\276\321\200",
+	"\320\262?\320\272?\321\202\320\276\321\200"
+    },
+    {"Viktor, Hebrew, no error",
+	"\327\225\327\231\327\247\327\230\327\225\326\274\327\250",
+	"\327\225\327\231\327\247\327\230\327\225\326\274\327\250"
+    },
+    {"Viktor, Hebrew, with error",	/* stray non-leader byte \231 */
+	"\327\225\231\327\247\327\230\327\225\326\274\327\250",
+	"\327\225?\327\247\327\230\327\225\326\274\327\250"
+    },
+    {"Chinese (Simplified), no error",
+	"\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245\345\221\212",
+	"\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245\345\221\212"
+    },
+    {"Chinese (Simplified), with errors",	/* lone \345 (mid-string and at end); \350\224 cut short */
+	"\344\270\255\345\344\272\222\350\224\347\275\221\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245\345",
+	"\344\270\255?\344\272\222??\347\275\221\347"
+	"\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+	"\237\350\256\241\346\212\245?"
+    },
+};
+
+int     main(int argc, char **argv)	/* test driver: run every entry in testcases[] */
+{
+    const struct testcase *tp;
+    int     pass;			/* number of test cases that matched */
+    int     fail;			/* number of test cases that did not */
+
+#define NUM_TESTS	(sizeof(testcases)/sizeof(testcases[0]))
+
+    msg_vstream_init(basename(argv[0]), VSTREAM_ERR);
+    util_utf8_enable = 1;		/* take the parse_utf8_char() code path */
+
+    for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) {
+	char   *input;
+	char   *actual;
+	int     ok = 0;
+
+	/*
+	 * Notes:
+	 * 
+	 * - The input is modified, therefore it must be copied.
+	 * 
+	 * - The msg(3) functions use printable() which interferes when logging
+	 * inputs and outputs. Use vstream_fprintf() instead.
+	 */
+	vstream_fprintf(VSTREAM_ERR, "RUN  %s\n", tp->name);
+	input = mystrdup(tp->input);
+	actual = printable(input, '?');	/* edits the copy in place */
+
+	if (strcmp(actual, tp->expected) != 0) {
+	    vstream_fprintf(VSTREAM_ERR, "input: >%s<, got: >%s<, want: >%s<\n",
+			    tp->input, actual, tp->expected);
+	} else {
+	    vstream_fprintf(VSTREAM_ERR, "input: >%s<, got and want: >%s<\n",
+			    tp->input, actual);
+	    ok = 1;
+	}
+	if (ok) {
+	    vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name);
+	    pass++;
+	} else {
+	    vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name);
+	    fail++;
+	}
+	myfree(input);
+    }
+    msg_info("PASS=%d FAIL=%d", pass, fail);
+    return (fail > 0);			/* non-zero exit status if any test failed */
+}
+
+#endif