summaryrefslogtreecommitdiffstats
path: root/src/util/printable.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/util/printable.c162
1 files changed, 145 insertions, 17 deletions
diff --git a/src/util/printable.c b/src/util/printable.c
index 6c148fd..0e1ae19 100644
--- a/src/util/printable.c
+++ b/src/util/printable.c
@@ -45,6 +45,10 @@
/* Google, Inc.
/* 111 8th Avenue
/* New York, NY 10011, USA
+/*
+/* Wietse Venema
+/* porcupine.org
+/* Amawalk, NY 10501, USA
/*--*/
/* System library. */
@@ -56,8 +60,9 @@
/* Utility library. */
#include "stringops.h"
+#include "parse_utf8_char.h"
-int util_utf8_enable = 0;
+int util_utf8_enable = 0;
/* printable - binary compatibility */
@@ -74,27 +79,150 @@ char *printable(char *string, int replacement)
char *printable_except(char *string, int replacement, const char *except)
{
- unsigned char *cp;
+ char *cp;
+ char *last;
int ch;
/*
- * XXX Replace invalid UTF8 sequences (too short, over-long encodings,
- * out-of-range code points, etc). See valid_utf8_string.c.
+ * In case of a non-UTF8 sequence (bad leader byte, bad non-leader byte,
+ * over-long encodings, out-of-range code points, etc), replace the first
+ * byte, and try to resynchronize at the next byte.
*/
- cp = (unsigned char *) string;
- while ((ch = *cp) != 0) {
- if (ISASCII(ch) && (ISPRINT(ch) || (except && strchr(except, ch)))) {
- /* ok */
- } else if (util_utf8_enable && ch >= 194 && ch <= 254
- && cp[1] >= 128 && cp[1] < 192) {
- /* UTF8; skip the rest of the bytes in the character. */
- while (cp[1] >= 128 && cp[1] < 192)
- cp++;
- } else {
- /* Not ASCII and not UTF8. */
- *cp = replacement;
+#define PRINT_OR_EXCEPT(ch) (ISPRINT(ch) || (except && strchr(except, ch)))
+
+ for (cp = string; (ch = *(unsigned char *) cp) != 0; cp++) {
+ if (util_utf8_enable == 0) {
+ if (ISASCII(ch) && PRINT_OR_EXCEPT(ch))
+ continue;
+ } else if ((last = parse_utf8_char(cp, 0)) == cp) { /* ASCII */
+ if (PRINT_OR_EXCEPT(ch))
+ continue;
+ } else if (last != 0) { /* Other UTF8 */
+ cp = last;
+ continue;
}
- cp++;
+ *cp = replacement;
}
return (string);
}
+
+#ifdef TEST
+
+#include <stdlib.h>
+#include <string.h>
+#include <msg.h>
+#include <msg_vstream.h>
+#include <mymalloc.h>
+#include <vstream.h>
+
+ /*
+ * Test cases for 1-, 2-, and 3-byte encodings. Originally contributed by
+ * Viktor Dukhovni, and annotated using translate.google.com.
+ *
+ * See valid_utf8_string.c for single-error tests.
+ *
+ * XXX Need a test for 4-byte encodings, preferably with strings that can be
+ * displayed.
+ */
+struct testcase {
+ const char *name;
+ const char *input;
+ const char *expected;;
+};
+static const struct testcase testcases[] = {
+ {"Printable ASCII",
+ "printable", "printable"
+ },
+ {"ASCII with control character",
+ "non\bn-printable", "non?n-printable"
+ },
+ {"Latin accented text, no error",
+ "na\303\257ve", "na\303\257ve"
+ },
+ {"Latin text, with error",
+ "na\303ve", "na?ve"
+ },
+ {"Viktor, Cyrillic, no error",
+ "\320\262\320\270\320\272\321\202\320\276\321\200",
+ "\320\262\320\270\320\272\321\202\320\276\321\200"
+ },
+ {"Viktor, Cyrillic, two errors",
+ "\320\262\320\320\272\272\321\202\320\276\321\200",
+ "\320\262?\320\272?\321\202\320\276\321\200"
+ },
+ {"Viktor, Hebrew, no error",
+ "\327\225\327\231\327\247\327\230\327\225\326\274\327\250",
+ "\327\225\327\231\327\247\327\230\327\225\326\274\327\250"
+ },
+ {"Viktor, Hebrew, with error",
+ "\327\225\231\327\247\327\230\327\225\326\274\327\250",
+ "\327\225?\327\247\327\230\327\225\326\274\327\250"
+ },
+ {"Chinese (Simplified), no error",
+ "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245\345\221\212",
+ "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245\345\221\212"
+ },
+ {"Chinese (Simplified), with errors",
+ "\344\270\255\345\344\272\222\350\224\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245\345",
+ "\344\270\255?\344\272\222??\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245?"
+ },
+};
+
+int main(int argc, char **argv)
+{
+ const struct testcase *tp;
+ int pass;
+ int fail;
+
+#define NUM_TESTS sizeof(testcases)/sizeof(testcases[0])
+
+ msg_vstream_init(basename(argv[0]), VSTREAM_ERR);
+ util_utf8_enable = 1;
+
+ for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) {
+ char *input;
+ char *actual;
+ int ok = 0;
+
+ /*
+ * Notes:
+ *
+ * - The input is modified, therefore it must be copied.
+ *
+ * - The msg(3) functions use printable() which interferes when logging
+ * inputs and outputs. Use vstream_fprintf() instead.
+ */
+ vstream_fprintf(VSTREAM_ERR, "RUN %s\n", tp->name);
+ input = mystrdup(tp->input);
+ actual = printable(input, '?');
+
+ if (strcmp(actual, tp->expected) != 0) {
+ vstream_fprintf(VSTREAM_ERR, "input: >%s<, got: >%s<, want: >%s<\n",
+ tp->input, actual, tp->expected);
+ } else {
+ vstream_fprintf(VSTREAM_ERR, "input: >%s<, got and want: >%s<\n",
+ tp->input, actual);
+ ok = 1;
+ }
+ if (ok) {
+ vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name);
+ pass++;
+ } else {
+ vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name);
+ fail++;
+ }
+ myfree(input);
+ }
+ msg_info("PASS=%d FAIL=%d", pass, fail);
+ return (fail > 0);
+}
+
+#endif