diff options
Diffstat (limited to '')
-rw-r--r-- | src/util/printable.c | 162 |
1 files changed, 145 insertions, 17 deletions
diff --git a/src/util/printable.c b/src/util/printable.c index 6c148fd..0e1ae19 100644 --- a/src/util/printable.c +++ b/src/util/printable.c @@ -45,6 +45,10 @@ /* Google, Inc. /* 111 8th Avenue /* New York, NY 10011, USA +/* +/* Wietse Venema +/* porcupine.org +/* Amawalk, NY 10501, USA /*--*/ /* System library. */ @@ -56,8 +60,9 @@ /* Utility library. */ #include "stringops.h" +#include "parse_utf8_char.h" -int util_utf8_enable = 0; +int util_utf8_enable = 0; /* printable - binary compatibility */ @@ -74,27 +79,150 @@ char *printable(char *string, int replacement) char *printable_except(char *string, int replacement, const char *except) { - unsigned char *cp; + char *cp; + char *last; int ch; /* - * XXX Replace invalid UTF8 sequences (too short, over-long encodings, - * out-of-range code points, etc). See valid_utf8_string.c. + * In case of a non-UTF8 sequence (bad leader byte, bad non-leader byte, + * over-long encodings, out-of-range code points, etc), replace the first + * byte, and try to resynchronize at the next byte. */ - cp = (unsigned char *) string; - while ((ch = *cp) != 0) { - if (ISASCII(ch) && (ISPRINT(ch) || (except && strchr(except, ch)))) { - /* ok */ - } else if (util_utf8_enable && ch >= 194 && ch <= 254 - && cp[1] >= 128 && cp[1] < 192) { - /* UTF8; skip the rest of the bytes in the character. */ - while (cp[1] >= 128 && cp[1] < 192) - cp++; - } else { - /* Not ASCII and not UTF8. */ - *cp = replacement; +#define PRINT_OR_EXCEPT(ch) (ISPRINT(ch) || (except && strchr(except, ch))) + + for (cp = string; (ch = *(unsigned char *) cp) != 0; cp++) { + if (util_utf8_enable == 0) { + if (ISASCII(ch) && PRINT_OR_EXCEPT(ch)) + continue; + } else if ((last = parse_utf8_char(cp, 0)) == cp) { /* ASCII */ + if (PRINT_OR_EXCEPT(ch)) + continue; + } else if (last != 0) { /* Other UTF8 */ + cp = last; + continue; } - cp++; + *cp = replacement; } return (string); } + +#ifdef TEST + +#include <stdlib.h> +#include <string.h> +#include <msg.h> +#include <msg_vstream.h> +#include <mymalloc.h> +#include <vstream.h> + + /* + * Test cases for 1-, 2-, and 3-byte encodings. Originally contributed by + * Viktor Dukhovni, and annotated using translate.google.com. + * + * See valid_utf8_string.c for single-error tests. + * + * XXX Need a test for 4-byte encodings, preferably with strings that can be + * displayed. + */ +struct testcase { + const char *name; + const char *input; + const char *expected;; +}; +static const struct testcase testcases[] = { + {"Printable ASCII", + "printable", "printable" + }, + {"ASCII with control character", + "non\bn-printable", "non?n-printable" + }, + {"Latin accented text, no error", + "na\303\257ve", "na\303\257ve" + }, + {"Latin text, with error", + "na\303ve", "na?ve" + }, + {"Viktor, Cyrillic, no error", + "\320\262\320\270\320\272\321\202\320\276\321\200", + "\320\262\320\270\320\272\321\202\320\276\321\200" + }, + {"Viktor, Cyrillic, two errors", + "\320\262\320\320\272\272\321\202\320\276\321\200", + "\320\262?\320\272?\321\202\320\276\321\200" + }, + {"Viktor, Hebrew, no error", + "\327\225\327\231\327\247\327\230\327\225\326\274\327\250", + "\327\225\327\231\327\247\327\230\327\225\326\274\327\250" + }, + {"Viktor, Hebrew, with error", + "\327\225\231\327\247\327\230\327\225\326\274\327\250", + "\327\225?\327\247\327\230\327\225\326\274\327\250" + }, + {"Chinese (Simplified), no error", + "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245\345\221\212", + "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245\345\221\212" + }, + {"Chinese (Simplified), with errors", + "\344\270\255\345\344\272\222\350\224\347\275\221\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245\345", + "\344\270\255?\344\272\222??\347\275\221\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245?" + }, +}; + +int main(int argc, char **argv) +{ + const struct testcase *tp; + int pass; + int fail; + +#define NUM_TESTS sizeof(testcases)/sizeof(testcases[0]) + + msg_vstream_init(basename(argv[0]), VSTREAM_ERR); + util_utf8_enable = 1; + + for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) { + char *input; + char *actual; + int ok = 0; + + /* + * Notes: + * + * - The input is modified, therefore it must be copied. + * + * - The msg(3) functions use printable() which interferes when logging + * inputs and outputs. Use vstream_fprintf() instead. + */ + vstream_fprintf(VSTREAM_ERR, "RUN %s\n", tp->name); + input = mystrdup(tp->input); + actual = printable(input, '?'); + + if (strcmp(actual, tp->expected) != 0) { + vstream_fprintf(VSTREAM_ERR, "input: >%s<, got: >%s<, want: >%s<\n", + tp->input, actual, tp->expected); + } else { + vstream_fprintf(VSTREAM_ERR, "input: >%s<, got and want: >%s<\n", + tp->input, actual); + ok = 1; + } + if (ok) { + vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name); + pass++; + } else { + vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name); + fail++; + } + myfree(input); + } + msg_info("PASS=%d FAIL=%d", pass, fail); + return (fail > 0); +} + +#endif |