From 426ff88c97805d5359804bcfd7186dcd2c9fbf47 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 13 Apr 2024 10:42:06 +0200 Subject: Merging upstream version 3.9.0. Signed-off-by: Daniel Baumann --- src/util/valid_utf8_string.c | 247 ++++++++++++++++++++++++++++++------------- 1 file changed, 174 insertions(+), 73 deletions(-) (limited to 'src/util/valid_utf8_string.c') diff --git a/src/util/valid_utf8_string.c b/src/util/valid_utf8_string.c index 96b5b4d..f5b4ff4 100644 --- a/src/util/valid_utf8_string.c +++ b/src/util/valid_utf8_string.c @@ -9,24 +9,24 @@ /* int valid_utf8_string(str, len) /* const char *str; /* ssize_t len; +/* +/* int valid_utf8_stringz(str) +/* const char *str; +/* ssize_t len; /* DESCRIPTION -/* valid_utf8_string() determines if a string satisfies the UTF-8 -/* definition in RFC 3629. That is, it contains proper encodings -/* of code points U+0000..U+10FFFF, excluding over-long encodings -/* and excluding U+D800..U+DFFF surrogates. +/* valid_utf8_string() determines if all bytes in a string +/* satisfy parse_utf8_char(3h) checks. See there for any +/* implementation limitations. +/* +/* valid_utf8_stringz() determines the same for zero-terminated +/* strings. /* /* A zero-length string is considered valid. /* DIAGNOSTICS /* The result value is zero when the caller specifies a negative -/* length, or a string that violates RFC 3629, for example a -/* string that is truncated in the middle of a multi-byte -/* sequence. -/* BUGS -/* But wait, there is more. Code points in the range U+FDD0..U+FDEF -/* and ending in FFFE or FFFF are non-characters in UNICODE. This -/* function does not block these. +/* length, or a string that does not pass parse_utf8_char(3h) checks. /* SEE ALSO -/* RFC 3629 +/* parse_utf8_char(3h), parse one UTF-8 multibyte character /* LICENSE /* .ad /* .fi @@ -36,6 +36,10 @@ /* IBM T.J. Watson Research /* P.O. Box 704 /* Yorktown Heights, NY 10598, USA +/* +/* Wietse Venema +/* porcupine.org +/* Amawalk, NY 10501, USA /*--*/ /* System library. */ @@ -45,66 +49,50 @@ /* Utility library. */ #include +#include /* valid_utf8_string - validate string according to RFC 3629 */ int valid_utf8_string(const char *str, ssize_t len) { - const unsigned char *end = (const unsigned char *) str + len; - const unsigned char *cp; - unsigned char c0, ch; + const char *ep = str + len; + const char *cp; + const char *last; if (len < 0) return (0); - if (len <= 0) + if (len == 0) return (1); /* - * Optimized for correct input, time, space, and for CPUs that have a - * decent number of registers. + * Ideally, the compiler will inline parse_utf8_char(). */ - for (cp = (const unsigned char *) str; cp < end; cp++) { - /* Single-byte encodings. */ - if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) { - /* void */ ; - } - /* Two-byte encodings. */ - else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) { - /* Exclude over-long encodings. */ - if (UNEXPECTED(c0 < 0xc2) - || UNEXPECTED(cp + 1 >= end) - /* Require UTF-8 tail byte. */ - || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) - return (0); - } - /* Three-byte encodings. */ - else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) { - if (UNEXPECTED(cp + 2 >= end) - /* Exclude over-long encodings. */ - || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80)) - /* Exclude U+D800..U+DFFF. */ - || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf)) - /* Require UTF-8 tail byte. */ - || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) - return (0); - } - /* Four-byte encodings. */ - else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) { - if (UNEXPECTED(cp + 3 >= end) - /* Exclude over-long encodings. */ - || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80)) - /* Exclude code points above U+10FFFF. */ - || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf)) - /* Require UTF-8 tail byte. */ - || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80) - /* Require UTF-8 tail byte. */ - || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) - return (0); - } - /* Invalid: c0 >= 0xf5 */ - else { + for (cp = str; cp < ep; cp++) { + if ((last = parse_utf8_char(cp, ep)) != 0) + cp = last; + else + return (0); + } + return (1); +} + +/* valid_utf8_stringz - validate string according to RFC 3629 */ + +int valid_utf8_stringz(const char *str) +{ + const char *cp; + const char *last; + + /* + * Ideally, the compiler will inline parse_utf8_char(), propagate the + * null pointer constant value, and eliminate code branches that test + * whether 0 != 0. + */ + for (cp = str; *cp; cp++) { + if ((last = parse_utf8_char(cp, 0)) != 0) + cp = last; + else return (0); - } } return (1); } @@ -114,26 +102,139 @@ int valid_utf8_string(const char *str, ssize_t len) */ #ifdef TEST #include +#include +#include #include -#include -#include +#include + + /* + * Test cases for 1-, 2-, and 3-byte encodings. See printable.c for UTF8 + * parser resychronization tests. + * + * XXX Need a test for 4-byte encodings, preferably with strings that can be + * displayed. + * + * XXX Need tests with hand-crafted over-long encodings and surrogates. + */ +struct testcase { + const char *name; + const char *input; + int expected; +}; -#define STR(x) vstring_str(x) -#define LEN(x) VSTRING_LEN(x) +#define T_VALID (1) +#define T_INVALID (0) +#define valid_to_str(v) ((v) ? "VALID" : "INVALID") -int main(void) +static const struct testcase testcases[] = { + {"Printable ASCII", + "printable", T_VALID, + }, + {"Latin script, accented, no error", + "na\303\257ve", T_VALID, + }, + {"Latin script, accented, missing non-leading byte", + "na\303ve", T_INVALID, + }, + {"Latin script, accented, missing leading byte", + "na\257ve", T_INVALID, + }, + {"Viktor, Cyrillic, no error", + "\320\262\320\270\320\272\321\202\320\276\321\200", T_VALID, + }, + {"Viktor, Cyrillic, missing non-leading byte", + "\320\262\320\320\272\321\202\320\276\321\200", T_INVALID, + }, + {"Viktor, Cyrillic, missing leading byte", + "\320\262\270\320\272\321\202\320\276\321\200", T_INVALID, + }, + {"Viktor, Cyrillic, truncated", + "\320\262\320\270\320\272\321\202\320\276\321", T_INVALID, + }, + {"Viktor, Hebrew, no error", + "\327\225\327\231\327\247\327\230\327\225\326\274\327\250", T_VALID, + }, + {"Viktor, Hebrew, missing leading byte", + "\327\225\231\327\247\327\230\327\225\326\274\327\250", T_INVALID, + }, + {"Chinese (Simplified), no error", + "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245\345\221\212", T_VALID, + }, + {"Chinese (Simplified), missing leading byte", + "\344\270\255\345\233\275\344\272\222\350\201\224\275\221\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245\345\221\212", T_INVALID, + }, + {"Chinese (Simplified), missing first non-leading byte", + "\344\270\255\345\233\275\344\272\222\350\201\224\347\221\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245\345\221\212", T_INVALID, + }, + {"Chinese (Simplified), missing second non-leading byte", + "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245\345\221\212", T_INVALID, + }, + {"Chinese (Simplified), truncated", + "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347" + "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273" + "\237\350\256\241\346\212\245\345", T_INVALID, + }, +}; + +int main(int argc, char **argv) { - VSTRING *buf = vstring_alloc(1); + const struct testcase *tp; + int pass; + int fail; + +#define NUM_TESTS sizeof(testcases)/sizeof(testcases[0]) + + msg_vstream_init(basename(argv[0]), VSTREAM_ERR); + util_utf8_enable = 1; + + for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) { + int actual_l; + int actual_z; + int ok = 0; - while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) { - vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ? - '!' : ' '); - vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf)); - vstream_printf("\n"); + /* + * Notes: + * + * - The msg(3) functions use printable() which interferes when logging + * inputs and outputs. Use vstream_fprintf() instead. + */ + vstream_fprintf(VSTREAM_ERR, "RUN %s\n", tp->name); + actual_l = valid_utf8_string(tp->input, strlen(tp->input)); + actual_z = valid_utf8_stringz(tp->input); + + if (actual_l != tp->expected) { + vstream_fprintf(VSTREAM_ERR, + "input: >%s<, 'actual_l' got: >%s<, want: >%s<\n", + tp->input, valid_to_str(actual_l), + valid_to_str(tp->expected)); + } else if (actual_z != tp->expected) { + vstream_fprintf(VSTREAM_ERR, + "input: >%s<, 'actual_z' got: >%s<, want: >%s<\n", + tp->input, valid_to_str(actual_z), + valid_to_str(tp->expected)); + } else { + vstream_fprintf(VSTREAM_ERR, "input: >%s<, got and want: >%s<\n", + tp->input, valid_to_str(actual_l)); + ok = 1; + } + if (ok) { + vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name); + pass++; + } else { + vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name); + fail++; + } } - vstream_fflush(VSTREAM_OUT); - vstring_free(buf); - exit(0); + msg_info("PASS=%d FAIL=%d", pass, fail); + return (fail > 0); } #endif -- cgit v1.2.3