diff options
Diffstat (limited to 'src/util/valid_utf8_string.c')
-rw-r--r-- | src/util/valid_utf8_string.c | 139 |
1 files changed, 139 insertions, 0 deletions
diff --git a/src/util/valid_utf8_string.c b/src/util/valid_utf8_string.c new file mode 100644 index 0000000..96b5b4d --- /dev/null +++ b/src/util/valid_utf8_string.c @@ -0,0 +1,139 @@ +/*++ +/* NAME +/* valid_utf8_string 3 +/* SUMMARY +/* predicate if string is valid UTF-8 +/* SYNOPSIS +/* #include <stringops.h> +/* +/* int valid_utf8_string(str, len) +/* const char *str; +/* ssize_t len; +/* DESCRIPTION +/* valid_utf8_string() determines if a string satisfies the UTF-8 +/* definition in RFC 3629. That is, it contains proper encodings +/* of code points U+0000..U+10FFFF, excluding over-long encodings +/* and excluding U+D800..U+DFFF surrogates. +/* +/* A zero-length string is considered valid. +/* DIAGNOSTICS +/* The result value is zero when the caller specifies a negative +/* length, or a string that violates RFC 3629, for example a +/* string that is truncated in the middle of a multi-byte +/* sequence. +/* BUGS +/* But wait, there is more. Code points in the range U+FDD0..U+FDEF +/* and ending in FFFE or FFFF are non-characters in UNICODE. This +/* function does not block these. +/* SEE ALSO +/* RFC 3629 +/* LICENSE +/* .ad +/* .fi +/* The Secure Mailer license must be distributed with this software. +/* AUTHOR(S) +/* Wietse Venema +/* IBM T.J. Watson Research +/* P.O. Box 704 +/* Yorktown Heights, NY 10598, USA +/*--*/ + +/* System library. */ + +#include <sys_defs.h> + +/* Utility library. */ + +#include <stringops.h> + +/* valid_utf8_string - validate string according to RFC 3629 */ + +int valid_utf8_string(const char *str, ssize_t len) +{ + const unsigned char *end = (const unsigned char *) str + len; + const unsigned char *cp; + unsigned char c0, ch; + + if (len < 0) + return (0); + if (len <= 0) + return (1); + + /* + * Optimized for correct input, time, space, and for CPUs that have a + * decent number of registers. + */ + for (cp = (const unsigned char *) str; cp < end; cp++) { + /* Single-byte encodings. */ + if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) { + /* void */ ; + } + /* Two-byte encodings. */ + else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) { + /* Exclude over-long encodings. */ + if (UNEXPECTED(c0 < 0xc2) + || UNEXPECTED(cp + 1 >= end) + /* Require UTF-8 tail byte. */ + || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) + return (0); + } + /* Three-byte encodings. */ + else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) { + if (UNEXPECTED(cp + 2 >= end) + /* Exclude over-long encodings. */ + || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80)) + /* Exclude U+D800..U+DFFF. */ + || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf)) + /* Require UTF-8 tail byte. */ + || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) + return (0); + } + /* Four-byte encodings. */ + else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) { + if (UNEXPECTED(cp + 3 >= end) + /* Exclude over-long encodings. */ + || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80)) + /* Exclude code points above U+10FFFF. */ + || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf)) + /* Require UTF-8 tail byte. */ + || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80) + /* Require UTF-8 tail byte. */ + || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) + return (0); + } + /* Invalid: c0 >= 0xf5 */ + else { + return (0); + } + } + return (1); +} + + /* + * Stand-alone test program. Each string is a line without line terminator. + */ +#ifdef TEST +#include <stdlib.h> +#include <vstream.h> +#include <vstring.h> +#include <vstring_vstream.h> + +#define STR(x) vstring_str(x) +#define LEN(x) VSTRING_LEN(x) + +int main(void) +{ + VSTRING *buf = vstring_alloc(1); + + while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) { + vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ? + '!' : ' '); + vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf)); + vstream_printf("\n"); + } + vstream_fflush(VSTREAM_OUT); + vstring_free(buf); + exit(0); +} + +#endif |