/*++ /* NAME /* valid_utf8_string 3 /* SUMMARY /* predicate if string is valid UTF-8 /* SYNOPSIS /* #include /* /* int valid_utf8_string(str, len) /* const char *str; /* ssize_t len; /* DESCRIPTION /* valid_utf8_string() determines if a string satisfies the UTF-8 /* definition in RFC 3629. That is, it contains proper encodings /* of code points U+0000..U+10FFFF, excluding over-long encodings /* and excluding U+D800..U+DFFF surrogates. /* /* A zero-length string is considered valid. /* DIAGNOSTICS /* The result value is zero when the caller specifies a negative /* length, or a string that violates RFC 3629, for example a /* string that is truncated in the middle of a multi-byte /* sequence. /* BUGS /* But wait, there is more. Code points in the range U+FDD0..U+FDEF /* and ending in FFFE or FFFF are non-characters in UNICODE. This /* function does not block these. /* SEE ALSO /* RFC 3629 /* LICENSE /* .ad /* .fi /* The Secure Mailer license must be distributed with this software. /* AUTHOR(S) /* Wietse Venema /* IBM T.J. Watson Research /* P.O. Box 704 /* Yorktown Heights, NY 10598, USA /*--*/ /* System library. */ #include /* Utility library. */ #include /* valid_utf8_string - validate string according to RFC 3629 */ int valid_utf8_string(const char *str, ssize_t len) { const unsigned char *end = (const unsigned char *) str + len; const unsigned char *cp; unsigned char c0, ch; if (len < 0) return (0); if (len <= 0) return (1); /* * Optimized for correct input, time, space, and for CPUs that have a * decent number of registers. */ for (cp = (const unsigned char *) str; cp < end; cp++) { /* Single-byte encodings. */ if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) { /* void */ ; } /* Two-byte encodings. */ else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) { /* Exclude over-long encodings. */ if (UNEXPECTED(c0 < 0xc2) || UNEXPECTED(cp + 1 >= end) /* Require UTF-8 tail byte. */ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) return (0); } /* Three-byte encodings. */ else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) { if (UNEXPECTED(cp + 2 >= end) /* Exclude over-long encodings. */ || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80)) /* Exclude U+D800..U+DFFF. */ || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf)) /* Require UTF-8 tail byte. */ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) return (0); } /* Four-byte encodings. */ else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) { if (UNEXPECTED(cp + 3 >= end) /* Exclude over-long encodings. */ || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80)) /* Exclude code points above U+10FFFF. */ || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf)) /* Require UTF-8 tail byte. */ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80) /* Require UTF-8 tail byte. */ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)) return (0); } /* Invalid: c0 >= 0xf5 */ else { return (0); } } return (1); } /* * Stand-alone test program. Each string is a line without line terminator. */ #ifdef TEST #include #include #include #include #define STR(x) vstring_str(x) #define LEN(x) VSTRING_LEN(x) int main(void) { VSTRING *buf = vstring_alloc(1); while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) { vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ? '!' : ' '); vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf)); vstream_printf("\n"); } vstream_fflush(VSTREAM_OUT); vstring_free(buf); exit(0); } #endif