summaryrefslogtreecommitdiffstats
path: root/src/util/parse_utf8_char.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/util/parse_utf8_char.h')
-rw-r--r--src/util/parse_utf8_char.h122
1 files changed, 122 insertions, 0 deletions
diff --git a/src/util/parse_utf8_char.h b/src/util/parse_utf8_char.h
new file mode 100644
index 0000000..b00a1c2
--- /dev/null
+++ b/src/util/parse_utf8_char.h
@@ -0,0 +1,122 @@
+/*++
+/* NAME
+/* parse_utf8_char 3h
+/* SUMMARY
+/* parse one UTF-8 multibyte character
+/* SYNOPSIS
+/* #include <parse_utf8_char.h>
+/*
+/* char *parse_utf8_char(str, end)
+/* const char *str;
+/* const char *end;
+/* DESCRIPTION
+/* parse_utf8_char() determines if the byte sequence starting
+/* at \fBstr\fR begins with a complete UTF-8 character as
+/* defined in RFC 3629. That is, a proper encoding of code
+/* points U+0000..U+10FFFF, excluding over-long encodings and
+/* excluding U+D800..U+DFFF surrogates.
+/*
+/* When the byte sequence starting at \fBstr\fR begins with a
+/* complete UTF-8 character, this function returns a pointer
+/* to the last byte in that character. Otherwise, it returns
+/* a null pointer.
+/*
+/* The \fBend\fR argument is either null (the byte sequence
+/* starting at \fBstr\fR must be null terminated), or \fBend
+/* - str\fR specifies the length of the byte sequence.
+/* BUGS
+/* Code points in the range U+FDD0..U+FDEF and ending in FFFE
+/* or FFFF are non-characters in UNICODE. This function does
+/* not reject these.
+/* LICENSE
+/* .ad
+/* .fi
+/* The Secure Mailer license must be distributed with this software.
+/* AUTHOR(S)
+/* Wietse Venema
+/* IBM T.J. Watson Research
+/* P.O. Box 704
+/* Yorktown Heights, NY 10598, USA
+/*
+/* Wietse Venema
+/* porcupine.org
+/* Amawalk, NY 10501, USA
+/*--*/
+
+ /*
+ * System library.
+ */
+#include <sys_defs.h>
+
+#ifdef NO_INLINE
+#define inline /* */
+#endif
+
+/* parse_utf8_char - parse and validate one UTF8 multibyte sequence */
+
+static inline char *parse_utf8_char(const char *str, const char *end)
+{
+ const unsigned char *cp = (const unsigned char *) str;
+ const unsigned char *ep = (const unsigned char *) end;
+ unsigned char c0, ch;
+
+ /*
+ * Optimized for correct input, time, space, and for CPUs that have a
+ * decent number of registers. Other implementation considerations:
+ *
+ * - In the UTF-8 encoding, a non-leading byte is never null. Therefore,
+ * this function will correctly reject a partial UTF-8 character at the
+ * end of a null-terminated string.
+ *
+ * - If the "end" argument is a null constant, and if this function is
+ * inlined, then an optimizing compiler should propagate the constant
+ * through the "ep" variable, and eliminate any code branches that
+ * require ep != 0.
+ */
+ /* Single-byte encodings. */
+ if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
+ return ((char *) cp);
+ }
+ /* Two-byte encodings. */
+ else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
+ /* Exclude over-long encodings. */
+ if (UNEXPECTED(c0 < 0xc2)
+ || UNEXPECTED(ep && cp + 1 >= ep)
+ /* Require UTF-8 tail byte. */
+ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+ return (0);
+ return ((char *) cp);
+ }
+ /* Three-byte encodings. */
+ else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
+ if (UNEXPECTED(ep && cp + 2 >= ep)
+ /* Exclude over-long encodings. */
+ || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
+ /* Exclude U+D800..U+DFFF. */
+ || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
+ /* Require UTF-8 tail byte. */
+ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+ return (0);
+ return ((char *) cp);
+ }
+ /* Four-byte encodings. */
+ else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
+ if (UNEXPECTED(ep && cp + 3 >= ep)
+ /* Exclude over-long encodings. */
+ || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
+ /* Exclude code points above U+10FFFF. */
+ || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
+ /* Require UTF-8 tail byte. */
+ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)
+ /* Require UTF-8 tail byte. */
+ || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
+ return (0);
+ return ((char *) cp);
+ }
+ /* Invalid: c0 >= 0xf5 */
+ else {
+ return (0);
+ }
+}
+
+#undef inline