src/util/parse_utf8_char.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

/*++
/* NAME
/*	parse_utf8_char 3h
/* SUMMARY
/*	parse one UTF-8 multibyte character
/* SYNOPSIS
/*	#include <parse_utf8_char.h>
/*
/*	char	*parse_utf8_char(str, end)
/*	const char *str;
/*	const char *end;
/* DESCRIPTION
/*	parse_utf8_char() determines if the byte sequence starting
/*	at \fBstr\fR begins with a complete UTF-8 character as
/*	defined in RFC 3629. That is, a proper encoding of code
/*	points U+0000..U+10FFFF, excluding over-long encodings and
/*	excluding U+D800..U+DFFF surrogates.
/*
/*	When the byte sequence starting at \fBstr\fR begins with a
/*	complete UTF-8 character, this function returns a pointer
/*	to the last byte in that character. Otherwise, it returns
/*	a null pointer.
/*
/*	The \fBend\fR argument is either null (the byte sequence
/*	starting at \fBstr\fR must be null terminated), or \fBend
/*	- str\fR specifies the length of the byte sequence.
/* BUGS
/*	Code points in the range U+FDD0..U+FDEF and ending in FFFE
/*	or FFFF are non-characters in UNICODE. This function does
/*	not reject these.
/* LICENSE
/* .ad
/* .fi
/*	The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/*	Wietse Venema
/*	IBM T.J. Watson Research
/*	P.O. Box 704
/*	Yorktown Heights, NY 10598, USA
/*
/*	Wietse Venema
/*	porcupine.org
/*	Amawalk, NY 10501, USA
/*--*/

 /*
  * System library.
  */
#include <sys_defs.h>

/* Builds that define NO_INLINE get the "inline" keyword replaced by nothing, */
/* so the function below is then declared as plain "static". */
#ifdef NO_INLINE
#define inline /* */
#endif

/* parse_utf8_char - parse and validate one UTF8 multibyte sequence */

static inline char *parse_utf8_char(const char *str, const char *end)
{
    const unsigned char *cp = (const unsigned char *) str;
    const unsigned char *ep = (const unsigned char *) end;
    unsigned char c0, ch;

    /*
     * Optimized for correct input, time, space, and for CPUs that have a
     * decent number of registers. Other implementation considerations:
     * 
     * - In the UTF-8 encoding, a non-leading byte is never null. Therefore,
     * this function will correctly reject a partial UTF-8 character at the
     * end of a null-terminated string.
     * 
     * - If the "end" argument is a null constant, and if this function is
     * inlined, then an optimizing compiler should propagate the constant
     * through the "ep" variable, and eliminate any code branches that
     * require ep != 0.
     */
    /* Single-byte encodings. */
    if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
	return ((char *) cp);
    }
    /* Two-byte encodings. */
    else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
	/* Exclude over-long encodings. */
	if (UNEXPECTED(c0 < 0xc2)
	    || UNEXPECTED(ep && cp + 1 >= ep)
	/* Require UTF-8 tail byte. */
	    || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
	    return (0);
	return ((char *) cp);
    }
    /* Three-byte encodings. */
    else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
	if (UNEXPECTED(ep && cp + 2 >= ep)
	/* Exclude over-long encodings. */
	    || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
	/* Exclude U+D800..U+DFFF. */
	    || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
	/* Require UTF-8 tail byte. */
	    || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
	    return (0);
	return ((char *) cp);
    }
    /* Four-byte encodings. */
    else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
	if (UNEXPECTED(ep && cp + 3 >= ep)
	/* Exclude over-long encodings. */
	    || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
	/* Exclude code points above U+10FFFF. */
	    || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
	/* Require UTF-8 tail byte. */
	    || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)
	/* Require UTF-8 tail byte. */
	    || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
	    return (0);
	return ((char *) cp);
    }
    /* Invalid: c0 >= 0xf5 */
    else {
	return (0);
    }
}

/* Remove any macro definition of "inline", including the NO_INLINE override */
/* made above, so that it does not affect code after this header. */
#undef inline