1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
|
/*++
/* NAME
/* parse_utf8_char 3h
/* SUMMARY
/* parse one UTF-8 multibyte character
/* SYNOPSIS
/* #include <parse_utf8_char.h>
/*
/* char *parse_utf8_char(str, end)
/* const char *str;
/* const char *end;
/* DESCRIPTION
/* parse_utf8_char() determines if the byte sequence starting
/* at \fBstr\fR begins with a complete UTF-8 character as
/* defined in RFC 3629. That is, a proper encoding of code
/* points U+0000..U+10FFFF, excluding over-long encodings and
/* excluding U+D800..U+DFFF surrogates.
/*
/* When the byte sequence starting at \fBstr\fR begins with a
/* complete UTF-8 character, this function returns a pointer
/* to the last byte in that character. Otherwise, it returns
/* a null pointer.
/*
/* The \fBend\fR argument is either null (the byte sequence
/* starting at \fBstr\fR must be null terminated), or \fBend
/* - str\fR specifies the length of the byte sequence.
/* BUGS
/* Code points in the range U+FDD0..U+FDEF and ending in FFFE
/* or FFFF are non-characters in UNICODE. This function does
/* not reject these.
/* LICENSE
/* .ad
/* .fi
/* The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/* Wietse Venema
/* IBM T.J. Watson Research
/* P.O. Box 704
/* Yorktown Heights, NY 10598, USA
/*
/* Wietse Venema
/* porcupine.org
/* Amawalk, NY 10501, USA
/*--*/
/*
* System library.
*/
#include <sys_defs.h>
#ifdef NO_INLINE
#define inline /* */
#endif

/*
 * Branch-prediction hints. These are expected to be supplied by
 * <sys_defs.h>; the fallbacks below only take effect when a macro is not
 * already defined, and then simply evaluate the condition without a hint.
 */
#ifndef EXPECTED
#define EXPECTED(x) (x)
#endif
#ifndef UNEXPECTED
#define UNEXPECTED(x) (x)
#endif

/*
 * parse_utf8_char - parse and validate one UTF8 multibyte sequence
 *
 * str: first byte of the candidate character.
 * end: null when the input is null-terminated; otherwise one past the last
 *      byte that may be read, so that end - str is the input length.
 *
 * Returns a pointer to the LAST byte of the character that starts at str,
 * or a null pointer when str does not start a complete, valid UTF-8
 * character. An empty range (end != 0 && end <= str) yields a null pointer
 * without reading any byte.
 */
static inline char *parse_utf8_char(const char *str, const char *end)
{
    const unsigned char *cp = (const unsigned char *) str;
    const unsigned char *ep = (const unsigned char *) end;
    unsigned char c0, ch;

    /*
     * Optimized for correct input, time, space, and for CPUs that have a
     * decent number of registers. Other implementation considerations:
     *
     * - In the UTF-8 encoding, a non-leading byte is never null. Therefore,
     * this function will correctly reject a partial UTF-8 character at the
     * end of a null-terminated string. The checks below short-circuit, so
     * no byte after a rejected byte is ever read.
     *
     * - If the "end" argument is a null constant, and if this function is
     * inlined, then an optimizing compiler should propagate the constant
     * through the "ep" variable, and eliminate any code branches that
     * require ep != 0.
     *
     * - Length checks are written as "ep - cp < N" (fewer than N bytes
     * available) rather than "cp + N - 1 >= ep", so that no pointer is
     * formed beyond one past the end of a short input range.
     */

    /* Empty range: there is no lead byte that may be read. */
    if (UNEXPECTED(ep && ep - cp < 1))
        return (0);

    /* Single-byte encodings. */
    if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
        return ((char *) cp);
    }
    /* Two-byte encodings. */
    else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
        /* Exclude stray tail bytes (0x80..0xbf) and over-long encodings. */
        if (UNEXPECTED(c0 < 0xc2)
            || UNEXPECTED(ep && ep - cp < 2)
            /* Require UTF-8 tail byte. */
            || UNEXPECTED((*++cp & 0xc0) != 0x80))
            return (0);
        return ((char *) cp);
    }
    /* Three-byte encodings. */
    else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
        if (UNEXPECTED(ep && ep - cp < 3)
            /* Exclude over-long encodings. */
            || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
            /* Exclude U+D800..U+DFFF. */
            || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
            /* Require UTF-8 tail byte. */
            || UNEXPECTED((*++cp & 0xc0) != 0x80))
            return (0);
        return ((char *) cp);
    }
    /* Four-byte encodings. */
    else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
        if (UNEXPECTED(ep && ep - cp < 4)
            /* Exclude over-long encodings. */
            || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
            /* Exclude code points above U+10FFFF. */
            || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
            /* Require UTF-8 tail byte. */
            || UNEXPECTED((*++cp & 0xc0) != 0x80)
            /* Require UTF-8 tail byte. */
            || UNEXPECTED((*++cp & 0xc0) != 0x80))
            return (0);
        return ((char *) cp);
    }
    /* Invalid: c0 >= 0xf5 */
    else {
        return (0);
    }
}

#undef inline
|