1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
/*++
/* NAME
/* valid_utf8_string 3
/* SUMMARY
/* predicate if string is valid UTF-8
/* SYNOPSIS
/* #include <stringops.h>
/*
/* int valid_utf8_string(str, len)
/* const char *str;
/* ssize_t len;
/* DESCRIPTION
/* valid_utf8_string() determines if a string satisfies the UTF-8
/* definition in RFC 3629. That is, it contains proper encodings
/* of code points U+0000..U+10FFFF, excluding over-long encodings
/* and excluding U+D800..U+DFFF surrogates.
/*
/* A zero-length string is considered valid.
/* DIAGNOSTICS
/* The result value is zero when the caller specifies a negative
/* length, or a string that violates RFC 3629, for example a
/* string that is truncated in the middle of a multi-byte
/* sequence.
/* BUGS
/* But wait, there is more. Code points in the range U+FDD0..U+FDEF,
/* and code points ending in FFFE or FFFF, are non-characters in
/* UNICODE. This function does not block these.
/* SEE ALSO
/* RFC 3629
/* LICENSE
/* .ad
/* .fi
/* The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/* Wietse Venema
/* IBM T.J. Watson Research
/* P.O. Box 704
/* Yorktown Heights, NY 10598, USA
/*--*/
/* System library. */
#include <sys_defs.h>
/* Utility library. */
#include <stringops.h>
/*
 * valid_utf8_string - validate string according to RFC 3629
 *
 * str is the byte sequence to inspect and len is its length in bytes. Only
 * the first len bytes are examined; the loop is bounded by len, not by a
 * null terminator.
 *
 * Returns 1 when len is zero or when every byte belongs to a well-formed
 * encoding (no over-long forms, no U+D800..U+DFFF, nothing above
 * U+10FFFF). Returns 0 when len is negative, when a sequence is malformed,
 * or when the input ends in the middle of a multi-byte sequence.
 */
int     valid_utf8_string(const char *str, ssize_t len)
{
    const unsigned char *cp;
    const unsigned char *end;
    unsigned char c0;
    unsigned char ch;

    /*
     * Settle the length before doing any pointer arithmetic. Forming
     * str + len for a negative len is undefined behavior in C, even when
     * the result is never dereferenced. A zero-length request returns
     * without touching str at all.
     */
    if (len < 0)
        return (0);
    if (len == 0)
        return (1);

    cp = (const unsigned char *) str;
    end = cp + len;

    /*
     * Optimized for correct input, time, space, and for CPUs that have a
     * decent number of registers.
     *
     * The "enough bytes left" tests look at the difference end - cp instead
     * of computing cp + N, so that no pointer beyond one-past-the-end of
     * the buffer is ever formed when the input is truncated.
     */
    for (; cp < end; cp++) {
        /* Single-byte encodings. */
        if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
             /* void */ ;
        }
        /* Two-byte encodings. */
        else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
            /* Exclude stray tail bytes and over-long encodings. */
            if (UNEXPECTED(c0 < 0xc2)
            /* Require lead byte plus one tail byte. */
                || UNEXPECTED(end - cp < 2)
            /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Three-byte encodings. */
        else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
            /* Require lead byte plus two tail bytes. */
            if (UNEXPECTED(end - cp < 3)
            /* Exclude over-long encodings. */
                || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
            /* Exclude U+D800..U+DFFF. */
                || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
            /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Four-byte encodings. */
        else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
            /* Require lead byte plus three tail bytes. */
            if (UNEXPECTED(end - cp < 4)
            /* Exclude over-long encodings. */
                || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
            /* Exclude code points above U+10FFFF. */
                || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
            /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80)
            /* Require UTF-8 tail byte. */
                || UNEXPECTED((*++cp & 0xc0) != 0x80))
                return (0);
        }
        /* Invalid: c0 >= 0xf5 */
        else {
            return (0);
        }
    }
    return (1);
}
/*
* Stand-alone test program. Each string is a line without line terminator.
*/
#ifdef TEST
#include <stdlib.h>
#include <vstream.h>
#include <vstring.h>
#include <vstring_vstream.h>
#define STR(x) vstring_str(x)
#define LEN(x) VSTRING_LEN(x)
int     main(void)
{
    VSTRING *line = vstring_alloc(1);
    int     marker;

    /*
     * Echo every input line to the output, preceded by a one-character
     * verdict: '!' when the line is non-empty and fails validation, a
     * space otherwise.
     */
    while (vstring_get_nonl(line, VSTREAM_IN) != VSTREAM_EOF) {
        if (LEN(line) && !valid_utf8_string(STR(line), LEN(line)))
            marker = '!';
        else
            marker = ' ';
        vstream_printf("%c", marker);
        vstream_fwrite(VSTREAM_OUT, STR(line), LEN(line));
        vstream_printf("\n");
    }

    /* Push out buffered output and release the line buffer before exit. */
    vstream_fflush(VSTREAM_OUT);
    vstring_free(line);
    exit(0);
}
#endif
|