blob: a348c5dab2f4ef8aa4ea8eb4a577ecc08773f89d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* SPDX-License-Identifier: MPL-2.0
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
#include <string.h>
#include <isc/utf8.h>
#include <isc/util.h>
/*
* UTF-8 is defined in "The Unicode Standard -- Version 4.0".
* Also see RFC 3629.
*
* Char. number range | UTF-8 octet sequence
* (hexadecimal) | (binary)
* --------------------+---------------------------------------------
* 0000 0000-0000 007F | 0xxxxxxx
* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*/
/*
 * Check whether the 'len' octets starting at 'buf' are well-formed UTF-8
 * as specified by RFC 3629.
 *
 * Requires:
 *	'buf' is not NULL.
 *
 * Returns:
 *	true	if every octet is part of a complete, shortest-form
 *		encoding of a code point in U+0000..U+10FFFF that is not
 *		a UTF-16 surrogate (U+D800..U+DFFF).  A zero-length
 *		buffer is valid.
 *	false	on a stray continuation octet, an invalid lead octet, a
 *		truncated sequence, an overlong encoding, an encoded
 *		surrogate, or a value above U+10FFFF.
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/* One octet: 0xxxxxxx (U+0000..U+007F). */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two octets: 110xxxxx 10xxxxxx (U+0080..U+07FF). */
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;

			w = (unsigned int)(buf[i] & 0x1f) << 6;
			w |= (unsigned int)(buf[++i] & 0x3f);
			/* Overlong: the value fits in a single octet. */
			if (w < 0x80) {
				return (false);
			}
			continue;
		}

		/* Three octets: 1110xxxx 10xxxxxx 10xxxxxx (U+0800..U+FFFF). */
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;

			w = (unsigned int)(buf[i] & 0x0f) << 12;
			w |= (unsigned int)(buf[++i] & 0x3f) << 6;
			w |= (unsigned int)(buf[++i] & 0x3f);
			/* Overlong: the value fits in two octets. */
			if (w < 0x0800) {
				return (false);
			}
			/*
			 * RFC 3629 section 3 prohibits encoding the UTF-16
			 * surrogate range U+D800..U+DFFF in UTF-8.
			 */
			if (w >= 0xd800 && w <= 0xdfff) {
				return (false);
			}
			continue;
		}

		/*
		 * Four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		 * (U+10000..U+10FFFF).
		 */
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;

			w = (unsigned int)(buf[i] & 0x07) << 18;
			w |= (unsigned int)(buf[++i] & 0x3f) << 12;
			w |= (unsigned int)(buf[++i] & 0x3f) << 6;
			w |= (unsigned int)(buf[++i] & 0x3f);
			/* Overlong, or beyond the last Unicode code point. */
			if (w < 0x10000 || w > 0x10FFFF) {
				return (false);
			}
			continue;
		}

		/*
		 * Anything else: a continuation octet in lead position, an
		 * invalid lead octet (0xf8..0xff), a sequence cut short by
		 * the end of the buffer, or a malformed continuation octet.
		 */
		return (false);
	}

	return (true);
}
/*
 * Report whether 'buf' begins with the three-octet sequence EF BB BF,
 * the UTF-8 encoding of the byte order mark.
 *
 * Requires:
 *	'buf' is not NULL.
 *
 * Returns:
 *	true	if 'len' is at least 3 and the first three octets of
 *		'buf' are EF BB BF;
 *	false	otherwise.
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	/* Too short to hold the mark; do not read past the buffer. */
	if (len < 3U) {
		return (false);
	}

	return (memcmp(buf, "\xef\xbb\xbf", 3) == 0);
}
|