summaryrefslogtreecommitdiffstats
path: root/lib/isc/utf8.c
blob: a348c5dab2f4ef8aa4ea8eb4a577ecc08773f89d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * SPDX-License-Identifier: MPL-2.0
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

#include <string.h>

#include <isc/utf8.h>
#include <isc/util.h>

/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 *  --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		if (buf[i] <= 0x7f) {
			continue;
		}
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x1f) << 6;
			w |= (buf[++i] & 0x3f);
			if (w < 0x80) {
				return (false);
			}
			continue;
		}
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x0f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			if (w < 0x0800) {
				return (false);
			}
			continue;
		}
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x07) << 18;
			w |= (buf[++i] & 0x3f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			if (w < 0x10000 || w > 0x10FFFF) {
				return (false);
			}
			continue;
		}
		return (false);
	}
	return (true);
}

bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	if (len >= 3U && !memcmp(buf, "\xef\xbb\xbf", 3)) {
		return (true);
	}
	return (false);
}