summaryrefslogtreecommitdiffstats
path: root/lib/isc/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--lib/isc/utf8.c89
1 files changed, 89 insertions, 0 deletions
diff --git a/lib/isc/utf8.c b/lib/isc/utf8.c
new file mode 100644
index 0000000..a348c5d
--- /dev/null
+++ b/lib/isc/utf8.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
+ *
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, you can obtain one at https://mozilla.org/MPL/2.0/.
+ *
+ * See the COPYRIGHT file distributed with this work for additional
+ * information regarding copyright ownership.
+ */
+
+#include <string.h>
+
+#include <isc/utf8.h>
+#include <isc/util.h>
+
+/*
+ * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
+ * Also see RFC 3629.
+ *
+ * Char. number range | UTF-8 octet sequence
+ * (hexadecimal) | (binary)
+ * --------------------+---------------------------------------------
+ * 0000 0000-0000 007F | 0xxxxxxx
+ * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ */
+bool
+isc_utf8_valid(const unsigned char *buf, size_t len) {
+ REQUIRE(buf != NULL);
+
+ for (size_t i = 0; i < len; i++) {
+ if (buf[i] <= 0x7f) {
+ continue;
+ }
+ if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
+ (buf[i + 1] & 0xc0) == 0x80)
+ {
+ unsigned int w;
+ w = (buf[i] & 0x1f) << 6;
+ w |= (buf[++i] & 0x3f);
+ if (w < 0x80) {
+ return (false);
+ }
+ continue;
+ }
+ if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
+ (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
+ {
+ unsigned int w;
+ w = (buf[i] & 0x0f) << 12;
+ w |= (buf[++i] & 0x3f) << 6;
+ w |= (buf[++i] & 0x3f);
+ if (w < 0x0800) {
+ return (false);
+ }
+ continue;
+ }
+ if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
+ (buf[i + 1] & 0xc0) == 0x80 &&
+ (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
+ {
+ unsigned int w;
+ w = (buf[i] & 0x07) << 18;
+ w |= (buf[++i] & 0x3f) << 12;
+ w |= (buf[++i] & 0x3f) << 6;
+ w |= (buf[++i] & 0x3f);
+ if (w < 0x10000 || w > 0x10FFFF) {
+ return (false);
+ }
+ continue;
+ }
+ return (false);
+ }
+ return (true);
+}
+
+bool
+isc_utf8_bom(const unsigned char *buf, size_t len) {
+ REQUIRE(buf != NULL);
+
+ if (len >= 3U && !memcmp(buf, "\xef\xbb\xbf", 3)) {
+ return (true);
+ }
+ return (false);
+}