diff options
Diffstat (limited to 'lib/isc/utf8.c')
-rw-r--r-- | lib/isc/utf8.c | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/lib/isc/utf8.c b/lib/isc/utf8.c new file mode 100644 index 0000000..a348c5d --- /dev/null +++ b/lib/isc/utf8.c @@ -0,0 +1,89 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#include <string.h> + +#include <isc/utf8.h> +#include <isc/util.h> + +/* + * UTF-8 is defined in "The Unicode Standard -- Version 4.0" + * Also see RFC 3629. + * + * Char. number range | UTF-8 octet sequence + * (hexadecimal) | (binary) + * --------------------+--------------------------------------------- + * 0000 0000-0000 007F | 0xxxxxxx + * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + */ +bool +isc_utf8_valid(const unsigned char *buf, size_t len) { + REQUIRE(buf != NULL); + + for (size_t i = 0; i < len; i++) { + if (buf[i] <= 0x7f) { + continue; + } + if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 && + (buf[i + 1] & 0xc0) == 0x80) + { + unsigned int w; + w = (buf[i] & 0x1f) << 6; + w |= (buf[++i] & 0x3f); + if (w < 0x80) { + return (false); + } + continue; + } + if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 && + (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80) + { + unsigned int w; + w = (buf[i] & 0x0f) << 12; + w |= (buf[++i] & 0x3f) << 6; + w |= (buf[++i] & 0x3f); + if (w < 0x0800) { + return (false); + } + continue; + } + if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 && + (buf[i + 1] & 0xc0) == 0x80 && + (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80) + { + unsigned int w; + w = (buf[i] & 0x07) << 18; + w |= (buf[++i] & 0x3f) << 12; + w |= (buf[++i] & 0x3f) << 6; + w |= (buf[++i] & 0x3f); + if (w < 0x10000 || w > 0x10FFFF) { + return (false); + } + continue; + } + return (false); + } + return (true); +} + +bool +isc_utf8_bom(const unsigned char *buf, size_t len) { + REQUIRE(buf != NULL); + + if (len >= 3U && !memcmp(buf, "\xef\xbb\xbf", 3)) { + return (true); + } + return (false); +} |