summaryrefslogtreecommitdiffstats
path: root/src/spdk/include/spdk_internal/utf.h
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/spdk/include/spdk_internal/utf.h325
1 files changed, 325 insertions, 0 deletions
diff --git a/src/spdk/include/spdk_internal/utf.h b/src/spdk/include/spdk_internal/utf.h
new file mode 100644
index 00000000..b2b1c3c4
--- /dev/null
+++ b/src/spdk/include/spdk_internal/utf.h
@@ -0,0 +1,325 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_UTF_H_
+#define SPDK_UTF_H_
+
+#include "spdk/stdinc.h"
+
+#include "spdk/endian.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+
+static inline bool
+utf8_tail(uint8_t c)
+{
+ /* c >= 0x80 && c <= 0xBF, or binary 01xxxxxx */
+ return (c & 0xC0) == 0x80;
+}
+
+/*
+ * Check for a valid UTF-8 encoding of a single codepoint.
+ *
+ * \return Length of valid UTF-8 byte sequence, or negative if invalid.
+ */
+static inline int
+utf8_valid(const uint8_t *start, const uint8_t *end)
+{
+ const uint8_t *p = start;
+ uint8_t b0, b1, b2, b3;
+
+ if (p == end) {
+ return 0;
+ }
+
+ b0 = *p;
+
+ if (b0 <= 0x7F) {
+ return 1;
+ }
+
+ if (b0 <= 0xC1) {
+ /* Invalid start byte */
+ return -1;
+ }
+
+ if (++p == end) {
+ /* Not enough bytes left */
+ return -1;
+ }
+ b1 = *p;
+
+ if (b0 <= 0xDF) {
+ /* C2..DF 80..BF */
+ if (!utf8_tail(b1)) {
+ return -1;
+ }
+ return 2;
+ }
+
+ if (++p == end) {
+ /* Not enough bytes left */
+ return -1;
+ }
+ b2 = *p;
+
+ if (b0 == 0xE0) {
+ /* E0 A0..BF 80..BF */
+ if (b1 < 0xA0 || b1 > 0xBF || !utf8_tail(b2)) {
+ return -1;
+ }
+ return 3;
+ } else if (b0 == 0xED && b1 >= 0xA0) {
+ /*
+ * UTF-16 surrogate pairs use U+D800..U+DFFF, which would be encoded as
+ * ED A0..BF 80..BF in UTF-8; however, surrogate pairs are not allowed in UTF-8.
+ */
+ return -1;
+ } else if (b0 <= 0xEF) {
+ /* E1..EF 80..BF 80..BF */
+ if (!utf8_tail(b1) || !utf8_tail(b2)) {
+ return -1;
+ }
+ return 3;
+ }
+
+ if (++p == end) {
+ /* Not enough bytes left */
+ return -1;
+ }
+ b3 = *p;
+
+ if (b0 == 0xF0) {
+ /* F0 90..BF 80..BF 80..BF */
+ if (b1 < 0x90 || b1 > 0xBF || !utf8_tail(b2) || !utf8_tail(b3)) {
+ return -1;
+ }
+ return 4;
+ } else if (b0 <= 0xF3) {
+ /* F1..F3 80..BF 80..BF 80..BF */
+ if (!utf8_tail(b1) || !utf8_tail(b2) || !utf8_tail(b3)) {
+ return -1;
+ }
+ return 4;
+ } else if (b0 == 0xF4) {
+ /* F4 80..8F 80..BF 80..BF */
+ if (b1 < 0x80 || b1 > 0x8F || !utf8_tail(b2) || !utf8_tail(b3)) {
+ return -1;
+ }
+ return 4;
+ }
+
+ return -1;
+}
+
+static inline uint32_t
+utf8_decode_unsafe_1(const uint8_t *data)
+{
+ return data[0];
+}
+
+static inline uint32_t
+utf8_decode_unsafe_2(const uint8_t *data)
+{
+ uint32_t codepoint;
+
+ codepoint = ((data[0] & 0x1F) << 6);
+ codepoint |= (data[1] & 0x3F);
+
+ return codepoint;
+}
+
+static inline uint32_t
+utf8_decode_unsafe_3(const uint8_t *data)
+{
+ uint32_t codepoint;
+
+ codepoint = ((data[0] & 0x0F) << 12);
+ codepoint |= (data[1] & 0x3F) << 6;
+ codepoint |= (data[2] & 0x3F);
+
+ return codepoint;
+}
+
+static inline uint32_t
+utf8_decode_unsafe_4(const uint8_t *data)
+{
+ uint32_t codepoint;
+
+ codepoint = ((data[0] & 0x07) << 18);
+ codepoint |= (data[1] & 0x3F) << 12;
+ codepoint |= (data[2] & 0x3F) << 6;
+ codepoint |= (data[3] & 0x3F);
+
+ return codepoint;
+}
+
+/*
+ * Encode a single Unicode codepoint as UTF-8.
+ *
+ * buf must have at least 4 bytes of space available (hence unsafe).
+ *
+ * \return Number of bytes appended to buf, or negative if encoding failed.
+ */
+static inline int
+utf8_encode_unsafe(uint8_t *buf, uint32_t c)
+{
+ if (c <= 0x7F) {
+ buf[0] = c;
+ return 1;
+ } else if (c <= 0x7FF) {
+ buf[0] = 0xC0 | (c >> 6);
+ buf[1] = 0x80 | (c & 0x3F);
+ return 2;
+ } else if (c >= 0xD800 && c <= 0xDFFF) {
+ /* UTF-16 surrogate pairs - invalid in UTF-8 */
+ return -1;
+ } else if (c <= 0xFFFF) {
+ buf[0] = 0xE0 | (c >> 12);
+ buf[1] = 0x80 | ((c >> 6) & 0x3F);
+ buf[2] = 0x80 | (c & 0x3F);
+ return 3;
+ } else if (c <= 0x10FFFF) {
+ buf[0] = 0xF0 | (c >> 18);
+ buf[1] = 0x80 | ((c >> 12) & 0x3F);
+ buf[2] = 0x80 | ((c >> 6) & 0x3F);
+ buf[3] = 0x80 | (c & 0x3F);
+ return 4;
+ }
+ return -1;
+}
+
+static inline int
+utf8_codepoint_len(uint32_t c)
+{
+ if (c <= 0x7F) {
+ return 1;
+ } else if (c <= 0x7FF) {
+ return 2;
+ } else if (c >= 0xD800 && c <= 0xDFFF) {
+ /* UTF-16 surrogate pairs - invalid in UTF-8 */
+ return -1;
+ } else if (c <= 0xFFFF) {
+ return 3;
+ } else if (c <= 0x10FFFF) {
+ return 4;
+ }
+ return -1;
+}
+
+static inline bool
+utf16_valid_surrogate_high(uint32_t val)
+{
+ return val >= 0xD800 && val <= 0xDBFF;
+}
+
+static inline bool
+utf16_valid_surrogate_low(uint32_t val)
+{
+ return val >= 0xDC00 && val <= 0xDFFF;
+}
+
+/*
+ * Check for a valid UTF-16LE encoding of a single codepoint.
+ *
+ * \return Length of valid UTF-16LE sequence in 16-bit code units, or negative if invalid.
+ */
+static inline int
+utf16le_valid(const uint16_t *start, const uint16_t *end)
+{
+ const uint16_t *p = start;
+ uint16_t high, low;
+
+ if (p == end) {
+ return 0;
+ }
+
+ high = from_le16(p);
+
+ if (high <= 0xD7FF || high >= 0xE000) {
+ /* Single code unit in BMP */
+ return 1;
+ }
+
+ if (high >= 0xDC00) {
+ /* Low surrogate in first code unit - invalid */
+ return -1;
+ }
+
+ assert(utf16_valid_surrogate_high(high));
+
+ if (++p == end) {
+ /* Not enough code units left */
+ return -1;
+ }
+ low = from_le16(p);
+
+ if (!utf16_valid_surrogate_low(low)) {
+ return -1;
+ }
+
+ /* Valid surrogate pair */
+ return 2;
+}
+
+static inline uint32_t
+utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
+{
+ uint32_t codepoint;
+
+ assert(utf16_valid_surrogate_high(high));
+ assert(utf16_valid_surrogate_low(low));
+
+ codepoint = low;
+ codepoint &= 0x3FF;
+ codepoint |= ((high & 0x3FF) << 10);
+ codepoint += 0x10000;
+
+ return codepoint;
+}
+
+static inline void
+utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
+{
+ assert(codepoint >= 0x10000);
+ assert(codepoint <= 0x10FFFF);
+
+ codepoint -= 0x10000;
+ *high = 0xD800 | (codepoint >> 10);
+ *low = 0xDC00 | (codepoint & 0x3FF);
+
+ assert(utf16_valid_surrogate_high(*high));
+ assert(utf16_valid_surrogate_low(*low));
+}
+
+#endif