1 files changed, 325 insertions, 0 deletions
diff --git a/src/spdk/include/spdk_internal/utf.h b/src/spdk/include/spdk_internal/utf.h
new file mode 100644
index 00000000..b2b1c3c4
--- /dev/null
+++ b/src/spdk/include/spdk_internal/utf.h
@@ -0,0 +1,325 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) Intel Corporation.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_UTF_H_
+#define SPDK_UTF_H_
+
+#include "spdk/stdinc.h"
+
+#include "spdk/endian.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+
+/* Return true if c is a UTF-8 continuation (tail) byte: 0x80..0xBF, binary 10xxxxxx. */
+static inline bool
+utf8_tail(uint8_t c)
+{
+	return c >= 0x80 && c <= 0xBF;
+}
+
+/*
+ * Validate the UTF-8 encoding of the single codepoint at the front of [start, end).
+ *
+ * Only well-formed sequences are accepted: overlong forms, encoded UTF-16
+ * surrogates (U+D800..U+DFFF) and values above U+10FFFF are rejected, as are
+ * sequences cut short by end.
+ * At most 4 bytes are examined.
+ *
+ * \param start First byte to examine.
+ * \param end One past the last byte that may be read.
+ *
+ * \return Length in bytes of the valid sequence (1 to 4), 0 if start == end,
+ * or negative if the bytes at start are not a valid sequence.
+ */
+static inline int
+utf8_valid(const uint8_t *start, const uint8_t *end)
+{
+	/* Allowed range for the second byte; narrowed below for special lead bytes. */
+	uint8_t second_min = 0x80, second_max = 0xBF;
+	uint8_t lead;
+	int len, i;
+
+	if (start == end) {
+		return 0;
+	}
+
+	lead = start[0];
+
+	if (lead <= 0x7F) {
+		/* 00..7F: single-byte sequence */
+		return 1;
+	}
+
+	if (lead <= 0xC1 || lead > 0xF4) {
+		/*
+		 * 80..BF are tail bytes, C0..C1 could only start overlong forms and
+		 * F5..FF would encode values above U+10FFFF: none can start a sequence.
+		 */
+		return -1;
+	}
+
+	/* The lead byte alone determines the length of the sequence. */
+	if (lead <= 0xDF) {
+		len = 2;
+	} else if (lead <= 0xEF) {
+		len = 3;
+	} else {
+		len = 4;
+	}
+
+	switch (lead) {
+	case 0xE0:
+		/* E0 A0..BF: a smaller second byte would be an overlong form */
+		second_min = 0xA0;
+		break;
+	case 0xED:
+		/* ED 80..9F: ED A0..BF would encode the UTF-16 surrogates U+D800..U+DFFF */
+		second_max = 0x9F;
+		break;
+	case 0xF0:
+		/* F0 90..BF: a smaller second byte would be an overlong form */
+		second_min = 0x90;
+		break;
+	case 0xF4:
+		/* F4 80..8F: a larger second byte would exceed U+10FFFF */
+		second_max = 0x8F;
+		break;
+	default:
+		/* C2..DF, E1..EC, EE..EF, F1..F3: any tail byte 80..BF */
+		break;
+	}
+
+	/* Every byte of the sequence must come before end; only then is it read. */
+	for (i = 1; i < len; i++) {
+		if (start + i == end) {
+			/* Not enough bytes left */
+			return -1;
+		}
+	}
+
+	/* The second byte carries the lead-specific restrictions. */
+	if (start[1] < second_min || start[1] > second_max) {
+		return -1;
+	}
+
+	/* Any remaining bytes are ordinary tail bytes. */
+	for (i = 2; i < len; i++) {
+		if (!utf8_tail(start[i])) {
+			return -1;
+		}
+	}
+
+	return len;
+}
+
+static inline uint32_t
+utf8_decode_unsafe_1(const uint8_t *data)
+{
+	return (uint32_t)*data;	/* 1-byte sequence: the byte is the codepoint; input is not validated */
+}
+
+/* Decode a 2-byte UTF-8 sequence (110xxxxx 10xxxxxx); the input is not validated. */
+static inline uint32_t
+utf8_decode_unsafe_2(const uint8_t *data)
+{
+	const uint32_t lead_bits = data[0] & 0x1F;	/* low 5 bits of the lead byte */
+	const uint32_t tail_bits = data[1] & 0x3F;	/* low 6 bits of the tail byte */
+
+	/* The lead byte supplies the bits above the tail's 6 payload bits. */
+	return (lead_bits << 6) | tail_bits;
+}
+
+/* Decode a 3-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx); the input is not validated. */
+static inline uint32_t
+utf8_decode_unsafe_3(const uint8_t *data)
+{
+	const uint32_t lead_bits = data[0] & 0x0F;	/* low 4 bits of the lead byte */
+	const uint32_t mid_bits = data[1] & 0x3F;	/* low 6 bits of each tail byte */
+	const uint32_t last_bits = data[2] & 0x3F;
+
+	/* Bytes earlier in the sequence hold the more significant bits. */
+	return (lead_bits << 12) | (mid_bits << 6) | last_bits;
+}
+
+/* Decode a 4-byte UTF-8 sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx); input is not validated. */
+static inline uint32_t
+utf8_decode_unsafe_4(const uint8_t *data)
+{
+	const uint32_t lead_bits = data[0] & 0x07;	/* low 3 bits of the lead byte */
+	const uint32_t tail1_bits = data[1] & 0x3F;	/* low 6 bits of each tail byte */
+	const uint32_t tail2_bits = data[2] & 0x3F;
+	const uint32_t tail3_bits = data[3] & 0x3F;
+
+	/* Bytes earlier in the sequence hold the more significant bits. */
+	return (lead_bits << 18) | (tail1_bits << 12) | (tail2_bits << 6) | tail3_bits;
+}
+
+/*
+ * Write the UTF-8 encoding of a single Unicode codepoint to buf.
+ *
+ * No length check is made (hence unsafe): buf must have at least 4 bytes of space available.
+ *
+ * \return Number of bytes written to buf (1 to 4), or negative - with buf left untouched -
+ * if c is a UTF-16 surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
+ */
+static inline int
+utf8_encode_unsafe(uint8_t *buf, uint32_t c)
+{
+	if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
+		return -1;
+	} else if (c < 0x80) {		/* U+0000..U+007F: 0xxxxxxx */
+		buf[0] = c;
+		return 1;
+	} else if (c < 0x800) {		/* U+0080..U+07FF: 110xxxxx 10xxxxxx */
+		buf[0] = 0xC0 | (c >> 6);
+		buf[1] = 0x80 | (c & 0x3F);
+		return 2;
+	} else if (c < 0x10000) {	/* U+0800..U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
+		buf[0] = 0xE0 | (c >> 12);
+		buf[1] = 0x80 | ((c >> 6) & 0x3F);
+		buf[2] = 0x80 | (c & 0x3F);
+		return 3;
+	}
+
+	/* U+10000..U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+	buf[0] = 0xF0 | (c >> 18);
+	buf[1] = 0x80 | ((c >> 12) & 0x3F);
+	buf[2] = 0x80 | ((c >> 6) & 0x3F);
+	buf[3] = 0x80 | (c & 0x3F);
+	return 4;
+}
+
+/* \return Number of bytes needed to encode codepoint c as UTF-8 (1 to 4), or negative if invalid. */
+static inline int
+utf8_codepoint_len(uint32_t c)
+{
+	if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
+		/* Above the last codepoint, or a UTF-16 surrogate - invalid in UTF-8 */
+		return -1;
+	}
+
+	if (c < 0x80) {
+		return 1;
+	}
+	if (c < 0x800) {
+		return 2;
+	}
+	return c < 0x10000 ? 3 : 4;
+}
+
+static inline bool
+utf16_valid_surrogate_high(uint32_t val)
+{
+	return (val >> 10) == (0xD800 >> 10);	/* exactly the values 0xD800..0xDBFF (high surrogates) */
+}
+
+static inline bool
+utf16_valid_surrogate_low(uint32_t val)
+{
+	return (val >> 10) == (0xDC00 >> 10);	/* exactly the values 0xDC00..0xDFFF (low surrogates) */
+}
+
+/*
+ * Validate the UTF-16LE encoding of the single codepoint at the front of [start, end).
+ *
+ * Code units are read through from_le16() before being checked.
+ *
+ * \param start First 16-bit code unit to examine.
+ * \param end One past the last code unit that may be read.
+ *
+ * \return Length of the valid sequence in 16-bit code units (1 for a lone code unit,
+ * 2 for a surrogate pair), 0 if start == end, or negative if invalid.
+ */
+static inline int
+utf16le_valid(const uint16_t *start, const uint16_t *end)
+{
+	uint16_t first, second;
+
+	if (start == end) {
+		return 0;
+	}
+
+	first = from_le16(start);
+
+	if (utf16_valid_surrogate_low(first)) {
+		/* Low surrogate in first code unit - invalid */
+		return -1;
+	}
+
+	if (!utf16_valid_surrogate_high(first)) {
+		/* Not a surrogate at all: single code unit in BMP */
+		return 1;
+	}
+
+	if (start + 1 == end) {
+		/* High surrogate with no code unit left to pair it with */
+		return -1;
+	}
+
+	second = from_le16(start + 1);
+
+	/* A high surrogate must be followed by a low surrogate. */
+	return utf16_valid_surrogate_low(second) ? 2 : -1;
+}
+
+/*
+ * Combine a UTF-16 surrogate pair into the codepoint it encodes.
+ * Both values are only checked by assert(), so callers must pass a valid pair.
+ *
+ * \return The decoded codepoint, in the range U+10000..U+10FFFF for a valid pair.
+ */
+static inline uint32_t
+utf16_decode_surrogate_pair(uint32_t high, uint32_t low)
+{
+	assert(utf16_valid_surrogate_high(high));
+	assert(utf16_valid_surrogate_low(low));
+
+	/* Each surrogate carries 10 payload bits; the combined value is offset by 0x10000. */
+	return 0x10000 + (((high & 0x3FF) << 10) | (low & 0x3FF));
+}
+
+/* Split a codepoint in U+10000..U+10FFFF (checked by assert() only) into a UTF-16 surrogate pair. */
+static inline void
+utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low)
+{
+	const uint32_t offset = codepoint - 0x10000;	/* 20 bits: top 10 -> *high, low 10 -> *low */
+
+	assert(codepoint >= 0x10000);
+	assert(codepoint <= 0x10FFFF);
+	*high = 0xD800 | (offset >> 10);
+	*low = 0xDC00 | (offset & 0x3FF);
+	assert(utf16_valid_surrogate_high(*high));
+	assert(utf16_valid_surrogate_low(*low));
+}
+
+#endif