diff options
Diffstat (limited to '')
-rw-r--r-- | src/spdk/include/spdk_internal/utf.h | 325 |
1 files changed, 325 insertions, 0 deletions
diff --git a/src/spdk/include/spdk_internal/utf.h b/src/spdk/include/spdk_internal/utf.h new file mode 100644 index 00000000..b2b1c3c4 --- /dev/null +++ b/src/spdk/include/spdk_internal/utf.h @@ -0,0 +1,325 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_UTF_H_ +#define SPDK_UTF_H_ + +#include "spdk/stdinc.h" + +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/string.h" + +static inline bool +utf8_tail(uint8_t c) +{ + /* c >= 0x80 && c <= 0xBF, or binary 01xxxxxx */ + return (c & 0xC0) == 0x80; +} + +/* + * Check for a valid UTF-8 encoding of a single codepoint. + * + * \return Length of valid UTF-8 byte sequence, or negative if invalid. + */ +static inline int +utf8_valid(const uint8_t *start, const uint8_t *end) +{ + const uint8_t *p = start; + uint8_t b0, b1, b2, b3; + + if (p == end) { + return 0; + } + + b0 = *p; + + if (b0 <= 0x7F) { + return 1; + } + + if (b0 <= 0xC1) { + /* Invalid start byte */ + return -1; + } + + if (++p == end) { + /* Not enough bytes left */ + return -1; + } + b1 = *p; + + if (b0 <= 0xDF) { + /* C2..DF 80..BF */ + if (!utf8_tail(b1)) { + return -1; + } + return 2; + } + + if (++p == end) { + /* Not enough bytes left */ + return -1; + } + b2 = *p; + + if (b0 == 0xE0) { + /* E0 A0..BF 80..BF */ + if (b1 < 0xA0 || b1 > 0xBF || !utf8_tail(b2)) { + return -1; + } + return 3; + } else if (b0 == 0xED && b1 >= 0xA0) { + /* + * UTF-16 surrogate pairs use U+D800..U+DFFF, which would be encoded as + * ED A0..BF 80..BF in UTF-8; however, surrogate pairs are not allowed in UTF-8. + */ + return -1; + } else if (b0 <= 0xEF) { + /* E1..EF 80..BF 80..BF */ + if (!utf8_tail(b1) || !utf8_tail(b2)) { + return -1; + } + return 3; + } + + if (++p == end) { + /* Not enough bytes left */ + return -1; + } + b3 = *p; + + if (b0 == 0xF0) { + /* F0 90..BF 80..BF 80..BF */ + if (b1 < 0x90 || b1 > 0xBF || !utf8_tail(b2) || !utf8_tail(b3)) { + return -1; + } + return 4; + } else if (b0 <= 0xF3) { + /* F1..F3 80..BF 80..BF 80..BF */ + if (!utf8_tail(b1) || !utf8_tail(b2) || !utf8_tail(b3)) { + return -1; + } + return 4; + } else if (b0 == 0xF4) { + /* F4 80..8F 80..BF 80..BF */ + if (b1 < 0x80 || b1 > 0x8F || !utf8_tail(b2) || !utf8_tail(b3)) { + return -1; + } + return 4; + } + + return -1; +} + +static inline uint32_t +utf8_decode_unsafe_1(const uint8_t *data) +{ + return data[0]; +} + +static inline uint32_t +utf8_decode_unsafe_2(const uint8_t *data) +{ + uint32_t codepoint; + + codepoint = ((data[0] & 0x1F) << 6); + codepoint |= (data[1] & 0x3F); + + return codepoint; +} + +static inline uint32_t +utf8_decode_unsafe_3(const uint8_t *data) +{ + uint32_t codepoint; + + codepoint = ((data[0] & 0x0F) << 12); + codepoint |= (data[1] & 0x3F) << 6; + codepoint |= (data[2] & 0x3F); + + return codepoint; +} + +static inline uint32_t +utf8_decode_unsafe_4(const uint8_t *data) +{ + uint32_t codepoint; + + codepoint = ((data[0] & 0x07) << 18); + codepoint |= (data[1] & 0x3F) << 12; + codepoint |= (data[2] & 0x3F) << 6; + codepoint |= (data[3] & 0x3F); + + return codepoint; +} + +/* + * Encode a single Unicode codepoint as UTF-8. + * + * buf must have at least 4 bytes of space available (hence unsafe). + * + * \return Number of bytes appended to buf, or negative if encoding failed. + */ +static inline int +utf8_encode_unsafe(uint8_t *buf, uint32_t c) +{ + if (c <= 0x7F) { + buf[0] = c; + return 1; + } else if (c <= 0x7FF) { + buf[0] = 0xC0 | (c >> 6); + buf[1] = 0x80 | (c & 0x3F); + return 2; + } else if (c >= 0xD800 && c <= 0xDFFF) { + /* UTF-16 surrogate pairs - invalid in UTF-8 */ + return -1; + } else if (c <= 0xFFFF) { + buf[0] = 0xE0 | (c >> 12); + buf[1] = 0x80 | ((c >> 6) & 0x3F); + buf[2] = 0x80 | (c & 0x3F); + return 3; + } else if (c <= 0x10FFFF) { + buf[0] = 0xF0 | (c >> 18); + buf[1] = 0x80 | ((c >> 12) & 0x3F); + buf[2] = 0x80 | ((c >> 6) & 0x3F); + buf[3] = 0x80 | (c & 0x3F); + return 4; + } + return -1; +} + +static inline int +utf8_codepoint_len(uint32_t c) +{ + if (c <= 0x7F) { + return 1; + } else if (c <= 0x7FF) { + return 2; + } else if (c >= 0xD800 && c <= 0xDFFF) { + /* UTF-16 surrogate pairs - invalid in UTF-8 */ + return -1; + } else if (c <= 0xFFFF) { + return 3; + } else if (c <= 0x10FFFF) { + return 4; + } + return -1; +} + +static inline bool +utf16_valid_surrogate_high(uint32_t val) +{ + return val >= 0xD800 && val <= 0xDBFF; +} + +static inline bool +utf16_valid_surrogate_low(uint32_t val) +{ + return val >= 0xDC00 && val <= 0xDFFF; +} + +/* + * Check for a valid UTF-16LE encoding of a single codepoint. + * + * \return Length of valid UTF-16LE sequence in 16-bit code units, or negative if invalid. + */ +static inline int +utf16le_valid(const uint16_t *start, const uint16_t *end) +{ + const uint16_t *p = start; + uint16_t high, low; + + if (p == end) { + return 0; + } + + high = from_le16(p); + + if (high <= 0xD7FF || high >= 0xE000) { + /* Single code unit in BMP */ + return 1; + } + + if (high >= 0xDC00) { + /* Low surrogate in first code unit - invalid */ + return -1; + } + + assert(utf16_valid_surrogate_high(high)); + + if (++p == end) { + /* Not enough code units left */ + return -1; + } + low = from_le16(p); + + if (!utf16_valid_surrogate_low(low)) { + return -1; + } + + /* Valid surrogate pair */ + return 2; +} + +static inline uint32_t +utf16_decode_surrogate_pair(uint32_t high, uint32_t low) +{ + uint32_t codepoint; + + assert(utf16_valid_surrogate_high(high)); + assert(utf16_valid_surrogate_low(low)); + + codepoint = low; + codepoint &= 0x3FF; + codepoint |= ((high & 0x3FF) << 10); + codepoint += 0x10000; + + return codepoint; +} + +static inline void +utf16_encode_surrogate_pair(uint32_t codepoint, uint16_t *high, uint16_t *low) +{ + assert(codepoint >= 0x10000); + assert(codepoint <= 0x10FFFF); + + codepoint -= 0x10000; + *high = 0xD800 | (codepoint >> 10); + *low = 0xDC00 | (codepoint & 0x3FF); + + assert(utf16_valid_surrogate_high(*high)); + assert(utf16_valid_surrogate_low(*low)); +} + +#endif |