diff options
Diffstat (limited to 'xpcom/string/nsUTF8Utils.h')
-rw-r--r-- | xpcom/string/nsUTF8Utils.h | 247 |
1 files changed, 247 insertions, 0 deletions
diff --git a/xpcom/string/nsUTF8Utils.h b/xpcom/string/nsUTF8Utils.h new file mode 100644 index 0000000000..0145011ec1 --- /dev/null +++ b/xpcom/string/nsUTF8Utils.h @@ -0,0 +1,247 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsUTF8Utils_h_ +#define nsUTF8Utils_h_ + +// NB: This code may be used from non-XPCOM code, in particular, the +// standalone updater executable. That is, this file may be used in +// two ways: if MOZILLA_INTERNAL_API is defined, this file will +// provide signatures for the Mozilla abstract string types. It will +// use XPCOM assertion/debugging macros, etc. + +#include <type_traits> + +#include "mozilla/Assertions.h" +#include "mozilla/EndianUtils.h" + +#include "nsCharTraits.h" + +#ifdef MOZILLA_INTERNAL_API +# define UTF8UTILS_WARNING(msg) NS_WARNING(msg) +#else +# define UTF8UTILS_WARNING(msg) +#endif + +class UTF8traits { + public: + static bool isASCII(char aChar) { return (aChar & 0x80) == 0x00; } + static bool isInSeq(char aChar) { return (aChar & 0xC0) == 0x80; } + static bool is2byte(char aChar) { return (aChar & 0xE0) == 0xC0; } + static bool is3byte(char aChar) { return (aChar & 0xF0) == 0xE0; } + static bool is4byte(char aChar) { return (aChar & 0xF8) == 0xF0; } + static bool is5byte(char aChar) { return (aChar & 0xFC) == 0xF8; } + static bool is6byte(char aChar) { return (aChar & 0xFE) == 0xFC; } + // return the number of bytes in a sequence beginning with aChar + static int bytes(char aChar) { + if (isASCII(aChar)) { + return 1; + } + if (is2byte(aChar)) { + return 2; + } + if (is3byte(aChar)) { + return 3; + } + if (is4byte(aChar)) { + return 4; + } + MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters"); + return 1; + } +}; + +/** + * Extract the next Unicode scalar value from the buffer and return it. The + * pointer passed in is advanced to the start of the next character in the + * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced + * over the maximal valid prefix and *aErr is set to true (if aErr is not + * null). + * + * Note: This method never sets *aErr to false to allow error accumulation + * across multiple calls. + * + * Precondition: *aBuffer < aEnd + */ +class UTF8CharEnumerator { + public: + static inline char32_t NextChar(const char** aBuffer, const char* aEnd, + bool* aErr = nullptr) { + MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); + MOZ_ASSERT(aEnd, "null end pointer"); + + const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer); + const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd); + + MOZ_ASSERT(p, "null buffer"); + MOZ_ASSERT(p < end, "Bogus range"); + + unsigned char first = *p; + ++p; + + if (MOZ_LIKELY(first < 0x80U)) { + *aBuffer = reinterpret_cast<const char*>(p); + return first; + } + + // Unsigned underflow is defined behavior + if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) { + *aBuffer = reinterpret_cast<const char*>(p); + if (aErr) { + *aErr = true; + } + return 0xFFFDU; + } + + unsigned char second = *p; + + if (first < 0xE0U) { + // Two-byte + if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) { + ++p; + *aBuffer = reinterpret_cast<const char*>(p); + return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU); + } + *aBuffer = reinterpret_cast<const char*>(p); + if (aErr) { + *aErr = true; + } + return 0xFFFDU; + } + + if (MOZ_LIKELY(first < 0xF0U)) { + // Three-byte + unsigned char lower = 0x80U; + unsigned char upper = 0xBFU; + if (first == 0xE0U) { + lower = 0xA0U; + } else if (first == 0xEDU) { + upper = 0x9FU; + } + if (MOZ_LIKELY(second >= lower && second <= upper)) { + ++p; + if (MOZ_LIKELY(p != end)) { + unsigned char third = *p; + if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) { + ++p; + *aBuffer = reinterpret_cast<const char*>(p); + return ((uint32_t(first) & 0xFU) << 12) | + ((uint32_t(second) & 0x3FU) << 6) | + (uint32_t(third) & 0x3FU); + } + } + } + *aBuffer = reinterpret_cast<const char*>(p); + if (aErr) { + *aErr = true; + } + return 0xFFFDU; + } + + // Four-byte + unsigned char lower = 0x80U; + unsigned char upper = 0xBFU; + if (first == 0xF0U) { + lower = 0x90U; + } else if (first == 0xF4U) { + upper = 0x8FU; + } + if (MOZ_LIKELY(second >= lower && second <= upper)) { + ++p; + if (MOZ_LIKELY(p != end)) { + unsigned char third = *p; + if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) { + ++p; + if (MOZ_LIKELY(p != end)) { + unsigned char fourth = *p; + if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) { + ++p; + *aBuffer = reinterpret_cast<const char*>(p); + return ((uint32_t(first) & 0x7U) << 18) | + ((uint32_t(second) & 0x3FU) << 12) | + ((uint32_t(third) & 0x3FU) << 6) | + (uint32_t(fourth) & 0x3FU); + } + } + } + } + } + *aBuffer = reinterpret_cast<const char*>(p); + if (aErr) { + *aErr = true; + } + return 0xFFFDU; + } +}; + +/** + * Extract the next Unicode scalar value from the buffer and return it. The + * pointer passed in is advanced to the start of the next character in the + * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over + * the unpaired surrogate and *aErr is set to true (if aErr is not null). + * + * Note: This method never sets *aErr to false to allow error accumulation + * across multiple calls. + * + * Precondition: *aBuffer < aEnd + */ +class UTF16CharEnumerator { + public: + static inline char32_t NextChar(const char16_t** aBuffer, + const char16_t* aEnd, bool* aErr = nullptr) { + MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); + MOZ_ASSERT(aEnd, "null end pointer"); + + const char16_t* p = *aBuffer; + + MOZ_ASSERT(p, "null buffer"); + MOZ_ASSERT(p < aEnd, "Bogus range"); + + char16_t c = *p++; + + // Let's use encoding_rs-style code golf here. + // Unsigned underflow is defined behavior + char16_t cMinusSurrogateStart = c - 0xD800U; + if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) { + *aBuffer = p; + return c; + } + if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) { + // High surrogate + if (MOZ_LIKELY(p != aEnd)) { + char16_t second = *p; + // Unsigned underflow is defined behavior + if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) { + *aBuffer = ++p; + return (uint32_t(c) << 10) + uint32_t(second) - + (((0xD800U << 10) - 0x10000U) + 0xDC00U); + } + } + } + // Unpaired surrogate + *aBuffer = p; + if (aErr) { + *aErr = true; + } + return 0xFFFDU; + } +}; + +template <typename Char, typename UnsignedT> +inline UnsignedT RewindToPriorUTF8Codepoint(const Char* utf8Chars, + UnsignedT index) { + static_assert(std::is_same_v<Char, char> || + std::is_same_v<Char, unsigned char> || + std::is_same_v<Char, signed char>, + "UTF-8 data must be in 8-bit units"); + static_assert(std::is_unsigned_v<UnsignedT>, "index type must be unsigned"); + while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80) --index; + + return index; +} + +#undef UTF8UTILS_WARNING + +#endif /* !defined(nsUTF8Utils_h_) */ |