diff options
Diffstat (limited to '')
-rw-r--r-- | xpcom/string/nsCharTraits.h | 487 |
1 files changed, 487 insertions, 0 deletions
diff --git a/xpcom/string/nsCharTraits.h b/xpcom/string/nsCharTraits.h new file mode 100644 index 0000000000..9941d4996d --- /dev/null +++ b/xpcom/string/nsCharTraits.h @@ -0,0 +1,487 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsCharTraits_h___ +#define nsCharTraits_h___ + +#include <ctype.h> // for |EOF|, |WEOF| +#include <stdint.h> // for |uint32_t| +#include <string.h> // for |memcpy|, et al +#include "mozilla/MemoryChecking.h" + +// This file may be used (through nsUTF8Utils.h) from non-XPCOM code, in +// particular the standalone software updater. In that case stub out +// the macros provided by nsDebug.h which are only usable when linking XPCOM + +#ifdef NS_NO_XPCOM +# define NS_WARNING(msg) +# define NS_ASSERTION(cond, msg) +# define NS_ERROR(msg) +#else +# include "nsDebug.h" // for NS_ASSERTION +#endif + +/* + * Some macros for converting char16_t (UTF-16) to and from Unicode scalar + * values. + * + * Note that UTF-16 represents all Unicode scalar values up to U+10FFFF by + * using "surrogate pairs". These consist of a high surrogate, i.e. a code + * point in the range U+D800 - U+DBFF, and a low surrogate, i.e. a code point + * in the range U+DC00 - U+DFFF, like this: + * + * U+D800 U+DC00 = U+10000 + * U+D800 U+DC01 = U+10001 + * ... + * U+DBFF U+DFFE = U+10FFFE + * U+DBFF U+DFFF = U+10FFFF + * + * These surrogate code points U+D800 - U+DFFF are not themselves valid Unicode + * scalar values and are not well-formed UTF-16 except as high-surrogate / + * low-surrogate pairs. + */ + +#define PLANE1_BASE uint32_t(0x00010000) +// High surrogates are in the range 0xD800 -- OxDBFF +#define NS_IS_HIGH_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xD800) +// Low surrogates are in the range 0xDC00 -- 0xDFFF +#define NS_IS_LOW_SURROGATE(u) ((uint32_t(u) & 0xFFFFFC00) == 0xDC00) +// Easier to type than NS_IS_HIGH_SURROGATE && NS_IS_LOW_SURROGATE +#define NS_IS_SURROGATE_PAIR(h, l) \ + (NS_IS_HIGH_SURROGATE(h) && NS_IS_LOW_SURROGATE(l)) +// Faster than testing NS_IS_HIGH_SURROGATE || NS_IS_LOW_SURROGATE +#define IS_SURROGATE(u) ((uint32_t(u) & 0xFFFFF800) == 0xD800) + +// Everything else is not a surrogate: 0x000 -- 0xD7FF, 0xE000 -- 0xFFFF + +// N = (H - 0xD800) * 0x400 + 0x10000 + (L - 0xDC00) +// I wonder whether we could somehow assert that H is a high surrogate +// and L is a low surrogate +#define SURROGATE_TO_UCS4(h, l) \ + (((uint32_t(h) & 0x03FF) << 10) + (uint32_t(l) & 0x03FF) + PLANE1_BASE) + +// Extract surrogates from a UCS4 char +// Reference: the Unicode standard 4.0, section 3.9 +// Since (c - 0x10000) >> 10 == (c >> 10) - 0x0080 and +// 0xD7C0 == 0xD800 - 0x0080, +// ((c - 0x10000) >> 10) + 0xD800 can be simplified to +#define H_SURROGATE(c) char16_t(char16_t(uint32_t(c) >> 10) + char16_t(0xD7C0)) +// where it's to be noted that 0xD7C0 is not bitwise-OR'd +// but added. + +// Since 0x10000 & 0x03FF == 0, +// (c - 0x10000) & 0x03FF == c & 0x03FF so that +// ((c - 0x10000) & 0x03FF) | 0xDC00 is equivalent to +#define L_SURROGATE(c) \ + char16_t(char16_t(uint32_t(c) & uint32_t(0x03FF)) | char16_t(0xDC00)) + +#define IS_IN_BMP(ucs) (uint32_t(ucs) < PLANE1_BASE) +#define UCS2_REPLACEMENT_CHAR char16_t(0xFFFD) + +#define UCS_END uint32_t(0x00110000) +#define IS_VALID_CHAR(c) ((uint32_t(c) < UCS_END) && !IS_SURROGATE(c)) +#define ENSURE_VALID_CHAR(c) (IS_VALID_CHAR(c) ? (c) : UCS2_REPLACEMENT_CHAR) + +template <class CharT> +struct nsCharTraits {}; + +template <> +struct nsCharTraits<char16_t> { + typedef char16_t char_type; + typedef uint16_t unsigned_char_type; + typedef char incompatible_char_type; + + static char_type* const sEmptyBuffer; + + // integer representation of characters: + typedef int int_type; + + static char_type to_char_type(int_type aChar) { return char_type(aChar); } + + static int_type to_int_type(char_type aChar) { + return int_type(static_cast<unsigned_char_type>(aChar)); + } + + static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; } + + // |char_type| comparisons: + + static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; } + + static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; } + + // operations on s[n] arrays: + + static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) { + return static_cast<char_type*>( + memmove(aStr1, aStr2, aN * sizeof(char_type))); + } + + static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) { + return static_cast<char_type*>( + memcpy(aStr1, aStr2, aN * sizeof(char_type))); + } + + static void uninitialize(char_type* aStr, size_t aN) { +#ifdef DEBUG + memset(aStr, 0xE4, aN * sizeof(char_type)); +#endif + MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type)); + } + + static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) { + for (char_type* s = aStr1; aN--; ++s, ++aStr2) { + NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); + *s = static_cast<char_type>(*aStr2); + } + return aStr1; + } + + static int compare(const char_type* aStr1, const char_type* aStr2, + size_t aN) { + for (; aN--; ++aStr1, ++aStr2) { + if (!eq(*aStr1, *aStr2)) { + return to_int_type(*aStr1) - to_int_type(*aStr2); + } + } + + return 0; + } + + static int compareASCII(const char_type* aStr1, const char* aStr2, + size_t aN) { + for (; aN--; ++aStr1, ++aStr2) { + NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); + if (!eq_int_type(to_int_type(*aStr1), + to_int_type(static_cast<char_type>(*aStr2)))) { + return to_int_type(*aStr1) - + to_int_type(static_cast<char_type>(*aStr2)); + } + } + + return 0; + } + + static bool equalsLatin1(const char_type* aStr1, const char* aStr2, + const size_t aN) { + for (size_t i = aN; i > 0; --i, ++aStr1, ++aStr2) { + if (*aStr1 != static_cast<char_type>(*aStr2)) { + return false; + } + } + + return true; + } + + // this version assumes that s2 is null-terminated and s1 has length n. + // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, + // we return 1. + static int compareASCIINullTerminated(const char_type* aStr1, size_t aN, + const char* aStr2) { + for (; aN--; ++aStr1, ++aStr2) { + if (!*aStr2) { + return 1; + } + NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); + if (!eq_int_type(to_int_type(*aStr1), + to_int_type(static_cast<char_type>(*aStr2)))) { + return to_int_type(*aStr1) - + to_int_type(static_cast<char_type>(*aStr2)); + } + } + + if (*aStr2) { + return -1; + } + + return 0; + } + + /** + * Convert c to its lower-case form, but only if c is in the ASCII + * range. Otherwise leave it alone. + */ + static char_type ASCIIToLower(char_type aChar) { + if (aChar >= 'A' && aChar <= 'Z') { + return char_type(aChar + ('a' - 'A')); + } + + return aChar; + } + + static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2, + size_t aN) { + for (; aN--; ++aStr1, ++aStr2) { + NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); + NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), + "Unexpected uppercase character"); + char_type lower_s1 = ASCIIToLower(*aStr1); + if (lower_s1 != static_cast<char_type>(*aStr2)) { + return to_int_type(lower_s1) - + to_int_type(static_cast<char_type>(*aStr2)); + } + } + + return 0; + } + + // this version assumes that s2 is null-terminated and s1 has length n. + // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, + // we return 1. + static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1, + size_t aN, + const char* aStr2) { + for (; aN--; ++aStr1, ++aStr2) { + if (!*aStr2) { + return 1; + } + NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); + NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), + "Unexpected uppercase character"); + char_type lower_s1 = ASCIIToLower(*aStr1); + if (lower_s1 != static_cast<char_type>(*aStr2)) { + return to_int_type(lower_s1) - + to_int_type(static_cast<char_type>(*aStr2)); + } + } + + if (*aStr2) { + return -1; + } + + return 0; + } + + static size_t length(const char_type* aStr) { + size_t result = 0; + while (!eq(*aStr++, char_type(0))) { + ++result; + } + return result; + } + + static const char_type* find(const char_type* aStr, size_t aN, + char_type aChar) { + while (aN--) { + if (eq(*aStr, aChar)) { + return aStr; + } + ++aStr; + } + + return 0; + } +}; + +template <> +struct nsCharTraits<char> { + typedef char char_type; + typedef unsigned char unsigned_char_type; + typedef char16_t incompatible_char_type; + + static char_type* const sEmptyBuffer; + + // integer representation of characters: + + typedef int int_type; + + static char_type to_char_type(int_type aChar) { return char_type(aChar); } + + static int_type to_int_type(char_type aChar) { + return int_type(static_cast<unsigned_char_type>(aChar)); + } + + static bool eq_int_type(int_type aLhs, int_type aRhs) { return aLhs == aRhs; } + + // |char_type| comparisons: + + static bool eq(char_type aLhs, char_type aRhs) { return aLhs == aRhs; } + + static bool lt(char_type aLhs, char_type aRhs) { return aLhs < aRhs; } + + // operations on s[n] arrays: + + static char_type* move(char_type* aStr1, const char_type* aStr2, size_t aN) { + return static_cast<char_type*>( + memmove(aStr1, aStr2, aN * sizeof(char_type))); + } + + static char_type* copy(char_type* aStr1, const char_type* aStr2, size_t aN) { + return static_cast<char_type*>( + memcpy(aStr1, aStr2, aN * sizeof(char_type))); + } + + static void uninitialize(char_type* aStr, size_t aN) { +#ifdef DEBUG + memset(aStr, 0xE4, aN * sizeof(char_type)); +#endif + MOZ_MAKE_MEM_UNDEFINED(aStr, aN * sizeof(char_type)); + } + + static char_type* copyASCII(char_type* aStr1, const char* aStr2, size_t aN) { + return copy(aStr1, aStr2, aN); + } + + static int compare(const char_type* aStr1, const char_type* aStr2, + size_t aN) { + return memcmp(aStr1, aStr2, aN); + } + + static int compareASCII(const char_type* aStr1, const char* aStr2, + size_t aN) { +#ifdef DEBUG + for (size_t i = 0; i < aN; ++i) { + NS_ASSERTION(!(aStr2[i] & ~0x7F), "Unexpected non-ASCII character"); + } +#endif + return compare(aStr1, aStr2, aN); + } + + static bool equalsLatin1(const char_type* aStr1, const char* aStr2, + size_t aN) { + return memcmp(aStr1, aStr2, aN) == 0; + } + + // this version assumes that s2 is null-terminated and s1 has length n. + // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, + // we return 1. + static int compareASCIINullTerminated(const char_type* aStr1, size_t aN, + const char* aStr2) { + // can't use strcmp here because we don't want to stop when aStr1 + // contains a null + for (; aN--; ++aStr1, ++aStr2) { + if (!*aStr2) { + return 1; + } + NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); + if (*aStr1 != *aStr2) { + return to_int_type(*aStr1) - to_int_type(*aStr2); + } + } + + if (*aStr2) { + return -1; + } + + return 0; + } + + /** + * Convert c to its lower-case form, but only if c is ASCII. + */ + static char_type ASCIIToLower(char_type aChar) { + if (aChar >= 'A' && aChar <= 'Z') { + return char_type(aChar + ('a' - 'A')); + } + + return aChar; + } + + static int compareLowerCaseToASCII(const char_type* aStr1, const char* aStr2, + size_t aN) { + for (; aN--; ++aStr1, ++aStr2) { + NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); + NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), + "Unexpected uppercase character"); + char_type lower_s1 = ASCIIToLower(*aStr1); + if (lower_s1 != *aStr2) { + return to_int_type(lower_s1) - to_int_type(*aStr2); + } + } + return 0; + } + + // this version assumes that s2 is null-terminated and s1 has length n. + // if s1 is shorter than s2 then we return -1; if s1 is longer than s2, + // we return 1. + static int compareLowerCaseToASCIINullTerminated(const char_type* aStr1, + size_t aN, + const char* aStr2) { + for (; aN--; ++aStr1, ++aStr2) { + if (!*aStr2) { + return 1; + } + NS_ASSERTION(!(*aStr2 & ~0x7F), "Unexpected non-ASCII character"); + NS_ASSERTION(!(*aStr2 >= 'A' && *aStr2 <= 'Z'), + "Unexpected uppercase character"); + char_type lower_s1 = ASCIIToLower(*aStr1); + if (lower_s1 != *aStr2) { + return to_int_type(lower_s1) - to_int_type(*aStr2); + } + } + + if (*aStr2) { + return -1; + } + + return 0; + } + + static size_t length(const char_type* aStr) { return strlen(aStr); } + + static const char_type* find(const char_type* aStr, size_t aN, + char_type aChar) { + return reinterpret_cast<const char_type*>( + memchr(aStr, to_int_type(aChar), aN)); + } +}; + +template <class InputIterator> +struct nsCharSourceTraits { + typedef typename InputIterator::difference_type difference_type; + + static uint32_t readable_distance(const InputIterator& aFirst, + const InputIterator& aLast) { + // assumes single fragment + return uint32_t(aLast.get() - aFirst.get()); + } + + static const typename InputIterator::value_type* read( + const InputIterator& aIter) { + return aIter.get(); + } + + static void advance(InputIterator& aStr, difference_type aN) { + aStr.advance(aN); + } +}; + +template <class CharT> +struct nsCharSourceTraits<CharT*> { + typedef ptrdiff_t difference_type; + + static uint32_t readable_distance(CharT* aStr) { + return uint32_t(nsCharTraits<CharT>::length(aStr)); + // return numeric_limits<uint32_t>::max(); + } + + static uint32_t readable_distance(CharT* aFirst, CharT* aLast) { + return uint32_t(aLast - aFirst); + } + + static const CharT* read(CharT* aStr) { return aStr; } + + static void advance(CharT*& aStr, difference_type aN) { aStr += aN; } +}; + +template <class OutputIterator> +struct nsCharSinkTraits { + static void write(OutputIterator& aIter, + const typename OutputIterator::value_type* aStr, + uint32_t aN) { + aIter.write(aStr, aN); + } +}; + +template <class CharT> +struct nsCharSinkTraits<CharT*> { + static void write(CharT*& aIter, const CharT* aStr, uint32_t aN) { + nsCharTraits<CharT>::move(aIter, aStr, aN); + aIter += aN; + } +}; + +#endif // !defined(nsCharTraits_h___) |