diff options
Diffstat (limited to 'js/public/CharacterEncoding.h')
-rw-r--r-- | js/public/CharacterEncoding.h | 448 |
1 files changed, 448 insertions, 0 deletions
diff --git a/js/public/CharacterEncoding.h b/js/public/CharacterEncoding.h new file mode 100644 index 0000000000..9d1df4664b --- /dev/null +++ b/js/public/CharacterEncoding.h @@ -0,0 +1,448 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * vim: set ts=8 sts=2 et sw=2 tw=80: + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef js_CharacterEncoding_h +#define js_CharacterEncoding_h + +#include "mozilla/Range.h" +#include "mozilla/Span.h" + +#include "js/TypeDecls.h" +#include "js/Utility.h" + +class JSLinearString; + +namespace mozilla { +union Utf8Unit; +} + +namespace JS { + +/* + * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI + * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each + * byte is treated as a 2-byte character, and there is no way to pass in a + * string containing characters beyond U+00FF. + */ +class Latin1Chars : public mozilla::Range<Latin1Char> { + typedef mozilla::Range<Latin1Char> Base; + + public: + using CharT = Latin1Char; + + Latin1Chars() = default; + Latin1Chars(char* aBytes, size_t aLength) + : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {} + Latin1Chars(const Latin1Char* aBytes, size_t aLength) + : Base(const_cast<Latin1Char*>(aBytes), aLength) {} + Latin1Chars(const char* aBytes, size_t aLength) + : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), + aLength) {} +}; + +/* + * Like Latin1Chars, but the chars are const. + */ +class ConstLatin1Chars : public mozilla::Range<const Latin1Char> { + typedef mozilla::Range<const Latin1Char> Base; + + public: + using CharT = Latin1Char; + + ConstLatin1Chars() = default; + ConstLatin1Chars(const Latin1Char* aChars, size_t aLength) + : Base(aChars, aLength) {} +}; + +/* + * A Latin1Chars, but with \0 termination for C compatibility. + */ +class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char> { + typedef mozilla::RangedPtr<Latin1Char> Base; + + public: + using CharT = Latin1Char; + + Latin1CharsZ() : Base(nullptr, 0) {} // NOLINT + + Latin1CharsZ(char* aBytes, size_t aLength) + : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) { + MOZ_ASSERT(aBytes[aLength] == '\0'); + } + + Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) { + MOZ_ASSERT(aBytes[aLength] == '\0'); + } + + using Base::operator=; + + char* c_str() { return reinterpret_cast<char*>(get()); } +}; + +class UTF8Chars : public mozilla::Range<unsigned char> { + typedef mozilla::Range<unsigned char> Base; + + public: + using CharT = unsigned char; + + UTF8Chars() = default; + UTF8Chars(char* aBytes, size_t aLength) + : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) {} + UTF8Chars(const char* aBytes, size_t aLength) + : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), + aLength) {} + UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength) + : UTF8Chars(reinterpret_cast<char*>(aUnits), aLength) {} + UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength) + : UTF8Chars(reinterpret_cast<const char*>(aUnits), aLength) {} +}; + +/* + * SpiderMonkey also deals directly with UTF-8 encoded text in some places. + */ +class UTF8CharsZ : public mozilla::RangedPtr<unsigned char> { + typedef mozilla::RangedPtr<unsigned char> Base; + + public: + using CharT = unsigned char; + + UTF8CharsZ() : Base(nullptr, 0) {} // NOLINT + + UTF8CharsZ(char* aBytes, size_t aLength) + : Base(reinterpret_cast<unsigned char*>(aBytes), aLength) { + MOZ_ASSERT(aBytes[aLength] == '\0'); + } + + UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) { + MOZ_ASSERT(aBytes[aLength] == '\0'); + } + + UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength) + : UTF8CharsZ(reinterpret_cast<char*>(aUnits), aLength) {} + + using Base::operator=; + + char* c_str() { return reinterpret_cast<char*>(get()); } +}; + +/* + * A wrapper for a "const char*" that is encoded using UTF-8. + * This class does not manage ownership of the data; that is left + * to others. This differs from UTF8CharsZ in that the chars are + * const and it disallows assignment. + */ +class JS_PUBLIC_API ConstUTF8CharsZ { + const char* data_; + + public: + using CharT = unsigned char; + + ConstUTF8CharsZ() : data_(nullptr) {} + + explicit ConstUTF8CharsZ(const char* aBytes) : data_(aBytes) { +#ifdef DEBUG + if (aBytes) { + validateWithoutLength(); + } +#endif + } + + ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) { + MOZ_ASSERT(aBytes[aLength] == '\0'); +#ifdef DEBUG + validate(aLength); +#endif + } + + const void* get() const { return data_; } + + const char* c_str() const { return data_; } + + explicit operator bool() const { return data_ != nullptr; } + + private: +#ifdef DEBUG + void validate(size_t aLength); + void validateWithoutLength(); +#endif +}; + +/* + * SpiderMonkey uses a 2-byte character representation: it is a + * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2, + * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a + * sufficiently dedicated JavaScript program to be fully unicode-aware by + * manually interpreting UTF-16 extension characters embedded in the JS + * string. + */ +class TwoByteChars : public mozilla::Range<char16_t> { + typedef mozilla::Range<char16_t> Base; + + public: + using CharT = char16_t; + + TwoByteChars() = default; + TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} + TwoByteChars(const char16_t* aChars, size_t aLength) + : Base(const_cast<char16_t*>(aChars), aLength) {} +}; + +/* + * A TwoByteChars, but \0 terminated for compatibility with JSFlatString. + */ +class TwoByteCharsZ : public mozilla::RangedPtr<char16_t> { + typedef mozilla::RangedPtr<char16_t> Base; + + public: + using CharT = char16_t; + + TwoByteCharsZ() : Base(nullptr, 0) {} // NOLINT + + TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) { + MOZ_ASSERT(chars[length] == '\0'); + } + + using Base::operator=; +}; + +typedef mozilla::RangedPtr<const char16_t> ConstCharPtr; + +/* + * Like TwoByteChars, but the chars are const. + */ +class ConstTwoByteChars : public mozilla::Range<const char16_t> { + typedef mozilla::Range<const char16_t> Base; + + public: + using CharT = char16_t; + + ConstTwoByteChars() = default; + ConstTwoByteChars(const char16_t* aChars, size_t aLength) + : Base(aChars, aLength) {} +}; + +/* + * Convert a 2-byte character sequence to "ISO-Latin-1". This works by + * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source + * contains any UTF-16 extension characters, then this may give invalid Latin1 + * output. The returned string is zero terminated. The returned string or the + * returned string's |start()| must be freed with JS_free or js_free, + * respectively. If allocation fails, an OOM error will be set and the method + * will return a nullptr chars (which can be tested for with the ! operator). + * This method cannot trigger GC. + */ +extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ( + JSContext* cx, const mozilla::Range<const char16_t>& tbchars); + +inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx, + const char16_t* begin, + size_t length) { + const mozilla::Range<const char16_t> tbchars(begin, length); + return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars); +} + +template <typename CharT, typename Allocator> +extern UTF8CharsZ CharsToNewUTF8CharsZ(Allocator* alloc, + const mozilla::Range<CharT>& chars); + +JS_PUBLIC_API char32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, + int utf8Length); + +/* + * Inflate bytes in UTF-8 encoding to char16_t. + * - On error, returns an empty TwoByteCharsZ. + * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold + * its length; the length value excludes the trailing null. + */ +extern JS_PUBLIC_API TwoByteCharsZ +UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8, + size_t* outlen, arena_id_t destArenaId); + +/* + * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ. + */ +extern JS_PUBLIC_API TwoByteCharsZ +UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, + size_t* outlen, arena_id_t destArenaId); + +/* + * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 + * characters will be replaced by \uFFFD. No exception will be thrown for + * malformed UTF-8 input. + */ +extern JS_PUBLIC_API TwoByteCharsZ +LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars& utf8, + size_t* outlen, arena_id_t destArenaId); + +extern JS_PUBLIC_API TwoByteCharsZ +LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, + size_t* outlen, arena_id_t destArenaId); + +/* + * Returns the length of the char buffer required to encode |s| as UTF8. + * Does not include the null-terminator. + */ +JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s); + +/* + * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is + * exhausted or too little space is available in |dst| to fit the scalar + * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return + * the number of bytes of |dst| that were filled. + * + * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already + * linear. + * + * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|, + * if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted + * if |dst.Length() >= JS_GetStringLength(str) * 2|. Otherwise |src| is + * always fully converted if |dst.Length() >= JS_GetStringLength(str) * 3|. + * + * The exact space required is always |GetDeflatedUTF8StringLength(str)|. + */ +JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src, + mozilla::Span<char> dst); + +/* + * The smallest character encoding capable of fully representing a particular + * string. + */ +enum class SmallestEncoding { ASCII, Latin1, UTF16 }; + +/* + * Returns the smallest encoding possible for the given string: if all + * codepoints are <128 then ASCII, otherwise if all codepoints are <256 + * Latin-1, else UTF16. + */ +JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(const UTF8Chars& utf8); + +/* + * Return a null-terminated Latin-1 string copied from the input string, + * storing its length (excluding null terminator) in |*outlen|. Fail and + * report an error if the string contains non-Latin-1 codepoints. Returns + * Latin1CharsZ() on failure. + */ +extern JS_PUBLIC_API Latin1CharsZ +UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars& utf8, size_t* outlen, + arena_id_t destArenaId); + +/* + * Return a null-terminated Latin-1 string copied from the input string, + * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1 + * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure. + */ +extern JS_PUBLIC_API Latin1CharsZ +LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars& utf8, + size_t* outlen, arena_id_t destArenaId); + +/* + * Returns true if all characters in the given null-terminated string are + * ASCII, i.e. < 0x80, false otherwise. + */ +extern JS_PUBLIC_API bool StringIsASCII(const char* s); + +/* + * Returns true if all characters in the given span are ASCII, + * i.e. < 0x80, false otherwise. + */ +extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s); + +/** + * Encode a narrow multibyte character string to a UTF-8 string. + * + * NOTE: Should only be used when interacting with POSIX/OS functions and not + * for encoding ASCII/Latin-1/etc. strings to UTF-8. + */ +extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx, + const char* chars); + +/** + * Encode a wide string to a UTF-8 string. + * + * NOTE: Should only be used when interacting with Windows API functions. + */ +extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx, + const wchar_t* chars); + +/** + * Encode a UTF-8 string to a narrow multibyte character string. + * + * NOTE: Should only be used when interacting with POSIX/OS functions and not + * for encoding UTF-8 to ASCII/Latin-1/etc. strings. + */ +extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx, + const char* chars); + +/** + * Encode a UTF-8 string to a wide string. + * + * NOTE: Should only be used when interacting with Windows API functions. + */ +extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx, + const char* chars); + +} // namespace JS + +inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); } +inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); } + +/** + * DEPRECATED + * + * Allocate memory sufficient to contain the characters of |str| truncated to + * Latin-1 and a trailing null terminator, fill the memory with the characters + * interpreted in that manner plus the null terminator, and return a pointer to + * the memory. + * + * This function *loses information* when it copies the characters of |str| if + * |str| contains code units greater than 0xFF. Additionally, users that + * depend on null-termination will misinterpret the copied characters if |str| + * contains any nulls. Avoid using this function if possible, because it will + * eventually be removed. + */ +extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx, + JSString* str); + +/** + * DEPRECATED + * + * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string. + * + * This function *loses information* when it copies the characters of |str| if + * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied + * instead. + * + * The returned string is also subject to misinterpretation if |str| contains + * any nulls (which are faithfully transcribed into the returned string, but + * which will implicitly truncate the string if it's passed to functions that + * expect null-terminated strings). + * + * Avoid using this function if possible, because we'll remove it once we can + * devise a better API for the task. + */ +extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8( + JSContext* cx, JS::Handle<JSString*> str); + +/** + * DEPRECATED + * + * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string. + * + * This function asserts in debug mode that the input string contains only + * ASCII characters. + * + * The returned string is also subject to misinterpretation if |str| contains + * any nulls (which are faithfully transcribed into the returned string, but + * which will implicitly truncate the string if it's passed to functions that + * expect null-terminated strings). + * + * Avoid using this function if possible, because we'll remove it once we can + * devise a better API for the task. + */ +extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx, + JSString* str); + +#endif /* js_CharacterEncoding_h */ |