/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- * vim: set ts=8 sts=2 et sw=2 tw=80: * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #ifndef js_CharacterEncoding_h #define js_CharacterEncoding_h #include "mozilla/Range.h" #include "mozilla/Span.h" #include "mozilla/Utf8.h" #include "js/TypeDecls.h" #include "js/Utility.h" class JSLinearString; namespace JS { /* * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each * byte is treated as a 2-byte character, and there is no way to pass in a * string containing characters beyond U+00FF. */ class Latin1Chars : public mozilla::Range { typedef mozilla::Range Base; public: using CharT = Latin1Char; Latin1Chars() = default; Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) {} Latin1Chars(const Latin1Char* aBytes, size_t aLength) : Base(const_cast(aBytes), aLength) {} Latin1Chars(const char* aBytes, size_t aLength) : Base(reinterpret_cast(const_cast(aBytes)), aLength) {} }; /* * Like Latin1Chars, but the chars are const. */ class ConstLatin1Chars : public mozilla::Range { typedef mozilla::Range Base; public: using CharT = Latin1Char; ConstLatin1Chars() = default; ConstLatin1Chars(const Latin1Char* aChars, size_t aLength) : Base(aChars, aLength) {} }; /* * A Latin1Chars, but with \0 termination for C compatibility. */ class Latin1CharsZ : public mozilla::RangedPtr { typedef mozilla::RangedPtr Base; public: using CharT = Latin1Char; Latin1CharsZ() : Base(nullptr, 0) {} // NOLINT Latin1CharsZ(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) { MOZ_ASSERT(aBytes[aLength] == '\0'); } Latin1CharsZ(Latin1Char* aBytes, size_t aLength) : Base(aBytes, aLength) { MOZ_ASSERT(aBytes[aLength] == '\0'); } using Base::operator=; char* c_str() { return reinterpret_cast(get()); } }; class UTF8Chars : public mozilla::Range { typedef mozilla::Range Base; public: using CharT = unsigned char; UTF8Chars() = default; UTF8Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) {} UTF8Chars(const char* aBytes, size_t aLength) : Base(reinterpret_cast(const_cast(aBytes)), aLength) {} UTF8Chars(mozilla::Utf8Unit* aUnits, size_t aLength) : UTF8Chars(reinterpret_cast(aUnits), aLength) {} UTF8Chars(const mozilla::Utf8Unit* aUnits, size_t aLength) : UTF8Chars(reinterpret_cast(aUnits), aLength) {} }; /* * Similar to UTF8Chars, but contains WTF-8. * https://simonsapin.github.io/wtf-8/ */ class WTF8Chars : public mozilla::Range { typedef mozilla::Range Base; public: using CharT = unsigned char; WTF8Chars() = default; WTF8Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) {} WTF8Chars(const char* aBytes, size_t aLength) : Base(reinterpret_cast(const_cast(aBytes)), aLength) {} }; /* * SpiderMonkey also deals directly with UTF-8 encoded text in some places. */ class UTF8CharsZ : public mozilla::RangedPtr { typedef mozilla::RangedPtr Base; public: using CharT = unsigned char; UTF8CharsZ() : Base(nullptr, 0) {} // NOLINT UTF8CharsZ(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) { MOZ_ASSERT(aBytes[aLength] == '\0'); } UTF8CharsZ(unsigned char* aBytes, size_t aLength) : Base(aBytes, aLength) { MOZ_ASSERT(aBytes[aLength] == '\0'); } UTF8CharsZ(mozilla::Utf8Unit* aUnits, size_t aLength) : UTF8CharsZ(reinterpret_cast(aUnits), aLength) {} using Base::operator=; char* c_str() { return reinterpret_cast(get()); } }; /* * A wrapper for a "const char*" that is encoded using UTF-8. * This class does not manage ownership of the data; that is left * to others. This differs from UTF8CharsZ in that the chars are * const and it disallows assignment. */ class JS_PUBLIC_API ConstUTF8CharsZ { const char* data_; public: using CharT = unsigned char; ConstUTF8CharsZ() : data_(nullptr) {} ConstUTF8CharsZ(const char* aBytes, size_t aLength) : data_(aBytes) { MOZ_ASSERT(aBytes[aLength] == '\0'); #ifdef DEBUG validate(aLength); #endif } const void* get() const { return data_; } const char* c_str() const { return data_; } explicit operator bool() const { return data_ != nullptr; } private: #ifdef DEBUG void validate(size_t aLength); #endif }; /* * SpiderMonkey uses a 2-byte character representation: it is a * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2, * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a * sufficiently dedicated JavaScript program to be fully unicode-aware by * manually interpreting UTF-16 extension characters embedded in the JS * string. */ class TwoByteChars : public mozilla::Range { typedef mozilla::Range Base; public: using CharT = char16_t; TwoByteChars() = default; TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast(aChars), aLength) {} }; /* * A TwoByteChars, but \0 terminated for compatibility with JSFlatString. */ class TwoByteCharsZ : public mozilla::RangedPtr { typedef mozilla::RangedPtr Base; public: using CharT = char16_t; TwoByteCharsZ() : Base(nullptr, 0) {} // NOLINT TwoByteCharsZ(char16_t* chars, size_t length) : Base(chars, length) { MOZ_ASSERT(chars[length] == '\0'); } using Base::operator=; }; typedef mozilla::RangedPtr ConstCharPtr; /* * Like TwoByteChars, but the chars are const. */ class ConstTwoByteChars : public mozilla::Range { typedef mozilla::Range Base; public: using CharT = char16_t; ConstTwoByteChars() = default; ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} }; /* * Convert a 2-byte character sequence to "ISO-Latin-1". This works by * truncating each 2-byte pair in the sequence to a 1-byte pair. If the source * contains any UTF-16 extension characters, then this may give invalid Latin1 * output. The returned string is zero terminated. The returned string or the * returned string's |start()| must be freed with JS_free or js_free, * respectively. If allocation fails, an OOM error will be set and the method * will return a nullptr chars (which can be tested for with the ! operator). * This method cannot trigger GC. */ extern Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ( JSContext* cx, const mozilla::Range tbchars); inline Latin1CharsZ LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx, const char16_t* begin, size_t length) { const mozilla::Range tbchars(begin, length); return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars); } template extern UTF8CharsZ CharsToNewUTF8CharsZ(JSContext* maybeCx, const mozilla::Range chars); JS_PUBLIC_API uint32_t Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length); /* * Inflate bytes in UTF-8 encoding to char16_t. * - On error, returns an empty TwoByteCharsZ. * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold * its length; the length value excludes the trailing null. */ extern JS_PUBLIC_API TwoByteCharsZ UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen, arena_id_t destArenaId); /* * Like UTF8CharsToNewTwoByteCharsZ, but for WTF8Chars. */ extern JS_PUBLIC_API TwoByteCharsZ WTF8CharsToNewTwoByteCharsZ(JSContext* cx, const WTF8Chars wtf8, size_t* outlen, arena_id_t destArenaId); /* * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ. */ extern JS_PUBLIC_API TwoByteCharsZ UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen, arena_id_t destArenaId); /* * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 * characters will be replaced by \uFFFD. No exception will be thrown for * malformed UTF-8 input. */ extern JS_PUBLIC_API TwoByteCharsZ LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen, arena_id_t destArenaId); extern JS_PUBLIC_API TwoByteCharsZ LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen, arena_id_t destArenaId); /* * Returns the length of the char buffer required to encode |s| as UTF8. * Does not include the null-terminator. */ JS_PUBLIC_API size_t GetDeflatedUTF8StringLength(JSLinearString* s); /* * Encode whole scalar values of |src| into |dst| as UTF-8 until |src| is * exhausted or too little space is available in |dst| to fit the scalar * value. Lone surrogates are converted to REPLACEMENT CHARACTER. Return * the number of bytes of |dst| that were filled. * * Use |JS_EncodeStringToUTF8BufferPartial| if your string isn't already * linear. * * Given |JSString* str = JS_FORGET_STRING_LINEARNESS(src)|, * if |JS::StringHasLatin1Chars(str)|, then |src| is always fully converted * if |dst.Length() >= JS_GetStringLength(str) * 2|. Otherwise |src| is * always fully converted if |dst.Length() >= JS_GetStringLength(str) * 3|. * * The exact space required is always |GetDeflatedUTF8StringLength(str)|. */ JS_PUBLIC_API size_t DeflateStringToUTF8Buffer(JSLinearString* src, mozilla::Span dst); /* * The smallest character encoding capable of fully representing a particular * string. */ enum class SmallestEncoding { ASCII, Latin1, UTF16 }; /* * Returns the smallest encoding possible for the given string: if all * codepoints are <128 then ASCII, otherwise if all codepoints are <256 * Latin-1, else UTF16. */ JS_PUBLIC_API SmallestEncoding FindSmallestEncoding(UTF8Chars utf8); /* * Return a null-terminated Latin-1 string copied from the input string, * storing its length (excluding null terminator) in |*outlen|. Fail and * report an error if the string contains non-Latin-1 codepoints. Returns * Latin1CharsZ() on failure. */ extern JS_PUBLIC_API Latin1CharsZ UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen, arena_id_t destArenaId); /* * Return a null-terminated Latin-1 string copied from the input string, * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1 * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure. */ extern JS_PUBLIC_API Latin1CharsZ LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen, arena_id_t destArenaId); /* * Returns true if all characters in the given null-terminated string are * ASCII, i.e. < 0x80, false otherwise. */ extern JS_PUBLIC_API bool StringIsASCII(const char* s); /* * Returns true if all characters in the given span are ASCII, * i.e. < 0x80, false otherwise. */ extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span s); } // namespace JS inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); } inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); } /** * DEPRECATED * * Allocate memory sufficient to contain the characters of |str| truncated to * Latin-1 and a trailing null terminator, fill the memory with the characters * interpreted in that manner plus the null terminator, and return a pointer to * the memory. * * This function *loses information* when it copies the characters of |str| if * |str| contains code units greater than 0xFF. Additionally, users that * depend on null-termination will misinterpret the copied characters if |str| * contains any nulls. Avoid using this function if possible, because it will * eventually be removed. */ extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToLatin1(JSContext* cx, JSString* str); /** * DEPRECATED * * Same behavior as JS_EncodeStringToLatin1(), but encode into a UTF-8 string. * * This function *loses information* when it copies the characters of |str| if * |str| contains invalid UTF-16: U+FFFD REPLACEMENT CHARACTER will be copied * instead. * * The returned string is also subject to misinterpretation if |str| contains * any nulls (which are faithfully transcribed into the returned string, but * which will implicitly truncate the string if it's passed to functions that * expect null-terminated strings). * * Avoid using this function if possible, because we'll remove it once we can * devise a better API for the task. */ extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToUTF8( JSContext* cx, JS::Handle str); /** * DEPRECATED * * Same behavior as JS_EncodeStringToLatin1(), but encode into an ASCII string. * * This function asserts in debug mode that the input string contains only * ASCII characters. * * The returned string is also subject to misinterpretation if |str| contains * any nulls (which are faithfully transcribed into the returned string, but * which will implicitly truncate the string if it's passed to functions that * expect null-terminated strings). * * Avoid using this function if possible, because we'll remove it once we can * devise a better API for the task. */ extern JS_PUBLIC_API JS::UniqueChars JS_EncodeStringToASCII(JSContext* cx, JSString* str); #endif /* js_CharacterEncoding_h */