diff options
Diffstat (limited to 'intl/components/src/ICU4CGlue.h')
-rw-r--r-- | intl/components/src/ICU4CGlue.h | 723 |
1 files changed, 723 insertions, 0 deletions
diff --git a/intl/components/src/ICU4CGlue.h b/intl/components/src/ICU4CGlue.h new file mode 100644 index 0000000000..aad04a6ef6 --- /dev/null +++ b/intl/components/src/ICU4CGlue.h @@ -0,0 +1,723 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef intl_components_ICUUtils_h +#define intl_components_ICUUtils_h + +#include "unicode/uenum.h" +#include "unicode/utypes.h" +#include "mozilla/Buffer.h" +#include "mozilla/DebugOnly.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/Utf8.h" +#include "mozilla/Vector.h" +#include "mozilla/intl/ICUError.h" + +// When building standalone js shell, it will include headers from +// intl/components if JS_HAS_INTL_API is true (the default value), but js shell +// won't include headers from XPCOM, so don't include nsTArray.h when building +// standalone js shell. +#ifndef JS_STANDALONE +# include "nsTArray.h" +#endif + +#include <cstring> +#include <iterator> +#include <stddef.h> +#include <stdint.h> +#include <string> +#include <string_view> + +struct UFormattedValue; +namespace mozilla::intl { + +template <typename CharType> +static inline CharType* AssertNullTerminatedString(Span<CharType> aSpan) { + // Intentionally check one past the last character, because we expect that the + // NUL character isn't part of the string. + MOZ_ASSERT(*(aSpan.data() + aSpan.size()) == '\0'); + + // Also ensure there aren't any other NUL characters within the string. + MOZ_ASSERT(std::char_traits<std::remove_const_t<CharType>>::length( + aSpan.data()) == aSpan.size()); + + return aSpan.data(); +} + +static inline const char* AssertNullTerminatedString(std::string_view aView) { + // Intentionally check one past the last character, because we expect that the + // NUL character isn't part of the string. + MOZ_ASSERT(*(aView.data() + aView.size()) == '\0'); + + // Also ensure there aren't any other NUL characters within the string. + MOZ_ASSERT(std::strlen(aView.data()) == aView.size()); + + return aView.data(); +} + +/** + * Map the "und" locale to an empty string, which ICU uses internally. + */ +static inline const char* IcuLocale(const char* aLocale) { + // Return the empty string if the input is exactly equal to the string "und". + const char* locale = aLocale; + if (!std::strcmp(locale, "und")) { + locale = ""; // ICU root locale + } + return locale; +} + +/** + * Ensure a locale is null-terminated, and map the "und" locale to an empty + * string, which ICU uses internally. + */ +static inline const char* IcuLocale(Span<const char> aLocale) { + return IcuLocale(AssertNullTerminatedString(aLocale)); +} + +/** + * Ensure a locale in the buffer is null-terminated, and map the "und" locale to + * an empty string, which ICU uses internally. + */ +static inline const char* IcuLocale(const Buffer<char>& aLocale) { + return IcuLocale(Span(aLocale.begin(), aLocale.Length() - 1)); +} + +using ICUResult = Result<Ok, ICUError>; + +/** + * Convert a UErrorCode to ICUError. This will correctly apply the OutOfMemory + * case. + */ +ICUError ToICUError(UErrorCode status); + +/** + * Convert a UErrorCode to ICUResult. This will correctly apply the OutOfMemory + * case. + */ +ICUResult ToICUResult(UErrorCode status); + +/** + * The ICU status can complain about a string not being terminated, but this + * is fine for this API, as it deals with the mozilla::Span that has a pointer + * and a length. + */ +static inline bool ICUSuccessForStringSpan(UErrorCode status) { + return U_SUCCESS(status) || status == U_STRING_NOT_TERMINATED_WARNING; +} + +/** + * This class enforces that the unified mozilla::intl methods match the + * const-ness of the underlying ICU4C API calls. const ICU4C APIs take a const + * pointer, while mutable ones take a non-const pointer. + * + * For const ICU4C calls use: + * ICUPointer::GetConst(). + * + * For non-const ICU4C calls use: + * ICUPointer::GetMut(). + * + * This will propagate the `const` specifier from the ICU4C API call to the + * unified method, and it will be enforced by the compiler. This helps ensures + * a consistence and correct implementation. + */ +template <typename T> +class ICUPointer { + public: + explicit ICUPointer(T* aPointer) : mPointer(aPointer) {} + + // Only allow moves of ICUPointers, no copies. + ICUPointer(ICUPointer&& other) noexcept = default; + ICUPointer& operator=(ICUPointer&& other) noexcept = default; + + // Implicitly take ownership of a raw pointer through copy assignment. + ICUPointer& operator=(T* aPointer) noexcept { + mPointer = aPointer; + return *this; + }; + + const T* GetConst() const { return const_cast<const T*>(mPointer); } + T* GetMut() { return mPointer; } + + explicit operator bool() const { return !!mPointer; } + + private: + T* mPointer; +}; + +/** + * Calling into ICU with the C-API can be a bit tricky. This function wraps up + * the relatively risky operations involving pointers, lengths, and buffers into + * a simpler call. This function accepts a lambda that performs the ICU call, + * and returns the length of characters in the buffer. When using a temporary + * stack-based buffer, the calls can often be done in one trip. However, if + * additional memory is needed, this function will call the C-API twice, in + * order to first get the size of the result, and then second to copy the result + * over to the buffer. + */ +template <typename ICUStringFunction, typename Buffer> +static ICUResult FillBufferWithICUCall(Buffer& buffer, + const ICUStringFunction& strFn) { + static_assert(std::is_same_v<typename Buffer::CharType, char16_t> || + std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, uint8_t>); + + UErrorCode status = U_ZERO_ERROR; + int32_t length = strFn(buffer.data(), buffer.capacity(), &status); + if (status == U_BUFFER_OVERFLOW_ERROR) { + MOZ_ASSERT(length >= 0); + + if (!buffer.reserve(length)) { + return Err(ICUError::OutOfMemory); + } + + status = U_ZERO_ERROR; + mozilla::DebugOnly<int32_t> length2 = strFn(buffer.data(), length, &status); + MOZ_ASSERT(length == length2); + } + if (!ICUSuccessForStringSpan(status)) { + return Err(ToICUError(status)); + } + + buffer.written(length); + + return Ok{}; +} + +/** + * Adaptor for mozilla::Vector to implement the Buffer interface. + */ +template <typename T, size_t N> +class VectorToBufferAdaptor { + mozilla::Vector<T, N>& vector; + + public: + using CharType = T; + + explicit VectorToBufferAdaptor(mozilla::Vector<T, N>& vector) + : vector(vector) {} + + T* data() { return vector.begin(); } + + size_t capacity() const { return vector.capacity(); } + + bool reserve(size_t length) { return vector.reserve(length); } + + void written(size_t length) { + mozilla::DebugOnly<bool> result = vector.resizeUninitialized(length); + MOZ_ASSERT(result); + } +}; + +/** + * An overload of FillBufferWithICUCall that accepts a mozilla::Vector rather + * than a Buffer. + */ +template <typename ICUStringFunction, size_t InlineSize, typename CharType> +static ICUResult FillBufferWithICUCall(Vector<CharType, InlineSize>& vector, + const ICUStringFunction& strFn) { + VectorToBufferAdaptor buffer(vector); + return FillBufferWithICUCall(buffer, strFn); +} + +#ifndef JS_STANDALONE +/** + * mozilla::intl APIs require sizeable buffers. This class abstracts over + * the nsTArray. + */ +template <typename T> +class nsTArrayToBufferAdapter { + public: + using CharType = T; + + // Do not allow copy or move. Move could be added in the future if needed. + nsTArrayToBufferAdapter(const nsTArrayToBufferAdapter&) = delete; + nsTArrayToBufferAdapter& operator=(const nsTArrayToBufferAdapter&) = delete; + + explicit nsTArrayToBufferAdapter(nsTArray<CharType>& aArray) + : mArray(aArray) {} + + /** + * Ensures the buffer has enough space to accommodate |size| elements. + */ + [[nodiscard]] bool reserve(size_t size) { + // Use fallible behavior here. + return mArray.SetCapacity(size, fallible); + } + + /** + * Returns the raw data inside the buffer. + */ + CharType* data() { return mArray.Elements(); } + + /** + * Returns the count of elements written into the buffer. + */ + size_t length() const { return mArray.Length(); } + + /** + * Returns the buffer's overall capacity. + */ + size_t capacity() const { return mArray.Capacity(); } + + /** + * Resizes the buffer to the given amount of written elements. + */ + void written(size_t amount) { + MOZ_ASSERT(amount <= mArray.Capacity()); + // This sets |mArray|'s internal size so that it matches how much was + // written. This is necessary because the write happens across FFI + // boundaries. + mArray.SetLengthAndRetainStorage(amount); + } + + private: + nsTArray<CharType>& mArray; +}; + +template <typename T, size_t N> +class AutoTArrayToBufferAdapter : public nsTArrayToBufferAdapter<T> { + using nsTArrayToBufferAdapter<T>::nsTArrayToBufferAdapter; +}; + +/** + * An overload of FillBufferWithICUCall that accepts a nsTArray. + */ +template <typename ICUStringFunction, typename CharType> +static ICUResult FillBufferWithICUCall(nsTArray<CharType>& array, + const ICUStringFunction& strFn) { + nsTArrayToBufferAdapter<CharType> buffer(array); + return FillBufferWithICUCall(buffer, strFn); +} + +template <typename ICUStringFunction, typename CharType, size_t N> +static ICUResult FillBufferWithICUCall(AutoTArray<CharType, N>& array, + const ICUStringFunction& strFn) { + AutoTArrayToBufferAdapter<CharType, N> buffer(array); + return FillBufferWithICUCall(buffer, strFn); +} +#endif + +/** + * Fill a UTF-8 or a UTF-16 buffer with a UTF-16 span. ICU4C mostly uses UTF-16 + * internally, but different consumers may have different situations with their + * buffers. + */ +template <typename Buffer> +[[nodiscard]] bool FillBuffer(Span<const char16_t> utf16Span, + Buffer& targetBuffer) { + static_assert(std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, unsigned char> || + std::is_same_v<typename Buffer::CharType, char16_t>); + + if constexpr (std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, unsigned char>) { + if (utf16Span.Length() & mozilla::tl::MulOverflowMask<3>::value) { + // Tripling the size of the buffer overflows the size_t. + return false; + } + + if (!targetBuffer.reserve(3 * utf16Span.Length())) { + return false; + } + + size_t amount = ConvertUtf16toUtf8( + utf16Span, Span(reinterpret_cast<char*>(targetBuffer.data()), + targetBuffer.capacity())); + + targetBuffer.written(amount); + } + if constexpr (std::is_same_v<typename Buffer::CharType, char16_t>) { + size_t amount = utf16Span.Length(); + if (!targetBuffer.reserve(amount)) { + return false; + } + for (size_t i = 0; i < amount; i++) { + targetBuffer.data()[i] = utf16Span[i]; + } + targetBuffer.written(amount); + } + + return true; +} + +/** + * Fill a UTF-8 or a UTF-16 buffer with a UTF-8 span. ICU4C mostly uses UTF-16 + * internally, but different consumers may have different situations with their + * buffers. + */ +template <typename Buffer> +[[nodiscard]] bool FillBuffer(Span<const char> utf8Span, Buffer& targetBuffer) { + static_assert(std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, unsigned char> || + std::is_same_v<typename Buffer::CharType, char16_t>); + + if constexpr (std::is_same_v<typename Buffer::CharType, char> || + std::is_same_v<typename Buffer::CharType, unsigned char>) { + size_t amount = utf8Span.Length(); + if (!targetBuffer.reserve(amount)) { + return false; + } + for (size_t i = 0; i < amount; i++) { + targetBuffer.data()[i] = + // Static cast in case of a mismatch between `unsigned char` and + // `char` + static_cast<typename Buffer::CharType>(utf8Span[i]); + } + targetBuffer.written(amount); + } + if constexpr (std::is_same_v<typename Buffer::CharType, char16_t>) { + if (!targetBuffer.reserve(utf8Span.Length() + 1)) { + return false; + } + + size_t amount = ConvertUtf8toUtf16( + utf8Span, Span(targetBuffer.data(), targetBuffer.capacity())); + + targetBuffer.written(amount); + } + + return true; +} + +/** + * It is convenient for callers to be able to pass in UTF-8 strings to the API. + * This function can be used to convert that to a stack-allocated UTF-16 + * mozilla::Vector that can then be passed into ICU calls. The string will be + * null terminated. + */ +template <size_t StackSize> +[[nodiscard]] static bool FillUTF16Vector( + Span<const char> utf8Span, + mozilla::Vector<char16_t, StackSize>& utf16TargetVec) { + // Per ConvertUtf8toUtf16: The length of aDest must be at least one greater + // than the length of aSource. This additional length will be used for null + // termination. + if (!utf16TargetVec.reserve(utf8Span.Length() + 1)) { + return false; + } + + // ConvertUtf8toUtf16 fills the buffer with the data, but the length of the + // vector is unchanged. + size_t length = ConvertUtf8toUtf16( + utf8Span, Span(utf16TargetVec.begin(), utf16TargetVec.capacity())); + + // Assert that the last element is free for writing a null terminator. + MOZ_ASSERT(length < utf16TargetVec.capacity()); + utf16TargetVec.begin()[length] = '\0'; + + // The call to resizeUninitialized notifies the vector of how much was written + // exclusive of the null terminated character. + return utf16TargetVec.resizeUninitialized(length); +} + +/** + * An iterable class that wraps calls to the ICU UEnumeration C API. + * + * Usage: + * + * // Make sure the range expression is non-temporary, otherwise there is a + * // risk of undefined behavior: + * auto result = Calendar::GetBcp47KeywordValuesForLocale("en-US"); + * + * for (auto name : result.unwrap()) { + * MOZ_ASSERT(name.unwrap(), "An iterable value exists".); + * } + */ +template <typename CharType, typename T, T(Mapper)(const CharType*, int32_t)> +class Enumeration { + public: + class Iterator; + friend class Iterator; + + // Transfer ownership of the UEnumeration in the move constructor. + Enumeration(Enumeration&& other) noexcept + : mUEnumeration(other.mUEnumeration) { + other.mUEnumeration = nullptr; + } + + // Transfer ownership of the UEnumeration in the move assignment operator. + Enumeration& operator=(Enumeration&& other) noexcept { + if (this == &other) { + return *this; + } + if (mUEnumeration) { + uenum_close(mUEnumeration); + } + mUEnumeration = other.mUEnumeration; + other.mUEnumeration = nullptr; + return *this; + } + + class Iterator { + Enumeration& mEnumeration; + // `Nothing` signifies that no enumeration has been loaded through ICU yet. + Maybe<int32_t> mIteration = Nothing{}; + const CharType* mNext = nullptr; + int32_t mNextLength = 0; + + public: + using value_type = const CharType*; + using reference = T; + using iterator_category = std::input_iterator_tag; + + explicit Iterator(Enumeration& aEnumeration, bool aIsBegin) + : mEnumeration(aEnumeration) { + if (aIsBegin) { + AdvanceUEnum(); + } + } + + Iterator& operator++() { + AdvanceUEnum(); + return *this; + } + + Iterator operator++(int) { + Iterator retval = *this; + ++(*this); + return retval; + } + + bool operator==(Iterator other) const { + return mIteration == other.mIteration; + } + + bool operator!=(Iterator other) const { return !(*this == other); } + + T operator*() const { + // Map the iterated value to something new. + return Mapper(mNext, mNextLength); + } + + private: + void AdvanceUEnum() { + if (mIteration.isNothing()) { + mIteration = Some(-1); + } + UErrorCode status = U_ZERO_ERROR; + if constexpr (std::is_same_v<CharType, char16_t>) { + mNext = uenum_unext(mEnumeration.mUEnumeration, &mNextLength, &status); + } else { + static_assert(std::is_same_v<CharType, char>, + "Only char16_t and char are supported by " + "mozilla::intl::Enumeration."); + mNext = uenum_next(mEnumeration.mUEnumeration, &mNextLength, &status); + } + if (U_FAILURE(status)) { + mNext = nullptr; + } + + if (mNext) { + (*mIteration)++; + } else { + // The iterator is complete. + mIteration = Nothing{}; + } + } + }; + + Iterator begin() { return Iterator(*this, true); } + Iterator end() { return Iterator(*this, false); } + + explicit Enumeration(UEnumeration* aUEnumeration) + : mUEnumeration(aUEnumeration) {} + + ~Enumeration() { + if (mUEnumeration) { + // Only close when the object is being destructed, not moved. + uenum_close(mUEnumeration); + } + } + + private: + UEnumeration* mUEnumeration = nullptr; +}; + +template <typename CharType> +Result<Span<const CharType>, InternalError> SpanMapper(const CharType* string, + int32_t length) { + // Return the raw value from this Iterator. + if (string == nullptr) { + return Err(InternalError{}); + } + MOZ_ASSERT(length >= 0); + return Span<const CharType>(string, static_cast<size_t>(length)); +} + +template <typename CharType> +using SpanResult = Result<Span<const CharType>, InternalError>; + +template <typename CharType> +using SpanEnumeration = Enumeration<CharType, SpanResult<CharType>, SpanMapper>; + +/** + * An iterable class that wraps calls to ICU's available locales API. + */ +template <int32_t(CountAvailable)(), const char*(GetAvailable)(int32_t)> +class AvailableLocalesEnumeration final { + // The overall count of available locales. + int32_t mLocalesCount = 0; + + public: + AvailableLocalesEnumeration() { mLocalesCount = CountAvailable(); } + + class Iterator { + public: + // std::iterator traits. + using iterator_category = std::input_iterator_tag; + using value_type = const char*; + using difference_type = ptrdiff_t; + using pointer = value_type*; + using reference = value_type&; + + private: + // The current position in the list of available locales. + int32_t mLocalesPos = 0; + + public: + explicit Iterator(int32_t aLocalesPos) : mLocalesPos(aLocalesPos) {} + + Iterator& operator++() { + mLocalesPos++; + return *this; + } + + Iterator operator++(int) { + Iterator result = *this; + ++(*this); + return result; + } + + bool operator==(const Iterator& aOther) const { + return mLocalesPos == aOther.mLocalesPos; + } + + bool operator!=(const Iterator& aOther) const { return !(*this == aOther); } + + value_type operator*() const { return GetAvailable(mLocalesPos); } + }; + + // std::iterator begin() and end() methods. + + /** + * Return an iterator pointing to the first available locale. + */ + Iterator begin() const { return Iterator(0); } + + /** + * Return an iterator pointing to one past the last available locale. + */ + Iterator end() const { return Iterator(mLocalesCount); } +}; + +/** + * A helper class to wrap calling ICU function in cpp file so we don't have to + * include the ICU header here. + */ +class FormattedResult { + protected: + static Result<Span<const char16_t>, ICUError> ToSpanImpl( + const UFormattedValue* value); +}; + +/** + * A RAII class to hold the formatted value of format result. + * + * The caller will need to create this AutoFormattedResult on the stack, with + * the following parameters: + * 1. Native ICU type. + * 2. An ICU function which opens the result. + * 3. An ICU function which can get the result as UFormattedValue. + * 4. An ICU function which closes the result. + * + * After the object is created, caller needs to call IsValid() method to check + * if the native object has been created properly, and then passes this + * object to other format interfaces. + * The format result will be stored in this object, the caller can use ToSpan() + * method to get the formatted string. + * + * The methods GetFormatted() and Value() are private methods since they expose + * native ICU types. If the caller wants to call these methods, the caller needs + * to register itself as a friend class in AutoFormattedResult. + * + * The formatted value and the native ICU object will be released once this + * class is destructed. + */ +template <typename T, T*(Open)(UErrorCode*), + const UFormattedValue*(GetValue)(const T*, UErrorCode*), + void(Close)(T*)> +class MOZ_RAII AutoFormattedResult : FormattedResult { + public: + AutoFormattedResult() { + mFormatted = Open(&mError); + if (U_FAILURE(mError)) { + mFormatted = nullptr; + } + } + ~AutoFormattedResult() { + if (mFormatted) { + Close(mFormatted); + } + } + + AutoFormattedResult(const AutoFormattedResult& other) = delete; + AutoFormattedResult& operator=(const AutoFormattedResult& other) = delete; + + AutoFormattedResult(AutoFormattedResult&& other) = delete; + AutoFormattedResult& operator=(AutoFormattedResult&& other) = delete; + + /** + * Check if the native UFormattedDateInterval was created successfully. + */ + bool IsValid() const { return !!mFormatted; } + + /** + * Get error code if IsValid() returns false. + */ + ICUError GetError() const { return ToICUError(mError); } + + /** + * Get the formatted result. + */ + Result<Span<const char16_t>, ICUError> ToSpan() const { + if (!IsValid()) { + return Err(GetError()); + } + + const UFormattedValue* value = Value(); + if (!value) { + return Err(ICUError::InternalError); + } + + return ToSpanImpl(value); + } + + private: + friend class DateIntervalFormat; + friend class ListFormat; + T* GetFormatted() const { return mFormatted; } + + const UFormattedValue* Value() const { + if (!IsValid()) { + return nullptr; + } + + UErrorCode status = U_ZERO_ERROR; + const UFormattedValue* value = GetValue(mFormatted, &status); + if (U_FAILURE(status)) { + return nullptr; + } + + return value; + }; + + T* mFormatted = nullptr; + UErrorCode mError = U_ZERO_ERROR; +}; +} // namespace mozilla::intl + +#endif /* intl_components_ICUUtils_h */ |