From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 21:33:14 +0200 Subject: Adding upstream version 115.7.0esr. Signed-off-by: Daniel Baumann --- mfbt/Utf8.h | 591 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 591 insertions(+) create mode 100644 mfbt/Utf8.h (limited to 'mfbt/Utf8.h') diff --git a/mfbt/Utf8.h b/mfbt/Utf8.h new file mode 100644 index 0000000000..29b1883084 --- /dev/null +++ b/mfbt/Utf8.h @@ -0,0 +1,591 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* + * UTF-8-related functionality, including a type-safe structure representing a + * UTF-8 code unit. + */ + +#ifndef mozilla_Utf8_h +#define mozilla_Utf8_h + +#include "mozilla/Casting.h" // for mozilla::AssertedCast +#include "mozilla/Likely.h" // for MOZ_UNLIKELY +#include "mozilla/Maybe.h" // for mozilla::Maybe +#include "mozilla/Span.h" // for mozilla::Span +#include "mozilla/TextUtils.h" // for mozilla::IsAscii and via Latin1.h for + // encoding_rs_mem.h and MOZ_HAS_JSRUST. +#include "mozilla/Types.h" // for MFBT_API + +#include // for std::numeric_limits +#include // for CHAR_BIT +#include // for size_t +#include // for uint8_t + +#if MOZ_HAS_JSRUST() +// Can't include mozilla/Encoding.h here. +extern "C" { +// Declared as uint8_t instead of char to match declaration in another header. +size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len); +} +#else +namespace mozilla { +namespace detail { +extern MFBT_API bool IsValidUtf8(const void* aCodeUnits, size_t aCount); +}; // namespace detail +}; // namespace mozilla +#endif // MOZ_HAS_JSRUST + +namespace mozilla { + +union Utf8Unit; + +static_assert(CHAR_BIT == 8, + "Utf8Unit won't work so well with non-octet chars"); + +/** + * A code unit within a UTF-8 encoded string. (A code unit is the smallest + * unit within the Unicode encoding of a string. For UTF-8 this is an 8-bit + * number; for UTF-16 it would be a 16-bit number.) + * + * This is *not* the same as a single code point: in UTF-8, non-ASCII code + * points are constituted by multiple code units. + */ +union Utf8Unit { + private: + // Utf8Unit is a union wrapping a raw |char|. The C++ object model and C++ + // requirements as to how objects may be accessed with respect to their actual + // types (almost?) uniquely compel this choice. + // + // Our requirements for a UTF-8 code unit representation are: + // + // 1. It must be "compatible" with C++ character/string literals that use + // the UTF-8 encoding. Given a properly encoded C++ literal, you should + // be able to use |Utf8Unit| and friends to access it; given |Utf8Unit| + // and friends (particularly UnicodeData), you should be able to access + // C++ character types for their contents. + // 2. |Utf8Unit| and friends must convert to/from |char| and |char*| only by + // explicit operation. + // 3. |Utf8Unit| must participate in overload resolution and template type + // equivalence (that is, given |template class X|, when |X| and + // |X| are the same type) distinctly from the C++ character types. + // + // And a few nice-to-haves (at least for the moment): + // + // 4. The representation should use unsigned numbers, to avoid undefined + // behavior that can arise with signed types, and because Unicode code + // points and code units are unsigned. + // 5. |Utf8Unit| and friends should be convertible to/from |unsigned char| + // and |unsigned char*|, for APIs that (because of #4 above) use those + // types as the "natural" choice for UTF-8 data. + // + // #1 requires that |Utf8Unit| "incorporate" a C++ character type: one of + // |{,{un,}signed} char|.[0] |uint8_t| won't work because it might not be a + // C++ character type. + // + // #2 and #3 mean that |Utf8Unit| can't *be* such a type (or a typedef to one: + // typedefs don't generate *new* types, just type aliases). This requires a + // compound type. + // + // The ultimate representation (and character type in it) is constrained by + // C++14 [basic.lval]p10 that defines how objects may be accessed, with + // respect to the dynamic type in memory and the actual type used to access + // them. It reads: + // + // If a program attempts to access the stored value of an object + // through a glvalue of other than one of the following types the + // behavior is undefined: + // + // 1. the dynamic type of the object, + // 2. a cv-qualified version of the dynamic type of the object, + // ...other types irrelevant here... + // 3. an aggregate or union type that includes one of the + // aforementioned types among its elements or non-static data + // members (including, recursively, an element or non-static + // data member of a subaggregate or contained union), + // ...more irrelevant types... + // 4. a char or unsigned char type. + // + // Accessing (wrapped) UTF-8 data as |char|/|unsigned char| is allowed no + // matter the representation by #4. (Briefly set aside what values are seen.) + // (And #2 allows |const| on either the dynamic type or the accessing type.) + // (|signed char| is really only useful for small signed numbers, not + // characters, so we ignore it.) + // + // If we interpret contents as |char|/|unsigned char| contrary to the actual + // type stored there, what happens? C++14 [basic.fundamental]p1 requires + // character types be identically aligned/sized; C++14 [basic.fundamental]p3 + // requires |signed char| and |unsigned char| have the same value + // representation. C++ doesn't require identical bitwise representation, tho. + // Practically we could assume it, but this verges on C++ spec bits best not + // *relied* on for correctness, if possible. + // + // So we don't expose |Utf8Unit|'s contents as |unsigned char*|: only |char| + // and |char*|. Instead we safely expose |unsigned char| by fully-defined + // *integral conversion* (C++14 [conv.integral]p2). Integral conversion from + // |unsigned char| → |char| has only implementation-defined behavior. It'd be + // better not to depend on that, but given twos-complement won, it should be + // okay. (Also |unsigned char*| is awkward enough to work with for strings + // that it probably doesn't appear in string manipulation much anyway, only in + // places that should really use |Utf8Unit| directly.) + // + // The opposite direction -- interpreting |char| or |char*| data through + // |Utf8Unit| -- isn't tricky as long as |Utf8Unit| contains a |char| as + // decided above, using #3. An "aggregate or union" will work that contains a + // |char|. Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says + // aggregates must have "no private or protected non-static data members", and + // we want to keep the inner |char| hidden. So a |struct| is out, and only + // |union| remains. + // + // (Enums are not "an aggregate or union type", so [maybe surprisingly] we + // can't make |Utf8Unit| an enum class with |char| underlying type, because we + // are given no license to treat |char| memory as such an |enum|'s memory.) + // + // Therefore |Utf8Unit| is a union type with a |char| non-static data member. + // This satisfies all our requirements. It also supports the nice-to-haves of + // creating a |Utf8Unit| from an |unsigned char|, and being convertible to + // |unsigned char|. It doesn't satisfy the nice-to-haves of using an + // |unsigned char| internally, nor of letting us wrap an existing + // |unsigned char| or pointer to one. We probably *could* do these, if we + // were willing to rely harder on implementation-defined behaviors, but for + // now we privilege C++'s main character type over some conceptual purity. + // + // 0. There's a proposal for a UTF-8 character type distinct from the existing + // C++ narrow character types: + // + // http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html + // + // but it hasn't been standardized (and might never be), and none of the + // compilers we really care about have implemented it. Maybe someday we + // can change our implementation to it without too much trouble, if we're + // lucky... + char mValue = '\0'; + + public: + Utf8Unit() = default; + + explicit constexpr Utf8Unit(char aUnit) : mValue(aUnit) {} + + explicit constexpr Utf8Unit(unsigned char aUnit) + : mValue(static_cast(aUnit)) { + // Per the above comment, the prior cast is integral conversion with + // implementation-defined semantics, and we regretfully but unavoidably + // assume the conversion does what we want it to. + } + + constexpr bool operator==(const Utf8Unit& aOther) const { + return mValue == aOther.mValue; + } + + constexpr bool operator!=(const Utf8Unit& aOther) const { + return !(*this == aOther); + } + + /** Convert a UTF-8 code unit to a raw char. */ + constexpr char toChar() const { + // Only a |char| is ever permitted to be written into this location, so this + // is both permissible and returns the desired value. + return mValue; + } + + /** Convert a UTF-8 code unit to a raw unsigned char. */ + constexpr unsigned char toUnsignedChar() const { + // Per the above comment, this is well-defined integral conversion. + return static_cast(mValue); + } + + /** Convert a UTF-8 code unit to a uint8_t. */ + constexpr uint8_t toUint8() const { + // Per the above comment, this is well-defined integral conversion. + return static_cast(mValue); + } + + // We currently don't expose |&mValue|. |UnicodeData| sort of does, but + // that's a somewhat separate concern, justified in different comments in + // that other code. +}; + +/** + * Reinterpret the address of a UTF-8 code unit as |const unsigned char*|. + * + * Assuming proper backing has been set up, the resulting |const unsigned char*| + * may validly be dereferenced. + * + * No access is provided to mutate this underlying memory as |unsigned char|. + * Presently memory inside |Utf8Unit| is *only* stored as |char|, and we are + * loath to offer a way to write non-|char| data until absolutely necessary. + */ +inline const unsigned char* Utf8AsUnsignedChars(const Utf8Unit* aUnits) { + static_assert(sizeof(Utf8Unit) == sizeof(unsigned char), + "sizes must match to permissibly reinterpret_cast<>"); + static_assert(alignof(Utf8Unit) == alignof(unsigned char), + "alignment must match to permissibly reinterpret_cast<>"); + + // The static_asserts above only enable the reinterpret_cast<> to occur. + // + // Dereferencing the resulting pointer is a separate question. Any object's + // memory may be interpreted as |unsigned char| per C++11 [basic.lval]p10, but + // this doesn't guarantee what values will be observed. If |char| is + // implemented to act like |unsigned char|, we're good to go: memory for the + // |char| in |Utf8Unit| acts as we need. But if |char| is implemented to act + // like |signed char|, dereferencing produces the right value only if the + // |char| types all use two's-complement representation. Every modern + // compiler does this, and there's a C++ proposal to standardize it. + // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0907r0.html So + // *technically* this is implementation-defined -- but everyone does it and + // this behavior is being standardized. + return reinterpret_cast(aUnits); +} + +/** Returns true iff |aUnit| is an ASCII value. */ +constexpr bool IsAscii(Utf8Unit aUnit) { + return IsAscii(aUnit.toUnsignedChar()); +} + +/** + * Return true if the given span of memory consists of a valid UTF-8 + * string and false otherwise. + * + * The string *may* contain U+0000 NULL code points. + */ +inline bool IsUtf8(mozilla::Span aString) { +#if MOZ_HAS_JSRUST() + size_t length = aString.Length(); + const uint8_t* ptr = reinterpret_cast(aString.Elements()); + // For short strings, the function call is a pessimization, and the SIMD + // code won't have a chance to kick in anyway. + if (length < 16) { + for (size_t i = 0; i < length; i++) { + if (ptr[i] >= 0x80U) { + ptr += i; + length -= i; + goto end; + } + } + return true; + } +end: + return length == encoding_utf8_valid_up_to(ptr, length); +#else + return detail::IsValidUtf8(aString.Elements(), aString.Length()); +#endif +} + +#if MOZ_HAS_JSRUST() + +// See Latin1.h for conversions between Latin1 and UTF-8. + +/** + * Returns the index of the start of the first malformed byte + * sequence or the length of the string if there are none. + */ +inline size_t Utf8ValidUpTo(mozilla::Span aString) { + return encoding_utf8_valid_up_to( + reinterpret_cast(aString.Elements()), aString.Length()); +} + +/** + * Converts potentially-invalid UTF-16 to UTF-8 replacing lone surrogates + * with the REPLACEMENT CHARACTER. + * + * The length of aDest must be at least the length of aSource times three. + * + * Returns the number of code units written. + */ +inline size_t ConvertUtf16toUtf8(mozilla::Span aSource, + mozilla::Span aDest) { + return encoding_mem_convert_utf16_to_utf8( + aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); +} + +/** + * Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte + * sequences with the REPLACEMENT CHARACTER with potentially insufficient + * output space. + * + * Returns the number of code units read and the number of bytes written. + * + * If the output isn't large enough, not all input is consumed. + * + * The conversion is guaranteed to be complete if the length of aDest is + * at least the length of aSource times three. + * + * The output is always valid UTF-8 ending on scalar value boundary + * even in the case of partial conversion. + * + * The semantics of this function match the semantics of + * TextEncoder.encodeInto. + * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto + */ +inline std::tuple ConvertUtf16toUtf8Partial( + mozilla::Span aSource, mozilla::Span aDest) { + size_t srcLen = aSource.Length(); + size_t dstLen = aDest.Length(); + encoding_mem_convert_utf16_to_utf8_partial(aSource.Elements(), &srcLen, + aDest.Elements(), &dstLen); + return std::make_tuple(srcLen, dstLen); +} + +/** + * Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte + * sequences with the REPLACEMENT CHARACTER. + * + * Returns the number of code units written. + * + * The length of aDest must be at least one greater than the length of aSource + * even though the last slot isn't written to. + * + * If you know that the input is valid for sure, use + * UnsafeConvertValidUtf8toUtf16() instead. + */ +inline size_t ConvertUtf8toUtf16(mozilla::Span aSource, + mozilla::Span aDest) { + return encoding_mem_convert_utf8_to_utf16( + aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); +} + +/** + * Converts known-valid UTF-8 to UTF-16. If the input might be invalid, + * use ConvertUtf8toUtf16() or ConvertUtf8toUtf16WithoutReplacement() instead. + * + * Returns the number of code units written. + * + * The length of aDest must be at least the length of aSource. + */ +inline size_t UnsafeConvertValidUtf8toUtf16(mozilla::Span aSource, + mozilla::Span aDest) { + return encoding_mem_convert_str_to_utf16(aSource.Elements(), aSource.Length(), + aDest.Elements(), aDest.Length()); +} + +/** + * Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. + * + * Returns the number of code units written or `mozilla::Nothing` if the + * input was invalid. + * + * The length of the destination buffer must be at least the length of the + * source buffer. + * + * When the input was invalid, some output may have been written. + * + * If you know that the input is valid for sure, use + * UnsafeConvertValidUtf8toUtf16() instead. + */ +inline mozilla::Maybe ConvertUtf8toUtf16WithoutReplacement( + mozilla::Span aSource, mozilla::Span aDest) { + size_t written = encoding_mem_convert_utf8_to_utf16_without_replacement( + aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length()); + if (MOZ_UNLIKELY(written == std::numeric_limits::max())) { + return mozilla::Nothing(); + } + return mozilla::Some(written); +} + +#endif // MOZ_HAS_JSRUST + +/** + * Returns true iff |aUnit| is a UTF-8 trailing code unit matching the pattern + * 0b10xx'xxxx. + */ +inline bool IsTrailingUnit(Utf8Unit aUnit) { + return (aUnit.toUint8() & 0b1100'0000) == 0b1000'0000; +} + +/** + * Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter| + * that (initially) itself points one unit past |aLeadUnit|, and + * |const EndIter& aEnd| that denotes the end of the UTF-8 data when compared + * against |*aIter| using |aEnd - *aIter|: + * + * If |aLeadUnit| and subsequent code units computed using |*aIter| (up to + * |aEnd|) encode a valid code point -- not exceeding Unicode's range, not a + * surrogate, in shortest form -- then return Some(that code point) and advance + * |*aIter| past those code units. + * + * Otherwise decrement |*aIter| (so that it points at |aLeadUnit|) and return + * Nothing(). + * + * |Iter| and |EndIter| are generalized concepts most easily understood as if + * they were |const char*|, |const unsigned char*|, or |const Utf8Unit*|: + * iterators that when dereferenced can be used to construct a |Utf8Unit| and + * that can be compared and modified in certain limited ways. (Carefully note + * that this function mutates |*aIter|.) |Iter| and |EndIter| are template + * parameters to support more-complicated adaptor iterators. + * + * The template parameters after |Iter| allow users to implement custom handling + * for various forms of invalid UTF-8. A version of this function that defaults + * all such handling to no-ops is defined below this function. To learn how to + * define your own custom handling, consult the implementation of that function, + * which documents exactly how custom handler functors are invoked. + * + * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version + * of this function without the "Inline" suffix on the name. + */ +template +MOZ_ALWAYS_INLINE Maybe DecodeOneUtf8CodePointInline( + const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd, + OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits, + OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint, + OnNotShortestForm aOnNotShortestForm) { + MOZ_ASSERT(Utf8Unit((*aIter)[-1]) == aLeadUnit); + + char32_t n = aLeadUnit.toUint8(); + MOZ_ASSERT(!IsAscii(n)); + + // |aLeadUnit| determines the number of trailing code units in the code point + // and the bits of |aLeadUnit| that contribute to the code point's value. + uint8_t remaining; + uint32_t min; + if ((n & 0b1110'0000) == 0b1100'0000) { + remaining = 1; + min = 0x80; + n &= 0b0001'1111; + } else if ((n & 0b1111'0000) == 0b1110'0000) { + remaining = 2; + min = 0x800; + n &= 0b0000'1111; + } else if ((n & 0b1111'1000) == 0b1111'0000) { + remaining = 3; + min = 0x10000; + n &= 0b0000'0111; + } else { + *aIter -= 1; + aOnBadLeadUnit(); + return Nothing(); + } + + // If the code point would require more code units than remain, the encoding + // is invalid. + auto actual = aEnd - *aIter; + if (MOZ_UNLIKELY(actual < remaining)) { + *aIter -= 1; + aOnNotEnoughUnits(AssertedCast(actual + 1), remaining + 1); + return Nothing(); + } + + for (uint8_t i = 0; i < remaining; i++) { + const Utf8Unit unit(*(*aIter)++); + + // Every non-leading code unit in properly encoded UTF-8 has its high + // bit set and the next-highest bit unset. + if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) { + uint8_t unitsObserved = i + 1 + 1; + *aIter -= unitsObserved; + aOnBadTrailingUnit(unitsObserved); + return Nothing(); + } + + // The code point being encoded is the concatenation of all the + // unconstrained bits. + n = (n << 6) | (unit.toUint8() & 0b0011'1111); + } + + // UTF-16 surrogates and values outside the Unicode range are invalid. + if (MOZ_UNLIKELY(n > 0x10FFFF || (0xD800 <= n && n <= 0xDFFF))) { + uint8_t unitsObserved = remaining + 1; + *aIter -= unitsObserved; + aOnBadCodePoint(n, unitsObserved); + return Nothing(); + } + + // Overlong code points are also invalid. + if (MOZ_UNLIKELY(n < min)) { + uint8_t unitsObserved = remaining + 1; + *aIter -= unitsObserved; + aOnNotShortestForm(n, unitsObserved); + return Nothing(); + } + + return Some(n); +} + +/** + * Identical to the above function, but not forced to be instantiated inline -- + * the compiler is permitted to common up separate invocations if it chooses. + */ +template +inline Maybe DecodeOneUtf8CodePoint( + const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd, + OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits, + OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint, + OnNotShortestForm aOnNotShortestForm) { + return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, aOnBadLeadUnit, + aOnNotEnoughUnits, aOnBadTrailingUnit, + aOnBadCodePoint, aOnNotShortestForm); +} + +/** + * Like the always-inlined function above, but with no-op behavior from all + * trailing if-invalid notifier functors. + * + * This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version + * of this function without the "Inline" suffix on the name. + */ +template +MOZ_ALWAYS_INLINE Maybe DecodeOneUtf8CodePointInline( + const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd) { + // aOnBadLeadUnit is called when |aLeadUnit| itself is an invalid lead unit in + // a multi-unit code point. It is passed no arguments: the caller already has + // |aLeadUnit| on hand, so no need to provide it again. + auto onBadLeadUnit = []() {}; + + // aOnNotEnoughUnits is called when |aLeadUnit| properly indicates a code + // point length, but there aren't enough units from |*aIter| to |aEnd| to + // satisfy that length. It is passed the number of code units actually + // available (according to |aEnd - *aIter|) and the number of code units that + // |aLeadUnit| indicates are needed. Both numbers include the contribution + // of |aLeadUnit| itself: so |aUnitsAvailable <= 3|, |aUnitsNeeded <= 4|, and + // |aUnitsAvailable < aUnitsNeeded|. As above, it also is not passed the lead + // code unit. + auto onNotEnoughUnits = [](uint8_t aUnitsAvailable, uint8_t aUnitsNeeded) {}; + + // aOnBadTrailingUnit is called when one of the trailing code units implied by + // |aLeadUnit| doesn't match the 0b10xx'xxxx bit pattern that all UTF-8 + // trailing code units must satisfy. It is passed the total count of units + // observed (including |aLeadUnit|). The bad trailing code unit will + // conceptually be at |(*aIter)[aUnitsObserved - 1]| if this functor is + // called, and so |aUnitsObserved <= 4|. + auto onBadTrailingUnit = [](uint8_t aUnitsObserved) {}; + + // aOnBadCodePoint is called when a structurally-correct code point encoding + // is found, but the *value* that is encoded is not a valid code point: either + // because it exceeded the U+10FFFF Unicode maximum code point, or because it + // was a UTF-16 surrogate. It is passed the non-code point value and the + // number of code units used to encode it. + auto onBadCodePoint = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {}; + + // aOnNotShortestForm is called when structurally-correct encoding is found, + // but the encoded value should have been encoded in fewer code units (e.g. + // mis-encoding U+0000 as 0b1100'0000 0b1000'0000 in two code units instead of + // as 0b0000'0000). It is passed the mis-encoded code point (which will be + // valid and not a surrogate) and the count of code units that mis-encoded it. + auto onNotShortestForm = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) { + }; + + return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, onBadLeadUnit, + onNotEnoughUnits, onBadTrailingUnit, + onBadCodePoint, onNotShortestForm); +} + +/** + * Identical to the above function, but not forced to be instantiated inline -- + * the compiler/linker are allowed to common up separate invocations. + */ +template +inline Maybe DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit, + Iter* aIter, + const EndIter& aEnd) { + return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd); +} + +} // namespace mozilla + +#endif /* mozilla_Utf8_h */ -- cgit v1.2.3