diff options
Diffstat (limited to '')
-rw-r--r-- | mfbt/tests/TestUtf8.cpp | 755 |
1 files changed, 755 insertions, 0 deletions
diff --git a/mfbt/tests/TestUtf8.cpp b/mfbt/tests/TestUtf8.cpp new file mode 100644 index 0000000000..ee8810ab11 --- /dev/null +++ b/mfbt/tests/TestUtf8.cpp @@ -0,0 +1,755 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#define MOZ_PRETEND_NO_JSRUST 1 + +#include "mozilla/Utf8.h" + +#include "mozilla/ArrayUtils.h" +#include "mozilla/Assertions.h" +#include "mozilla/EnumSet.h" +#include "mozilla/IntegerRange.h" +#include "mozilla/Span.h" + +using mozilla::ArrayLength; +using mozilla::AsChars; +using mozilla::DecodeOneUtf8CodePoint; +using mozilla::EnumSet; +using mozilla::IntegerRange; +using mozilla::IsAscii; +using mozilla::IsUtf8; +using mozilla::Span; +using mozilla::Utf8Unit; + +// Disable the C++ 2a warning. See bug #1509926 +#if defined(__clang__) && (__clang_major__ >= 6) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wc++2a-compat" +#endif + +static void TestUtf8Unit() { + Utf8Unit c('A'); + MOZ_RELEASE_ASSERT(c.toChar() == 'A'); + MOZ_RELEASE_ASSERT(c == Utf8Unit('A')); + MOZ_RELEASE_ASSERT(c != Utf8Unit('B')); + MOZ_RELEASE_ASSERT(c.toUint8() == 0x41); + + unsigned char asUnsigned = 'A'; + MOZ_RELEASE_ASSERT(c.toUnsignedChar() == asUnsigned); + MOZ_RELEASE_ASSERT(Utf8Unit('B').toUnsignedChar() != asUnsigned); + + Utf8Unit first('@'); + Utf8Unit second('#'); + + MOZ_RELEASE_ASSERT(first != second); + + first = second; + MOZ_RELEASE_ASSERT(first == second); +} + +template <typename Char> +struct ToUtf8Units { + public: + explicit ToUtf8Units(const Char* aStart, const Char* aEnd) + : lead(Utf8Unit(aStart[0])), iter(aStart + 1), end(aEnd) { + MOZ_RELEASE_ASSERT(!IsAscii(aStart[0])); + } + + const Utf8Unit lead; + const Char* iter; + const Char* const end; +}; + +class AssertIfCalled { + public: + template <typename... Args> + void operator()(Args&&... aArgs) { + MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called"); + } +}; + +// NOTE: For simplicity in treating |aCharN| identically regardless whether it's +// a string literal or a more-generalized array, we require |aCharN| be +// null-terminated. + +template <typename Char, size_t N> +static void ExpectValidCodePoint(const Char (&aCharN)[N], + char32_t aExpectedCodePoint) { + MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0, + "array must be null-terminated for |aCharN + N - 1| to " + "compute the value of |aIter| as altered by " + "DecodeOneUtf8CodePoint"); + + ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1); + auto simple = + DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end); + MOZ_RELEASE_ASSERT(simple.isSome()); + MOZ_RELEASE_ASSERT(*simple == aExpectedCodePoint); + MOZ_RELEASE_ASSERT(simpleUnit.iter == simpleUnit.end); + + ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1); + auto complex = DecodeOneUtf8CodePoint( + complexUnit.lead, &complexUnit.iter, complexUnit.end, AssertIfCalled(), + AssertIfCalled(), AssertIfCalled(), AssertIfCalled(), AssertIfCalled()); + MOZ_RELEASE_ASSERT(complex.isSome()); + MOZ_RELEASE_ASSERT(*complex == aExpectedCodePoint); + MOZ_RELEASE_ASSERT(complexUnit.iter == complexUnit.end); +} + +enum class InvalidUtf8Reason { + BadLeadUnit, + NotEnoughUnits, + BadTrailingUnit, + BadCodePoint, + NotShortestForm, +}; + +template <typename Char, size_t N> +static void ExpectInvalidCodePointHelper(const Char (&aCharN)[N], + InvalidUtf8Reason aExpectedReason, + uint8_t aExpectedUnitsAvailable, + uint8_t aExpectedUnitsNeeded, + char32_t aExpectedBadCodePoint, + uint8_t aExpectedUnitsObserved) { + MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0, + "array must be null-terminated for |aCharN + N - 1| to " + "compute the value of |aIter| as altered by " + "DecodeOneUtf8CodePoint"); + + ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1); + auto simple = + DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end); + MOZ_RELEASE_ASSERT(simple.isNothing()); + MOZ_RELEASE_ASSERT(static_cast<const void*>(simpleUnit.iter) == aCharN); + + EnumSet<InvalidUtf8Reason> reasons; + uint8_t unitsAvailable; + uint8_t unitsNeeded; + char32_t badCodePoint; + uint8_t unitsObserved; + + struct OnNotShortestForm { + EnumSet<InvalidUtf8Reason>& reasons; + char32_t& badCodePoint; + uint8_t& unitsObserved; + + void operator()(char32_t aBadCodePoint, uint8_t aUnitsObserved) { + reasons += InvalidUtf8Reason::NotShortestForm; + badCodePoint = aBadCodePoint; + unitsObserved = aUnitsObserved; + } + }; + + ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1); + auto complex = DecodeOneUtf8CodePoint( + complexUnit.lead, &complexUnit.iter, complexUnit.end, + [&reasons]() { reasons += InvalidUtf8Reason::BadLeadUnit; }, + [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable, + uint8_t aUnitsNeeded) { + reasons += InvalidUtf8Reason::NotEnoughUnits; + unitsAvailable = aUnitsAvailable; + unitsNeeded = aUnitsNeeded; + }, + [&reasons, &unitsObserved](uint8_t aUnitsObserved) { + reasons += InvalidUtf8Reason::BadTrailingUnit; + unitsObserved = aUnitsObserved; + }, + [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint, + uint8_t aUnitsObserved) { + reasons += InvalidUtf8Reason::BadCodePoint; + badCodePoint = aBadCodePoint; + unitsObserved = aUnitsObserved; + }, + [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint, + uint8_t aUnitsObserved) { + reasons += InvalidUtf8Reason::NotShortestForm; + badCodePoint = aBadCodePoint; + unitsObserved = aUnitsObserved; + }); + MOZ_RELEASE_ASSERT(complex.isNothing()); + MOZ_RELEASE_ASSERT(static_cast<const void*>(complexUnit.iter) == aCharN); + + bool alreadyIterated = false; + for (InvalidUtf8Reason reason : reasons) { + MOZ_RELEASE_ASSERT(!alreadyIterated); + alreadyIterated = true; + + switch (reason) { + case InvalidUtf8Reason::BadLeadUnit: + break; + + case InvalidUtf8Reason::NotEnoughUnits: + MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable); + MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded); + break; + + case InvalidUtf8Reason::BadTrailingUnit: + MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); + break; + + case InvalidUtf8Reason::BadCodePoint: + MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint); + MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); + break; + + case InvalidUtf8Reason::NotShortestForm: + MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint); + MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); + break; + } + } +} + +// NOTE: For simplicity in treating |aCharN| identically regardless whether it's +// a string literal or a more-generalized array, we require |aCharN| be +// null-terminated in all these functions. + +template <typename Char, size_t N> +static void ExpectBadLeadUnit(const Char (&aCharN)[N]) { + ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadLeadUnit, 0xFF, + 0xFF, 0xFFFFFFFF, 0xFF); +} + +template <typename Char, size_t N> +static void ExpectNotEnoughUnits(const Char (&aCharN)[N], + uint8_t aExpectedUnitsAvailable, + uint8_t aExpectedUnitsNeeded) { + ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotEnoughUnits, + aExpectedUnitsAvailable, aExpectedUnitsNeeded, + 0xFFFFFFFF, 0xFF); +} + +template <typename Char, size_t N> +static void ExpectBadTrailingUnit(const Char (&aCharN)[N], + uint8_t aExpectedUnitsObserved) { + ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadTrailingUnit, 0xFF, + 0xFF, 0xFFFFFFFF, aExpectedUnitsObserved); +} + +template <typename Char, size_t N> +static void ExpectNotShortestForm(const Char (&aCharN)[N], + char32_t aExpectedBadCodePoint, + uint8_t aExpectedUnitsObserved) { + ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotShortestForm, 0xFF, + 0xFF, aExpectedBadCodePoint, + aExpectedUnitsObserved); +} + +template <typename Char, size_t N> +static void ExpectBadCodePoint(const Char (&aCharN)[N], + char32_t aExpectedBadCodePoint, + uint8_t aExpectedUnitsObserved) { + ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadCodePoint, 0xFF, + 0xFF, aExpectedBadCodePoint, + aExpectedUnitsObserved); +} + +static void TestIsUtf8() { + // Note we include the U+0000 NULL in this one -- and that's fine. + static const char asciiBytes[] = u8"How about a nice game of chess?"; + MOZ_RELEASE_ASSERT(IsUtf8(Span(asciiBytes, ArrayLength(asciiBytes)))); + + static const char endNonAsciiBytes[] = u8"Life is like a 🌯"; + MOZ_RELEASE_ASSERT( + IsUtf8(Span(endNonAsciiBytes, ArrayLength(endNonAsciiBytes) - 1))); + + static const unsigned char badLeading[] = {0x80}; + MOZ_RELEASE_ASSERT( + !IsUtf8(AsChars(Span(badLeading, ArrayLength(badLeading))))); + + // Byte-counts + + // 1 + static const char oneBytes[] = u8"A"; // U+0041 LATIN CAPITAL LETTER A + constexpr size_t oneBytesLen = ArrayLength(oneBytes); + static_assert(oneBytesLen == 2, "U+0041 plus nul"); + MOZ_RELEASE_ASSERT(IsUtf8(Span(oneBytes, oneBytesLen))); + + // 2 + static const char twoBytes[] = u8"؆"; // U+0606 ARABIC-INDIC CUBE ROOT + constexpr size_t twoBytesLen = ArrayLength(twoBytes); + static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul"); + MOZ_RELEASE_ASSERT(IsUtf8(Span(twoBytes, twoBytesLen))); + + ExpectValidCodePoint(twoBytes, 0x0606); + + // 3 + static const char threeBytes[] = u8"᨞"; // U+1A1E BUGINESE PALLAWA + constexpr size_t threeBytesLen = ArrayLength(threeBytes); + static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul"); + MOZ_RELEASE_ASSERT(IsUtf8(Span(threeBytes, threeBytesLen))); + + ExpectValidCodePoint(threeBytes, 0x1A1E); + + // 4 + static const char fourBytes[] = + u8"🁡"; // U+1F061 DOMINO TILE HORIZONTAL-06-06 + constexpr size_t fourBytesLen = ArrayLength(fourBytes); + static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul"); + MOZ_RELEASE_ASSERT(IsUtf8(Span(fourBytes, fourBytesLen))); + + ExpectValidCodePoint(fourBytes, 0x1F061); + + // Max code point + static const char maxCodePoint[] = u8""; // U+10FFFF + constexpr size_t maxCodePointLen = ArrayLength(maxCodePoint); + static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul"); + MOZ_RELEASE_ASSERT(IsUtf8(Span(maxCodePoint, maxCodePointLen))); + + ExpectValidCodePoint(maxCodePoint, 0x10FFFF); + + // One past max code point + static const unsigned char onePastMaxCodePoint[] = {0xF4, 0x90, 0x80, 0x80, + 0x0}; + constexpr size_t onePastMaxCodePointLen = ArrayLength(onePastMaxCodePoint); + MOZ_RELEASE_ASSERT( + !IsUtf8(AsChars(Span(onePastMaxCodePoint, onePastMaxCodePointLen)))); + + ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4); + + // Surrogate-related testing + + // (Note that the various code unit sequences here are null-terminated to + // simplify life for ExpectValidCodePoint, which presumes null termination.) + + static const unsigned char justBeforeSurrogates[] = {0xED, 0x9F, 0xBF, 0x0}; + constexpr size_t justBeforeSurrogatesLen = + ArrayLength(justBeforeSurrogates) - 1; + MOZ_RELEASE_ASSERT( + IsUtf8(AsChars(Span(justBeforeSurrogates, justBeforeSurrogatesLen)))); + + ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF); + + static const unsigned char leastSurrogate[] = {0xED, 0xA0, 0x80, 0x0}; + constexpr size_t leastSurrogateLen = ArrayLength(leastSurrogate) - 1; + MOZ_RELEASE_ASSERT(!IsUtf8(AsChars(Span(leastSurrogate, leastSurrogateLen)))); + + ExpectBadCodePoint(leastSurrogate, 0xD800, 3); + + static const unsigned char arbitraryHighSurrogate[] = {0xED, 0xA2, 0x87, 0x0}; + constexpr size_t arbitraryHighSurrogateLen = + ArrayLength(arbitraryHighSurrogate) - 1; + MOZ_RELEASE_ASSERT(!IsUtf8( + AsChars(Span(arbitraryHighSurrogate, arbitraryHighSurrogateLen)))); + + ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3); + + static const unsigned char arbitraryLowSurrogate[] = {0xED, 0xB7, 0xAF, 0x0}; + constexpr size_t arbitraryLowSurrogateLen = + ArrayLength(arbitraryLowSurrogate) - 1; + MOZ_RELEASE_ASSERT( + !IsUtf8(AsChars(Span(arbitraryLowSurrogate, arbitraryLowSurrogateLen)))); + + ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3); + + static const unsigned char greatestSurrogate[] = {0xED, 0xBF, 0xBF, 0x0}; + constexpr size_t greatestSurrogateLen = ArrayLength(greatestSurrogate) - 1; + MOZ_RELEASE_ASSERT( + !IsUtf8(AsChars(Span(greatestSurrogate, greatestSurrogateLen)))); + + ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3); + + static const unsigned char justAfterSurrogates[] = {0xEE, 0x80, 0x80, 0x0}; + constexpr size_t justAfterSurrogatesLen = + ArrayLength(justAfterSurrogates) - 1; + MOZ_RELEASE_ASSERT( + IsUtf8(AsChars(Span(justAfterSurrogates, justAfterSurrogatesLen)))); + + ExpectValidCodePoint(justAfterSurrogates, 0xE000); +} + +static void TestDecodeOneValidUtf8CodePoint() { + // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that + // consist of multiple code units, so there are no ASCII tests below. + + // Length two. + + ExpectValidCodePoint(u8"", 0x80); // <control> + ExpectValidCodePoint(u8"©", 0xA9); // COPYRIGHT SIGN + ExpectValidCodePoint(u8"¶", 0xB6); // PILCROW SIGN + ExpectValidCodePoint(u8"¾", 0xBE); // VULGAR FRACTION THREE QUARTERS + ExpectValidCodePoint(u8"÷", 0xF7); // DIVISION SIGN + ExpectValidCodePoint(u8"ÿ", 0xFF); // LATIN SMALL LETTER Y WITH DIAERESIS + ExpectValidCodePoint(u8"Ā", 0x100); // LATIN CAPITAL LETTER A WITH MACRON + ExpectValidCodePoint(u8"IJ", 0x132); // LATIN CAPITAL LETTER LIGATURE IJ + ExpectValidCodePoint(u8"ͼ", 0x37C); // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL + ExpectValidCodePoint(u8"Ӝ", + 0x4DC); // CYRILLIC CAPITAL LETTER ZHE WITTH DIAERESIS + ExpectValidCodePoint(u8"۩", 0x6E9); // ARABIC PLACE OF SAJDAH + ExpectValidCodePoint(u8"߿", 0x7FF); // <not assigned> + + // Length three. + + ExpectValidCodePoint(u8"ࠀ", 0x800); // SAMARITAN LETTER ALAF + ExpectValidCodePoint(u8"ࡁ", 0x841); // MANDAIC LETTER AB + ExpectValidCodePoint(u8"ࣿ", 0x8FF); // ARABIC MARK SIDEWAYS NOON GHUNNA + ExpectValidCodePoint(u8"ஆ", 0xB86); // TAMIL LETTER AA + ExpectValidCodePoint(u8"༃", + 0xF03); // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA + ExpectValidCodePoint( + u8"࿉", + 0xFC9); // TIBETAN SYMBOL NOR BU (but on my system it really looks like + // SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me) + ExpectValidCodePoint(u8"ဪ", 0x102A); // MYANMAR LETTER AU + ExpectValidCodePoint(u8"ᚏ", 0x168F); // OGHAM LETTER RUIS + ExpectValidCodePoint("\xE2\x80\xA8", 0x2028); // (the hated) LINE SEPARATOR + ExpectValidCodePoint("\xE2\x80\xA9", + 0x2029); // (the hated) PARAGRAPH SEPARATOR + ExpectValidCodePoint(u8"☬", 0x262C); // ADI SHAKTI + ExpectValidCodePoint(u8"㊮", 0x32AE); // CIRCLED IDEOGRAPH RESOURCE + ExpectValidCodePoint(u8"㏖", 0x33D6); // SQUARE MOL + ExpectValidCodePoint(u8"ꔄ", 0xA504); // VAI SYLLABLE WEEN + ExpectValidCodePoint(u8"ퟕ", 0xD7D5); // HANGUL JONGSEONG RIEUL-SSANGKIYEOK + ExpectValidCodePoint(u8"", 0xD7FF); // <not assigned> + ExpectValidCodePoint(u8"", 0xE000); // <Private Use> + ExpectValidCodePoint(u8"鱗", 0xF9F2); // CJK COMPATIBILITY IDEOGRAPH-F9F + ExpectValidCodePoint( + u8"﷽", 0xFDFD); // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM + ExpectValidCodePoint(u8"", 0xFFFF); // <not assigned> + + // Length four. + ExpectValidCodePoint(u8"𐀀", 0x10000); // LINEAR B SYLLABLE B008 A + ExpectValidCodePoint(u8"𔑀", 0x14440); // ANATOLIAN HIEROGLYPH A058 + ExpectValidCodePoint(u8"𝛗", 0x1D6D7); // MATHEMATICAL BOLD SMALL PHI + ExpectValidCodePoint(u8"💩", 0x1F4A9); // PILE OF POO + ExpectValidCodePoint(u8"🔫", 0x1F52B); // PISTOL + ExpectValidCodePoint(u8"🥌", 0x1F94C); // CURLING STONE + ExpectValidCodePoint(u8"🥏", 0x1F94F); // FLYING DISC + ExpectValidCodePoint(u8"𠍆", 0x20346); // CJK UNIFIED IDEOGRAPH-20346 + ExpectValidCodePoint(u8"𡠺", 0x2183A); // CJK UNIFIED IDEOGRAPH-2183A + ExpectValidCodePoint(u8"", 0x417F6); // <not assigned> + ExpectValidCodePoint(u8"", 0x7E836); // <not assigned> + ExpectValidCodePoint(u8"", 0xFEF67); // <Plane 15 Private Use> + ExpectValidCodePoint(u8"", 0x10FFFF); // +} + +static void TestDecodeBadLeadUnit() { + // These tests are actually exhaustive. + + unsigned char badLead[] = {'\0', '\0'}; + + for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) { + badLead[0] = lead; + ExpectBadLeadUnit(badLead); + } + + { + uint8_t lead = 0b1111'1000; + do { + badLead[0] = lead; + ExpectBadLeadUnit(badLead); + if (lead == 0b1111'1111) { + break; + } + + lead++; + } while (true); + } +} + +static void TestTooFewOrBadTrailingUnits() { + // Lead unit indicates a two-byte code point. + + char truncatedTwo[] = {'\0', '\0'}; + char badTrailTwo[] = {'\0', '\0', '\0'}; + + for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) { + truncatedTwo[0] = lead; + ExpectNotEnoughUnits(truncatedTwo, 1, 2); + + badTrailTwo[0] = lead; + for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) { + badTrailTwo[1] = trail; + ExpectBadTrailingUnit(badTrailTwo, 2); + } + + for (uint8_t trail : IntegerRange(0b1100'0000, 0b1111'1111)) { + badTrailTwo[1] = trail; + ExpectBadTrailingUnit(badTrailTwo, 2); + } + } + + // Lead unit indicates a three-byte code point. + + char truncatedThreeOne[] = {'\0', '\0'}; + char truncatedThreeTwo[] = {'\0', '\0', '\0'}; + unsigned char badTrailThree[] = {'\0', '\0', '\0', '\0'}; + + for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) { + truncatedThreeOne[0] = lead; + ExpectNotEnoughUnits(truncatedThreeOne, 1, 3); + + truncatedThreeTwo[0] = lead; + ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3); + + badTrailThree[0] = lead; + badTrailThree[2] = 0b1011'1111; // make valid to test overreads + for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) { + badTrailThree[1] = mid; + ExpectBadTrailingUnit(badTrailThree, 2); + } + { + uint8_t mid = 0b1100'0000; + do { + badTrailThree[1] = mid; + ExpectBadTrailingUnit(badTrailThree, 2); + if (mid == 0b1111'1111) { + break; + } + + mid++; + } while (true); + } + + badTrailThree[1] = 0b1011'1111; + for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) { + badTrailThree[2] = last; + ExpectBadTrailingUnit(badTrailThree, 3); + } + { + uint8_t last = 0b1100'0000; + do { + badTrailThree[2] = last; + ExpectBadTrailingUnit(badTrailThree, 3); + if (last == 0b1111'1111) { + break; + } + + last++; + } while (true); + } + } + + // Lead unit indicates a four-byte code point. + + char truncatedFourOne[] = {'\0', '\0'}; + char truncatedFourTwo[] = {'\0', '\0', '\0'}; + char truncatedFourThree[] = {'\0', '\0', '\0', '\0'}; + + unsigned char badTrailFour[] = {'\0', '\0', '\0', '\0', '\0'}; + + for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) { + truncatedFourOne[0] = lead; + ExpectNotEnoughUnits(truncatedFourOne, 1, 4); + + truncatedFourTwo[0] = lead; + ExpectNotEnoughUnits(truncatedFourTwo, 2, 4); + + truncatedFourThree[0] = lead; + ExpectNotEnoughUnits(truncatedFourThree, 3, 4); + + badTrailFour[0] = lead; + badTrailFour[2] = badTrailFour[3] = 0b1011'1111; // test for overreads + for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) { + badTrailFour[1] = second; + ExpectBadTrailingUnit(badTrailFour, 2); + } + { + uint8_t second = 0b1100'0000; + do { + badTrailFour[1] = second; + ExpectBadTrailingUnit(badTrailFour, 2); + if (second == 0b1111'1111) { + break; + } + + second++; + } while (true); + } + + badTrailFour[1] = badTrailFour[3] = 0b1011'1111; // test for overreads + for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) { + badTrailFour[2] = third; + ExpectBadTrailingUnit(badTrailFour, 3); + } + { + uint8_t third = 0b1100'0000; + do { + badTrailFour[2] = third; + ExpectBadTrailingUnit(badTrailFour, 3); + if (third == 0b1111'1111) { + break; + } + + third++; + } while (true); + } + + badTrailFour[2] = 0b1011'1111; + for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) { + badTrailFour[3] = fourth; + ExpectBadTrailingUnit(badTrailFour, 4); + } + { + uint8_t fourth = 0b1100'0000; + do { + badTrailFour[3] = fourth; + ExpectBadTrailingUnit(badTrailFour, 4); + if (fourth == 0b1111'1111) { + break; + } + + fourth++; + } while (true); + } + } +} + +static void TestBadSurrogate() { + // These tests are actually exhaustive. + + ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF); // last before surrogates + ExpectValidCodePoint("\xEE\x80\x80", 0xE000); // first after surrogates + + // First invalid surrogate encoding is { 0xED, 0xA0, 0x80 }. Last invalid + // surrogate encoding is { 0xED, 0xBF, 0xBF }. + + char badSurrogate[] = {'\xED', '\0', '\0', '\0'}; + + for (char32_t c = 0xD800; c < 0xE000; c++) { + badSurrogate[1] = 0b1000'0000 ^ ((c & 0b1111'1100'0000) >> 6); + badSurrogate[2] = 0b1000'0000 ^ ((c & 0b0000'0011'1111)); + + ExpectBadCodePoint(badSurrogate, c, 3); + } +} + +static void TestBadTooBig() { + // These tests are actually exhaustive. + + ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF); // last code point + + // Four-byte code points are + // + // 0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx + // + // with 3 + 6 + 6 + 6 == 21 unconstrained bytes, so the structurally + // representable limit (exclusive) is 2**21 - 1 == 2097152. + + char tooLargeCodePoint[] = {'\0', '\0', '\0', '\0', '\0'}; + + for (char32_t c = 0x11'0000; c < (1 << 21); c++) { + tooLargeCodePoint[0] = + 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); + tooLargeCodePoint[1] = + 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); + tooLargeCodePoint[2] = + 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); + tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); + + ExpectBadCodePoint(tooLargeCodePoint, c, 4); + } +} + +static void TestBadCodePoint() { + TestBadSurrogate(); + TestBadTooBig(); +} + +static void TestNotShortestForm() { + { + // One-byte in two-byte. + + char oneInTwo[] = {'\0', '\0', '\0'}; + + for (char32_t c = '\0'; c < 0x80; c++) { + oneInTwo[0] = 0b1100'0000 ^ ((c & 0b0111'1100'0000) >> 6); + oneInTwo[1] = 0b1000'0000 ^ ((c & 0b0000'0011'1111)); + + ExpectNotShortestForm(oneInTwo, c, 2); + } + + // One-byte in three-byte. + + char oneInThree[] = {'\0', '\0', '\0', '\0'}; + + for (char32_t c = '\0'; c < 0x80; c++) { + oneInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12); + oneInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6); + oneInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111)); + + ExpectNotShortestForm(oneInThree, c, 3); + } + + // One-byte in four-byte. + + char oneInFour[] = {'\0', '\0', '\0', '\0', '\0'}; + + for (char32_t c = '\0'; c < 0x80; c++) { + oneInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); + oneInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); + oneInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); + oneInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); + + ExpectNotShortestForm(oneInFour, c, 4); + } + } + + { + // Two-byte in three-byte. + + char twoInThree[] = {'\0', '\0', '\0', '\0'}; + + for (char32_t c = 0x80; c < 0x800; c++) { + twoInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12); + twoInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6); + twoInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111)); + + ExpectNotShortestForm(twoInThree, c, 3); + } + + // Two-byte in four-byte. + + char twoInFour[] = {'\0', '\0', '\0', '\0', '\0'}; + + for (char32_t c = 0x80; c < 0x800; c++) { + twoInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); + twoInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); + twoInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); + twoInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); + + ExpectNotShortestForm(twoInFour, c, 4); + } + } + + { + // Three-byte in four-byte. + + char threeInFour[] = {'\0', '\0', '\0', '\0', '\0'}; + + for (char32_t c = 0x800; c < 0x1'0000; c++) { + threeInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); + threeInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); + threeInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); + threeInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); + + ExpectNotShortestForm(threeInFour, c, 4); + } + } +} + +static void TestDecodeOneInvalidUtf8CodePoint() { + TestDecodeBadLeadUnit(); + TestTooFewOrBadTrailingUnits(); + TestBadCodePoint(); + TestNotShortestForm(); +} + +static void TestDecodeOneUtf8CodePoint() { + TestDecodeOneValidUtf8CodePoint(); + TestDecodeOneInvalidUtf8CodePoint(); +} + +int main() { + TestUtf8Unit(); + TestIsUtf8(); + TestDecodeOneUtf8CodePoint(); + return 0; +} + +#if defined(__clang__) && (__clang_major__ >= 6) +# pragma clang diagnostic pop +#endif |