/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #define MOZ_PRETEND_NO_JSRUST 1 #include "mozilla/Utf8.h" #include "mozilla/ArrayUtils.h" #include "mozilla/Assertions.h" #include "mozilla/EnumSet.h" #include "mozilla/IntegerRange.h" #include "mozilla/TextUtils.h" using mozilla::ArrayLength; using mozilla::AsChars; using mozilla::DecodeOneUtf8CodePoint; using mozilla::EnumSet; using mozilla::IntegerRange; using mozilla::IsAscii; using mozilla::IsUtf8; using mozilla::Span; using mozilla::Utf8Unit; // Disable the C++ 2a warning. See bug #1509926 #if defined(__clang__) && (__clang_major__ >= 6) # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wc++2a-compat" #endif static void TestUtf8Unit() { Utf8Unit c('A'); MOZ_RELEASE_ASSERT(c.toChar() == 'A'); MOZ_RELEASE_ASSERT(c == Utf8Unit('A')); MOZ_RELEASE_ASSERT(c != Utf8Unit('B')); MOZ_RELEASE_ASSERT(c.toUint8() == 0x41); unsigned char asUnsigned = 'A'; MOZ_RELEASE_ASSERT(c.toUnsignedChar() == asUnsigned); MOZ_RELEASE_ASSERT(Utf8Unit('B').toUnsignedChar() != asUnsigned); Utf8Unit first('@'); Utf8Unit second('#'); MOZ_RELEASE_ASSERT(first != second); first = second; MOZ_RELEASE_ASSERT(first == second); } template struct ToUtf8Units { public: explicit ToUtf8Units(const Char* aStart, const Char* aEnd) : lead(Utf8Unit(aStart[0])), iter(aStart + 1), end(aEnd) { MOZ_RELEASE_ASSERT(!IsAscii(aStart[0])); } const Utf8Unit lead; const Char* iter; const Char* const end; }; class AssertIfCalled { public: template void operator()(Args&&... aArgs) { MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called"); } }; // NOTE: For simplicity in treating |aCharN| identically regardless whether it's // a string literal or a more-generalized array, we require |aCharN| be // null-terminated. template static void ExpectValidCodePoint(const Char (&aCharN)[N], char32_t aExpectedCodePoint) { MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0, "array must be null-terminated for |aCharN + N - 1| to " "compute the value of |aIter| as altered by " "DecodeOneUtf8CodePoint"); ToUtf8Units simpleUnit(aCharN, aCharN + N - 1); auto simple = DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end); MOZ_RELEASE_ASSERT(simple.isSome()); MOZ_RELEASE_ASSERT(*simple == aExpectedCodePoint); MOZ_RELEASE_ASSERT(simpleUnit.iter == simpleUnit.end); ToUtf8Units complexUnit(aCharN, aCharN + N - 1); auto complex = DecodeOneUtf8CodePoint( complexUnit.lead, &complexUnit.iter, complexUnit.end, AssertIfCalled(), AssertIfCalled(), AssertIfCalled(), AssertIfCalled(), AssertIfCalled()); MOZ_RELEASE_ASSERT(complex.isSome()); MOZ_RELEASE_ASSERT(*complex == aExpectedCodePoint); MOZ_RELEASE_ASSERT(complexUnit.iter == complexUnit.end); } enum class InvalidUtf8Reason { BadLeadUnit, NotEnoughUnits, BadTrailingUnit, BadCodePoint, NotShortestForm, }; template static void ExpectInvalidCodePointHelper(const Char (&aCharN)[N], InvalidUtf8Reason aExpectedReason, uint8_t aExpectedUnitsAvailable, uint8_t aExpectedUnitsNeeded, char32_t aExpectedBadCodePoint, uint8_t aExpectedUnitsObserved) { MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0, "array must be null-terminated for |aCharN + N - 1| to " "compute the value of |aIter| as altered by " "DecodeOneUtf8CodePoint"); ToUtf8Units simpleUnit(aCharN, aCharN + N - 1); auto simple = DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end); MOZ_RELEASE_ASSERT(simple.isNothing()); MOZ_RELEASE_ASSERT(static_cast(simpleUnit.iter) == aCharN); EnumSet reasons; uint8_t unitsAvailable; uint8_t unitsNeeded; char32_t badCodePoint; uint8_t unitsObserved; struct OnNotShortestForm { EnumSet& reasons; char32_t& badCodePoint; uint8_t& unitsObserved; void operator()(char32_t aBadCodePoint, uint8_t aUnitsObserved) { reasons += InvalidUtf8Reason::NotShortestForm; badCodePoint = aBadCodePoint; unitsObserved = aUnitsObserved; } }; ToUtf8Units complexUnit(aCharN, aCharN + N - 1); auto complex = DecodeOneUtf8CodePoint( complexUnit.lead, &complexUnit.iter, complexUnit.end, [&reasons]() { reasons += InvalidUtf8Reason::BadLeadUnit; }, [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable, uint8_t aUnitsNeeded) { reasons += InvalidUtf8Reason::NotEnoughUnits; unitsAvailable = aUnitsAvailable; unitsNeeded = aUnitsNeeded; }, [&reasons, &unitsObserved](uint8_t aUnitsObserved) { reasons += InvalidUtf8Reason::BadTrailingUnit; unitsObserved = aUnitsObserved; }, [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint, uint8_t aUnitsObserved) { reasons += InvalidUtf8Reason::BadCodePoint; badCodePoint = aBadCodePoint; unitsObserved = aUnitsObserved; }, [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint, uint8_t aUnitsObserved) { reasons += InvalidUtf8Reason::NotShortestForm; badCodePoint = aBadCodePoint; unitsObserved = aUnitsObserved; }); MOZ_RELEASE_ASSERT(complex.isNothing()); MOZ_RELEASE_ASSERT(static_cast(complexUnit.iter) == aCharN); bool alreadyIterated = false; for (InvalidUtf8Reason reason : reasons) { MOZ_RELEASE_ASSERT(!alreadyIterated); alreadyIterated = true; switch (reason) { case InvalidUtf8Reason::BadLeadUnit: break; case InvalidUtf8Reason::NotEnoughUnits: MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable); MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded); break; case InvalidUtf8Reason::BadTrailingUnit: MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); break; case InvalidUtf8Reason::BadCodePoint: MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint); MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); break; case InvalidUtf8Reason::NotShortestForm: MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint); MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved); break; } } } // NOTE: For simplicity in treating |aCharN| identically regardless whether it's // a string literal or a more-generalized array, we require |aCharN| be // null-terminated in all these functions. template static void ExpectBadLeadUnit(const Char (&aCharN)[N]) { ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadLeadUnit, 0xFF, 0xFF, 0xFFFFFFFF, 0xFF); } template static void ExpectNotEnoughUnits(const Char (&aCharN)[N], uint8_t aExpectedUnitsAvailable, uint8_t aExpectedUnitsNeeded) { ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotEnoughUnits, aExpectedUnitsAvailable, aExpectedUnitsNeeded, 0xFFFFFFFF, 0xFF); } template static void ExpectBadTrailingUnit(const Char (&aCharN)[N], uint8_t aExpectedUnitsObserved) { ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadTrailingUnit, 0xFF, 0xFF, 0xFFFFFFFF, aExpectedUnitsObserved); } template static void ExpectNotShortestForm(const Char (&aCharN)[N], char32_t aExpectedBadCodePoint, uint8_t aExpectedUnitsObserved) { ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotShortestForm, 0xFF, 0xFF, aExpectedBadCodePoint, aExpectedUnitsObserved); } template static void ExpectBadCodePoint(const Char (&aCharN)[N], char32_t aExpectedBadCodePoint, uint8_t aExpectedUnitsObserved) { ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadCodePoint, 0xFF, 0xFF, aExpectedBadCodePoint, aExpectedUnitsObserved); } static void TestIsUtf8() { // Note we include the U+0000 NULL in this one -- and that's fine. static const char asciiBytes[] = u8"How about a nice game of chess?"; MOZ_RELEASE_ASSERT(IsUtf8(Span(asciiBytes, ArrayLength(asciiBytes)))); static const char endNonAsciiBytes[] = u8"Life is like a 🌯"; MOZ_RELEASE_ASSERT( IsUtf8(Span(endNonAsciiBytes, ArrayLength(endNonAsciiBytes) - 1))); static const unsigned char badLeading[] = {0x80}; MOZ_RELEASE_ASSERT( !IsUtf8(AsChars(Span(badLeading, ArrayLength(badLeading))))); // Byte-counts // 1 static const char oneBytes[] = u8"A"; // U+0041 LATIN CAPITAL LETTER A constexpr size_t oneBytesLen = ArrayLength(oneBytes); static_assert(oneBytesLen == 2, "U+0041 plus nul"); MOZ_RELEASE_ASSERT(IsUtf8(Span(oneBytes, oneBytesLen))); // 2 static const char twoBytes[] = u8"؆"; // U+0606 ARABIC-INDIC CUBE ROOT constexpr size_t twoBytesLen = ArrayLength(twoBytes); static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul"); MOZ_RELEASE_ASSERT(IsUtf8(Span(twoBytes, twoBytesLen))); ExpectValidCodePoint(twoBytes, 0x0606); // 3 static const char threeBytes[] = u8"᨞"; // U+1A1E BUGINESE PALLAWA constexpr size_t threeBytesLen = ArrayLength(threeBytes); static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul"); MOZ_RELEASE_ASSERT(IsUtf8(Span(threeBytes, threeBytesLen))); ExpectValidCodePoint(threeBytes, 0x1A1E); // 4 static const char fourBytes[] = u8"🁡"; // U+1F061 DOMINO TILE HORIZONTAL-06-06 constexpr size_t fourBytesLen = ArrayLength(fourBytes); static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul"); MOZ_RELEASE_ASSERT(IsUtf8(Span(fourBytes, fourBytesLen))); ExpectValidCodePoint(fourBytes, 0x1F061); // Max code point static const char maxCodePoint[] = u8"􏿿"; // U+10FFFF constexpr size_t maxCodePointLen = ArrayLength(maxCodePoint); static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul"); MOZ_RELEASE_ASSERT(IsUtf8(Span(maxCodePoint, maxCodePointLen))); ExpectValidCodePoint(maxCodePoint, 0x10FFFF); // One past max code point static const unsigned char onePastMaxCodePoint[] = {0xF4, 0x90, 0x80, 0x80, 0x0}; constexpr size_t onePastMaxCodePointLen = ArrayLength(onePastMaxCodePoint); MOZ_RELEASE_ASSERT( !IsUtf8(AsChars(Span(onePastMaxCodePoint, onePastMaxCodePointLen)))); ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4); // Surrogate-related testing // (Note that the various code unit sequences here are null-terminated to // simplify life for ExpectValidCodePoint, which presumes null termination.) static const unsigned char justBeforeSurrogates[] = {0xED, 0x9F, 0xBF, 0x0}; constexpr size_t justBeforeSurrogatesLen = ArrayLength(justBeforeSurrogates) - 1; MOZ_RELEASE_ASSERT( IsUtf8(AsChars(Span(justBeforeSurrogates, justBeforeSurrogatesLen)))); ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF); static const unsigned char leastSurrogate[] = {0xED, 0xA0, 0x80, 0x0}; constexpr size_t leastSurrogateLen = ArrayLength(leastSurrogate) - 1; MOZ_RELEASE_ASSERT(!IsUtf8(AsChars(Span(leastSurrogate, leastSurrogateLen)))); ExpectBadCodePoint(leastSurrogate, 0xD800, 3); static const unsigned char arbitraryHighSurrogate[] = {0xED, 0xA2, 0x87, 0x0}; constexpr size_t arbitraryHighSurrogateLen = ArrayLength(arbitraryHighSurrogate) - 1; MOZ_RELEASE_ASSERT(!IsUtf8( AsChars(Span(arbitraryHighSurrogate, arbitraryHighSurrogateLen)))); ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3); static const unsigned char arbitraryLowSurrogate[] = {0xED, 0xB7, 0xAF, 0x0}; constexpr size_t arbitraryLowSurrogateLen = ArrayLength(arbitraryLowSurrogate) - 1; MOZ_RELEASE_ASSERT( !IsUtf8(AsChars(Span(arbitraryLowSurrogate, arbitraryLowSurrogateLen)))); ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3); static const unsigned char greatestSurrogate[] = {0xED, 0xBF, 0xBF, 0x0}; constexpr size_t greatestSurrogateLen = ArrayLength(greatestSurrogate) - 1; MOZ_RELEASE_ASSERT( !IsUtf8(AsChars(Span(greatestSurrogate, greatestSurrogateLen)))); ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3); static const unsigned char justAfterSurrogates[] = {0xEE, 0x80, 0x80, 0x0}; constexpr size_t justAfterSurrogatesLen = ArrayLength(justAfterSurrogates) - 1; MOZ_RELEASE_ASSERT( IsUtf8(AsChars(Span(justAfterSurrogates, justAfterSurrogatesLen)))); ExpectValidCodePoint(justAfterSurrogates, 0xE000); } static void TestDecodeOneValidUtf8CodePoint() { // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that // consist of multiple code units, so there are no ASCII tests below. // Length two. ExpectValidCodePoint(u8"€", 0x80); // ExpectValidCodePoint(u8"©", 0xA9); // COPYRIGHT SIGN ExpectValidCodePoint(u8"¶", 0xB6); // PILCROW SIGN ExpectValidCodePoint(u8"¾", 0xBE); // VULGAR FRACTION THREE QUARTERS ExpectValidCodePoint(u8"÷", 0xF7); // DIVISION SIGN ExpectValidCodePoint(u8"ÿ", 0xFF); // LATIN SMALL LETTER Y WITH DIAERESIS ExpectValidCodePoint(u8"Ā", 0x100); // LATIN CAPITAL LETTER A WITH MACRON ExpectValidCodePoint(u8"IJ", 0x132); // LATIN CAPITAL LETTER LIGATURE IJ ExpectValidCodePoint(u8"ͼ", 0x37C); // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL ExpectValidCodePoint(u8"Ӝ", 0x4DC); // CYRILLIC CAPITAL LETTER ZHE WITTH DIAERESIS ExpectValidCodePoint(u8"۩", 0x6E9); // ARABIC PLACE OF SAJDAH ExpectValidCodePoint(u8"߿", 0x7FF); // // Length three. ExpectValidCodePoint(u8"ࠀ", 0x800); // SAMARITAN LETTER ALAF ExpectValidCodePoint(u8"ࡁ", 0x841); // MANDAIC LETTER AB ExpectValidCodePoint(u8"ࣿ", 0x8FF); // ARABIC MARK SIDEWAYS NOON GHUNNA ExpectValidCodePoint(u8"ஆ", 0xB86); // TAMIL LETTER AA ExpectValidCodePoint(u8"༃", 0xF03); // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA ExpectValidCodePoint( u8"࿉", 0xFC9); // TIBETAN SYMBOL NOR BU (but on my system it really looks like // SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me) ExpectValidCodePoint(u8"ဪ", 0x102A); // MYANMAR LETTER AU ExpectValidCodePoint(u8"ᚏ", 0x168F); // OGHAM LETTER RUIS ExpectValidCodePoint("\xE2\x80\xA8", 0x2028); // (the hated) LINE SEPARATOR ExpectValidCodePoint("\xE2\x80\xA9", 0x2029); // (the hated) PARAGRAPH SEPARATOR ExpectValidCodePoint(u8"☬", 0x262C); // ADI SHAKTI ExpectValidCodePoint(u8"㊮", 0x32AE); // CIRCLED IDEOGRAPH RESOURCE ExpectValidCodePoint(u8"㏖", 0x33D6); // SQUARE MOL ExpectValidCodePoint(u8"ꔄ", 0xA504); // VAI SYLLABLE WEEN ExpectValidCodePoint(u8"ퟕ", 0xD7D5); // HANGUL JONGSEONG RIEUL-SSANGKIYEOK ExpectValidCodePoint(u8"퟿", 0xD7FF); // ExpectValidCodePoint(u8"", 0xE000); // ExpectValidCodePoint(u8"鱗", 0xF9F2); // CJK COMPATIBILITY IDEOGRAPH-F9F ExpectValidCodePoint( u8"﷽", 0xFDFD); // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM ExpectValidCodePoint(u8"￿", 0xFFFF); // // Length four. ExpectValidCodePoint(u8"𐀀", 0x10000); // LINEAR B SYLLABLE B008 A ExpectValidCodePoint(u8"𔑀", 0x14440); // ANATOLIAN HIEROGLYPH A058 ExpectValidCodePoint(u8"𝛗", 0x1D6D7); // MATHEMATICAL BOLD SMALL PHI ExpectValidCodePoint(u8"💩", 0x1F4A9); // PILE OF POO ExpectValidCodePoint(u8"🔫", 0x1F52B); // PISTOL ExpectValidCodePoint(u8"🥌", 0x1F94C); // CURLING STONE ExpectValidCodePoint(u8"🥏", 0x1F94F); // FLYING DISC ExpectValidCodePoint(u8"𠍆", 0x20346); // CJK UNIFIED IDEOGRAPH-20346 ExpectValidCodePoint(u8"𡠺", 0x2183A); // CJK UNIFIED IDEOGRAPH-2183A ExpectValidCodePoint(u8"񁟶", 0x417F6); // ExpectValidCodePoint(u8"񾠶", 0x7E836); // ExpectValidCodePoint(u8"󾽧", 0xFEF67); // ExpectValidCodePoint(u8"􏿿", 0x10FFFF); // } static void TestDecodeBadLeadUnit() { // These tests are actually exhaustive. unsigned char badLead[] = {'\0', '\0'}; for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) { badLead[0] = lead; ExpectBadLeadUnit(badLead); } { uint8_t lead = 0b1111'1000; do { badLead[0] = lead; ExpectBadLeadUnit(badLead); if (lead == 0b1111'1111) { break; } lead++; } while (true); } } static void TestTooFewOrBadTrailingUnits() { // Lead unit indicates a two-byte code point. char truncatedTwo[] = {'\0', '\0'}; char badTrailTwo[] = {'\0', '\0', '\0'}; for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) { truncatedTwo[0] = lead; ExpectNotEnoughUnits(truncatedTwo, 1, 2); badTrailTwo[0] = lead; for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) { badTrailTwo[1] = trail; ExpectBadTrailingUnit(badTrailTwo, 2); } for (uint8_t trail : IntegerRange(0b1100'0000, 0b1111'1111)) { badTrailTwo[1] = trail; ExpectBadTrailingUnit(badTrailTwo, 2); } } // Lead unit indicates a three-byte code point. char truncatedThreeOne[] = {'\0', '\0'}; char truncatedThreeTwo[] = {'\0', '\0', '\0'}; unsigned char badTrailThree[] = {'\0', '\0', '\0', '\0'}; for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) { truncatedThreeOne[0] = lead; ExpectNotEnoughUnits(truncatedThreeOne, 1, 3); truncatedThreeTwo[0] = lead; ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3); badTrailThree[0] = lead; badTrailThree[2] = 0b1011'1111; // make valid to test overreads for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) { badTrailThree[1] = mid; ExpectBadTrailingUnit(badTrailThree, 2); } { uint8_t mid = 0b1100'0000; do { badTrailThree[1] = mid; ExpectBadTrailingUnit(badTrailThree, 2); if (mid == 0b1111'1111) { break; } mid++; } while (true); } badTrailThree[1] = 0b1011'1111; for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) { badTrailThree[2] = last; ExpectBadTrailingUnit(badTrailThree, 3); } { uint8_t last = 0b1100'0000; do { badTrailThree[2] = last; ExpectBadTrailingUnit(badTrailThree, 3); if (last == 0b1111'1111) { break; } last++; } while (true); } } // Lead unit indicates a four-byte code point. char truncatedFourOne[] = {'\0', '\0'}; char truncatedFourTwo[] = {'\0', '\0', '\0'}; char truncatedFourThree[] = {'\0', '\0', '\0', '\0'}; unsigned char badTrailFour[] = {'\0', '\0', '\0', '\0', '\0'}; for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) { truncatedFourOne[0] = lead; ExpectNotEnoughUnits(truncatedFourOne, 1, 4); truncatedFourTwo[0] = lead; ExpectNotEnoughUnits(truncatedFourTwo, 2, 4); truncatedFourThree[0] = lead; ExpectNotEnoughUnits(truncatedFourThree, 3, 4); badTrailFour[0] = lead; badTrailFour[2] = badTrailFour[3] = 0b1011'1111; // test for overreads for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) { badTrailFour[1] = second; ExpectBadTrailingUnit(badTrailFour, 2); } { uint8_t second = 0b1100'0000; do { badTrailFour[1] = second; ExpectBadTrailingUnit(badTrailFour, 2); if (second == 0b1111'1111) { break; } second++; } while (true); } badTrailFour[1] = badTrailFour[3] = 0b1011'1111; // test for overreads for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) { badTrailFour[2] = third; ExpectBadTrailingUnit(badTrailFour, 3); } { uint8_t third = 0b1100'0000; do { badTrailFour[2] = third; ExpectBadTrailingUnit(badTrailFour, 3); if (third == 0b1111'1111) { break; } third++; } while (true); } badTrailFour[2] = 0b1011'1111; for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) { badTrailFour[3] = fourth; ExpectBadTrailingUnit(badTrailFour, 4); } { uint8_t fourth = 0b1100'0000; do { badTrailFour[3] = fourth; ExpectBadTrailingUnit(badTrailFour, 4); if (fourth == 0b1111'1111) { break; } fourth++; } while (true); } } } static void TestBadSurrogate() { // These tests are actually exhaustive. ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF); // last before surrogates ExpectValidCodePoint("\xEE\x80\x80", 0xE000); // first after surrogates // First invalid surrogate encoding is { 0xED, 0xA0, 0x80 }. Last invalid // surrogate encoding is { 0xED, 0xBF, 0xBF }. char badSurrogate[] = {'\xED', '\0', '\0', '\0'}; for (char32_t c = 0xD800; c < 0xE000; c++) { badSurrogate[1] = 0b1000'0000 ^ ((c & 0b1111'1100'0000) >> 6); badSurrogate[2] = 0b1000'0000 ^ ((c & 0b0000'0011'1111)); ExpectBadCodePoint(badSurrogate, c, 3); } } static void TestBadTooBig() { // These tests are actually exhaustive. ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF); // last code point // Four-byte code points are // // 0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx // // with 3 + 6 + 6 + 6 == 21 unconstrained bytes, so the structurally // representable limit (exclusive) is 2**21 - 1 == 2097152. char tooLargeCodePoint[] = {'\0', '\0', '\0', '\0', '\0'}; for (char32_t c = 0x11'0000; c < (1 << 21); c++) { tooLargeCodePoint[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); tooLargeCodePoint[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); tooLargeCodePoint[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); ExpectBadCodePoint(tooLargeCodePoint, c, 4); } } static void TestBadCodePoint() { TestBadSurrogate(); TestBadTooBig(); } static void TestNotShortestForm() { { // One-byte in two-byte. char oneInTwo[] = {'\0', '\0', '\0'}; for (char32_t c = '\0'; c < 0x80; c++) { oneInTwo[0] = 0b1100'0000 ^ ((c & 0b0111'1100'0000) >> 6); oneInTwo[1] = 0b1000'0000 ^ ((c & 0b0000'0011'1111)); ExpectNotShortestForm(oneInTwo, c, 2); } // One-byte in three-byte. char oneInThree[] = {'\0', '\0', '\0', '\0'}; for (char32_t c = '\0'; c < 0x80; c++) { oneInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12); oneInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6); oneInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111)); ExpectNotShortestForm(oneInThree, c, 3); } // One-byte in four-byte. char oneInFour[] = {'\0', '\0', '\0', '\0', '\0'}; for (char32_t c = '\0'; c < 0x80; c++) { oneInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); oneInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); oneInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); oneInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); ExpectNotShortestForm(oneInFour, c, 4); } } { // Two-byte in three-byte. char twoInThree[] = {'\0', '\0', '\0', '\0'}; for (char32_t c = 0x80; c < 0x800; c++) { twoInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12); twoInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6); twoInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111)); ExpectNotShortestForm(twoInThree, c, 3); } // Two-byte in four-byte. char twoInFour[] = {'\0', '\0', '\0', '\0', '\0'}; for (char32_t c = 0x80; c < 0x800; c++) { twoInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); twoInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); twoInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); twoInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); ExpectNotShortestForm(twoInFour, c, 4); } } { // Three-byte in four-byte. char threeInFour[] = {'\0', '\0', '\0', '\0', '\0'}; for (char32_t c = 0x800; c < 0x1'0000; c++) { threeInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18); threeInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12); threeInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6); threeInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111)); ExpectNotShortestForm(threeInFour, c, 4); } } } static void TestDecodeOneInvalidUtf8CodePoint() { TestDecodeBadLeadUnit(); TestTooFewOrBadTrailingUnits(); TestBadCodePoint(); TestNotShortestForm(); } static void TestDecodeOneUtf8CodePoint() { TestDecodeOneValidUtf8CodePoint(); TestDecodeOneInvalidUtf8CodePoint(); } int main() { TestUtf8Unit(); TestIsUtf8(); TestDecodeOneUtf8CodePoint(); return 0; } #if defined(__clang__) && (__clang_major__ >= 6) # pragma clang diagnostic pop #endif