/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include #include "gtest/gtest.h" #include "mozilla/intl/LineBreaker.h" #include "mozilla/intl/WordBreaker.h" #include "mozilla/Span.h" #include "nsISupports.h" #include "nsServiceManagerUtils.h" #include "nsString.h" #include "nsTArray.h" #include "nsXPCOM.h" using mozilla::intl::LineBreaker; using mozilla::intl::WordBreaker; // Turn off clang-format to align the ruler comments to the test strings. // clang-format off static char teng0[] = // 1 2 3 4 5 6 7 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 "hello world"; // clang-format on static uint32_t lexp0[] = {5, 11}; static uint32_t wexp0[] = {5, 6, 11}; // clang-format off static char teng1[] = // 1 2 3 4 5 6 7 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48."; // clang-format on static uint32_t lexp1[] = {4, 7, 9, 14, 17, 34, 39, 40, 41, 42, 49, 54, 62, 64, 67, 69, 73}; static uint32_t wexp1[] = {4, 5, 7, 8, 9, 10, 14, 15, 17, 18, 22, 23, 33, 34, 35, 39, 43, 48, 49, 50, 54, 55, 56, 57, 62, 63, 64, 65, 67, 68, 69, 70, 72, 73}; // clang-format off static char teng2[] = // 1 2 3 4 5 6 7 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 "()((reasonab(l)e) line break. .01123=45x48."; // clang-format on static uint32_t lexp2[] = {17, 22, 23, 30, 44}; static uint32_t wexp2[] = {4, 12, 13, 14, 15, 16, 17, 18, 22, 24, 29, 30, 31, 32, 37, 38, 43, 44}; // clang-format off static char teng3[] = // 1 2 3 4 5 6 7 // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 "It's a test to test(ronae ) line break...."; // clang-format on static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42}; static uint32_t wexp3[] = {2, 3, 4, 5, 6, 7, 11, 12, 14, 15, 19, 20, 25, 26, 27, 28, 32, 33, 38, 42}; static char ruler1[] = " 1 2 3 4 5 6 7 "; static char ruler2[] = "0123456789012345678901234567890123456789012345678901234567890123456789012"; bool Check(const char* in, mozilla::Span out, mozilla::Span res) { const uint32_t outlen = out.Length(); const uint32_t i = res.Length(); bool ok = true; if (i != outlen) { ok = false; printf("WARNING!!! return size wrong, expect %d but got %d \n", outlen, i); } for (uint32_t j = 0; j < i; j++) { if (j < outlen) { if (res[j] != out[j]) { ok = false; printf("[%d] expect %d but got %d\n", j, out[j], res[j]); } } else { ok = false; printf("[%d] additional %d\n", j, res[j]); } } if (!ok) { printf("string = \n%s\n", in); printf("%s\n", ruler1); printf("%s\n", ruler2); printf("Expect = \n"); for (uint32_t j = 0; j < outlen; j++) { printf("%d,", out[j]); } printf("\nResult = \n"); for (uint32_t j = 0; j < i; j++) { printf("%d,", res[j]); } printf("\n"); } return ok; } bool TestASCIILB(const char* in, mozilla::Span out) { NS_ConvertASCIItoUTF16 input(in); EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!"; nsTArray result; int32_t curr = 0; while (true) { curr = LineBreaker::Next(input.get(), input.Length(), curr); if (curr == NS_LINEBREAKER_NEED_MORE_TEXT) { break; } result.AppendElement(curr); } return Check(in, out, result); } bool TestASCIIWB(const char* in, mozilla::Span out) { NS_ConvertASCIItoUTF16 input(in); EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!"; nsTArray result; int32_t curr = 0; while (true) { curr = WordBreaker::Next(input.get(), input.Length(), curr); if (curr == NS_WORDBREAKER_NEED_MORE_TEXT) { break; } result.AppendElement(curr); } return Check(in, out, result); } TEST(LineBreak, LineBreaker) { ASSERT_TRUE(TestASCIILB(teng0, lexp0)); ASSERT_TRUE(TestASCIILB(teng1, lexp1)); ASSERT_TRUE(TestASCIILB(teng2, lexp2)); ASSERT_TRUE(TestASCIILB(teng3, lexp3)); } TEST(WordBreak, WordBreaker) { ASSERT_TRUE(TestASCIIWB(teng0, wexp0)); ASSERT_TRUE(TestASCIIWB(teng1, wexp1)); ASSERT_TRUE(TestASCIIWB(teng2, wexp2)); ASSERT_TRUE(TestASCIIWB(teng3, wexp3)); } // 012345678901234 static const char wb0[] = "T"; static const char wb1[] = "h"; static const char wb2[] = ""; static const char wb3[] = "is is a int"; static const char wb4[] = ""; static const char wb5[] = ""; static const char wb6[] = "ernationali"; static const char wb7[] = "zation work."; static const char* wb[] = {wb0, wb1, wb2, wb3, wb4, wb5, wb6, wb7}; TEST(WordBreak, TestPrintWordWithBreak) { uint32_t numOfFragment = sizeof(wb) / sizeof(char*); // This test generate the result string by appending '^' at every word break // opportunity except the one at end of the text. nsAutoString result; for (uint32_t i = 0; i < numOfFragment; i++) { NS_ConvertASCIItoUTF16 fragText(wb[i]); int32_t cur = 0; cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur); uint32_t start = 0; while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) { result.Append(Substring(fragText, start, cur - start)); // Append '^' only if cur is within the fragText. We'll check the word // break opportunity between fragText and nextFragText using // BreakInBetween() below. if (cur < static_cast(fragText.Length())) { result.Append('^'); } start = (cur >= 0 ? cur : cur - start); cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur); } if (i != numOfFragment - 1) { NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]); if (nextFragText.IsEmpty()) { // If nextFragText is empty, there's no new possible word break // opportunity. continue; } const auto origFragLen = static_cast(fragText.Length()); fragText.Append(nextFragText); bool canBreak = origFragLen == WordBreaker::Next(fragText.get(), fragText.Length(), origFragLen - 1); if (canBreak) { result.Append('^'); } } } ASSERT_STREQ("This^ ^is^ ^a^ ^internationalization^ ^work^.", NS_ConvertUTF16toUTF8(result).get()); } // This function searches a complete word starting from |offset| in wb[fragN]. // If it reaches the end of wb[fragN], and there is no word break opportunity // between wb[fragN] and wb[fragN+1], it will continue the search in wb[fragN+1] // until a word break. void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset, const char* expected) { uint32_t numOfFragment = sizeof(wb) / sizeof(char*); NS_ConvertASCIItoUTF16 fragText(wb[fragN]); mozilla::intl::WordRange res = WordBreaker::FindWord(fragText.get(), fragText.Length(), offset); nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin)); if ((uint32_t)fragText.Length() <= res.mEnd) { // if we hit the end of the fragment nsAutoString curFragText = fragText; for (uint32_t p = fragN + 1; p < numOfFragment; p++) { NS_ConvertASCIItoUTF16 nextFragText(wb[p]); if (nextFragText.IsEmpty()) { // If nextFragText is empty, there's no new possible word break // opportunity between curFragText and nextFragText. continue; } const auto origFragLen = static_cast(curFragText.Length()); curFragText.Append(nextFragText); bool canBreak = origFragLen == WordBreaker::Next(curFragText.get(), curFragText.Length(), origFragLen - 1); if (canBreak) { break; } mozilla::intl::WordRange r = WordBreaker::FindWord(nextFragText.get(), nextFragText.Length(), 0); result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin)); if ((uint32_t)nextFragText.Length() != r.mEnd) { break; } } } ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get()) << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")"; } TEST(WordBreak, TestNextWordBreakWithComplexLanguage) { nsString fragText(u"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01"); int32_t offset = 0; while (offset != NS_WORDBREAKER_NEED_MORE_TEXT) { int32_t newOffset = WordBreaker::Next(fragText.get(), fragText.Length(), offset); ASSERT_NE(offset, newOffset); offset = newOffset; } ASSERT_TRUE(true); } TEST(WordBreak, TestFindWordWithEmptyString) { char16_t empty[] = {}; mozilla::intl::WordRange expect{0, 0}; mozilla::intl::WordRange result = WordBreaker::FindWord(empty, 0, 0); ASSERT_EQ(expect.mBegin, result.mBegin); ASSERT_EQ(expect.mEnd, result.mEnd); } TEST(WordBreak, TestNextWordBreakWithEmptyString) { char16_t empty[] = {}; ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 0)); ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 1)); } TEST(WordBreak, TestFindWordBreakFromPosition) { TestFindWordBreakFromPosition(0, 0, "This"); TestFindWordBreakFromPosition(1, 0, "his"); TestFindWordBreakFromPosition(2, 0, "is"); TestFindWordBreakFromPosition(3, 0, "is"); TestFindWordBreakFromPosition(3, 1, "is"); TestFindWordBreakFromPosition(3, 9, " "); TestFindWordBreakFromPosition(3, 10, "internationalization"); TestFindWordBreakFromPosition(4, 0, "ernationalization"); TestFindWordBreakFromPosition(5, 0, "ernationalization"); TestFindWordBreakFromPosition(6, 4, "ernationalization"); TestFindWordBreakFromPosition(6, 8, "ernationalization"); TestFindWordBreakFromPosition(7, 6, " "); TestFindWordBreakFromPosition(7, 7, "work"); }