diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /intl/lwbrk/gtest | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1. upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/lwbrk/gtest')
-rw-r--r-- | intl/lwbrk/gtest/TestBreak.cpp | 376 | ||||
-rw-r--r-- | intl/lwbrk/gtest/TestSegmenter.cpp | 209 | ||||
-rw-r--r-- | intl/lwbrk/gtest/TestSegmenterPerf.cpp | 276 | ||||
-rw-r--r-- | intl/lwbrk/gtest/moz.build | 13 |
4 files changed, 874 insertions, 0 deletions
diff --git a/intl/lwbrk/gtest/TestBreak.cpp b/intl/lwbrk/gtest/TestBreak.cpp new file mode 100644 index 0000000000..4e6622dffd --- /dev/null +++ b/intl/lwbrk/gtest/TestBreak.cpp @@ -0,0 +1,376 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include <stdio.h> + +#include "gtest/gtest.h" +#include "mozilla/intl/LineBreaker.h" +#include "mozilla/intl/WordBreaker.h" +#include "mozilla/Preferences.h" +#include "mozilla/Span.h" +#include "nsISupports.h" +#include "nsServiceManagerUtils.h" +#include "nsString.h" +#include "nsTArray.h" +#include "nsXPCOM.h" + +using mozilla::intl::LineBreaker; +using mozilla::intl::WordBreaker; + +// Turn off clang-format to align the ruler comments to the test strings. + +// clang-format off +static char teng0[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "hello world"; +// clang-format on + +static uint32_t lexp0[] = {5, 11}; + +static uint32_t wexp0[] = {5, 6, 11}; + +// clang-format off +static char teng1[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48."; +// clang-format on + +static uint32_t lexp1[] = {4, 7, 9, 14, 17, 34, 39, 40, 41, + 42, 49, 54, 62, 64, 67, 69, 73}; + +static uint32_t wexp1[] = {4, 5, 7, 8, 9, 10, 14, 15, 17, 18, 22, 23, + 33, 34, 35, 39, 43, 48, 49, 50, 54, 55, 56, 57, + 62, 63, 64, 65, 67, 68, 69, 70, 72, 73}; + +// clang-format off +static char teng2[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "()((reasonab(l)e) line break. 
.01123=45x48."; +// clang-format on + +static uint32_t lexp2[] = {17, 22, 23, 30, 44}; + +static uint32_t wexp2[] = {4, 12, 13, 14, 15, 16, 17, 18, 22, + 24, 29, 30, 31, 32, 37, 38, 43, 44}; + +// clang-format off +static char teng3[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "It's a test to test(ronae ) line break...."; +// clang-format on + +static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42}; + +static uint32_t wexp3[] = {2, 3, 4, 5, 6, 7, 11, 12, 14, 15, + 19, 20, 25, 26, 27, 28, 32, 33, 38, 42}; + +static char ruler1[] = + " 1 2 3 4 5 6 7 "; +static char ruler2[] = + "0123456789012345678901234567890123456789012345678901234567890123456789012"; + +bool Check(const char* in, mozilla::Span<const uint32_t> out, + mozilla::Span<const uint32_t> res) { + const uint32_t outlen = out.Length(); + const uint32_t i = res.Length(); + bool ok = true; + + if (i != outlen) { + ok = false; + printf("WARNING!!! return size wrong, expect %d but got %d \n", outlen, i); + } + + for (uint32_t j = 0; j < i; j++) { + if (j < outlen) { + if (res[j] != out[j]) { + ok = false; + printf("[%d] expect %d but got %d\n", j, out[j], res[j]); + } + } else { + ok = false; + printf("[%d] additional %d\n", j, res[j]); + } + } + + if (!ok) { + printf("string = \n%s\n", in); + printf("%s\n", ruler1); + printf("%s\n", ruler2); + + printf("Expect = \n"); + for (uint32_t j = 0; j < outlen; j++) { + printf("%d,", out[j]); + } + + printf("\nResult = \n"); + for (uint32_t j = 0; j < i; j++) { + printf("%d,", res[j]); + } + printf("\n"); + } + + return ok; +} + +bool TestASCIILB(const char* in, mozilla::Span<const uint32_t> out) { + NS_ConvertASCIItoUTF16 input(in); + EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!"; + + nsTArray<uint32_t> result; + int32_t curr = 0; + while (true) { + curr = LineBreaker::Next(input.get(), input.Length(), curr); + if (curr == NS_LINEBREAKER_NEED_MORE_TEXT) { + break; + } + 
result.AppendElement(curr); + } + + return Check(in, out, result); +} + +bool TestASCIIWB(const char* in, mozilla::Span<const uint32_t> out) { + NS_ConvertASCIItoUTF16 input(in); + EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!"; + + nsTArray<uint32_t> result; + int32_t curr = 0; + while (true) { + curr = WordBreaker::Next(input.get(), input.Length(), curr); + if (curr == NS_WORDBREAKER_NEED_MORE_TEXT) { + break; + } + result.AppendElement(curr); + } + + return Check(in, out, result); +} + +TEST(LineBreak, LineBreaker) +{ + ASSERT_TRUE(TestASCIILB(teng0, lexp0)); + ASSERT_TRUE(TestASCIILB(teng1, lexp1)); + ASSERT_TRUE(TestASCIILB(teng2, lexp2)); + ASSERT_TRUE(TestASCIILB(teng3, lexp3)); +} + +TEST(WordBreak, WordBreaker) +{ + ASSERT_TRUE(TestASCIIWB(teng0, wexp0)); + ASSERT_TRUE(TestASCIIWB(teng1, wexp1)); + ASSERT_TRUE(TestASCIIWB(teng2, wexp2)); + ASSERT_TRUE(TestASCIIWB(teng3, wexp3)); +} + +// 012345678901234 +static const char wb0[] = "T"; +static const char wb1[] = "h"; +static const char wb2[] = ""; +static const char wb3[] = "is is a int"; +static const char wb4[] = ""; +static const char wb5[] = ""; +static const char wb6[] = "ernationali"; +static const char wb7[] = "zation work."; + +static const char* wb[] = {wb0, wb1, wb2, wb3, wb4, wb5, wb6, wb7}; + +TEST(WordBreak, TestPrintWordWithBreak) +{ + uint32_t numOfFragment = sizeof(wb) / sizeof(char*); + + // This test generate the result string by appending '^' at every word break + // opportunity except the one at end of the text. + nsAutoString result; + + for (uint32_t i = 0; i < numOfFragment; i++) { + NS_ConvertASCIItoUTF16 fragText(wb[i]); + + int32_t cur = 0; + cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur); + uint32_t start = 0; + while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) { + result.Append(Substring(fragText, start, cur - start)); + + // Append '^' only if cur is within the fragText. 
We'll check the word + // break opportunity between fragText and nextFragText using + // BreakInBetween() below. + if (cur < static_cast<int32_t>(fragText.Length())) { + result.Append('^'); + } + start = (cur >= 0 ? cur : cur - start); + cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur); + } + + if (i != numOfFragment - 1) { + NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]); + if (nextFragText.IsEmpty()) { + // If nextFragText is empty, there's no new possible word break + // opportunity. + continue; + } + + const auto origFragLen = static_cast<int32_t>(fragText.Length()); + fragText.Append(nextFragText); + + bool canBreak = + origFragLen == + WordBreaker::Next(fragText.get(), fragText.Length(), origFragLen - 1); + if (canBreak) { + result.Append('^'); + } + } + } + ASSERT_STREQ("This^ ^is^ ^a^ ^internationalization^ ^work^.", + NS_ConvertUTF16toUTF8(result).get()); +} + +// This function searches a complete word starting from |offset| in wb[fragN]. +// If it reaches the end of wb[fragN], and there is no word break opportunity +// between wb[fragN] and wb[fragN+1], it will continue the search in wb[fragN+1] +// until a word break. +void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset, + const char* expected) { + uint32_t numOfFragment = sizeof(wb) / sizeof(char*); + + NS_ConvertASCIItoUTF16 fragText(wb[fragN]); + + mozilla::intl::WordRange res = WordBreaker::FindWord(fragText, offset); + + nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin)); + + if ((uint32_t)fragText.Length() <= res.mEnd) { + // if we hit the end of the fragment + nsAutoString curFragText = fragText; + for (uint32_t p = fragN + 1; p < numOfFragment; p++) { + NS_ConvertASCIItoUTF16 nextFragText(wb[p]); + if (nextFragText.IsEmpty()) { + // If nextFragText is empty, there's no new possible word break + // opportunity between curFragText and nextFragText. 
+ continue; + } + + const auto origFragLen = static_cast<int32_t>(curFragText.Length()); + curFragText.Append(nextFragText); + bool canBreak = origFragLen == WordBreaker::Next(curFragText.get(), + curFragText.Length(), + origFragLen - 1); + if (canBreak) { + break; + } + mozilla::intl::WordRange r = WordBreaker::FindWord(nextFragText, 0); + + result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin)); + + if ((uint32_t)nextFragText.Length() != r.mEnd) { + break; + } + } + } + + ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get()) + << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")"; +} + +TEST(WordBreak, TestNextWordBreakWithComplexLanguage) +{ + nsString fragText(u"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01"); + + int32_t offset = 0; + while (offset != NS_WORDBREAKER_NEED_MORE_TEXT) { + int32_t newOffset = + WordBreaker::Next(fragText.get(), fragText.Length(), offset); + ASSERT_NE(offset, newOffset); + offset = newOffset; + } + ASSERT_TRUE(true); +} + +TEST(WordBreak, TestFindWordWithEmptyString) +{ + mozilla::intl::WordRange expect{0, 0}; + mozilla::intl::WordRange result = WordBreaker::FindWord(EmptyString(), 0); + ASSERT_EQ(expect.mBegin, result.mBegin); + ASSERT_EQ(expect.mEnd, result.mEnd); +} + +TEST(WordBreak, TestNextWordBreakWithEmptyString) +{ + char16_t empty[] = {}; + ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 0)); + ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 1)); +} + +TEST(WordBreak, TestFindWordBreakFromPosition) +{ + TestFindWordBreakFromPosition(0, 0, "This"); + TestFindWordBreakFromPosition(1, 0, "his"); + TestFindWordBreakFromPosition(2, 0, "is"); + TestFindWordBreakFromPosition(3, 0, "is"); + TestFindWordBreakFromPosition(3, 1, "is"); + TestFindWordBreakFromPosition(3, 9, " "); + TestFindWordBreakFromPosition(3, 10, "internationalization"); + TestFindWordBreakFromPosition(4, 0, "ernationalization"); + TestFindWordBreakFromPosition(5, 0, "ernationalization"); + 
TestFindWordBreakFromPosition(6, 4, "ernationalization"); + TestFindWordBreakFromPosition(6, 8, "ernationalization"); + TestFindWordBreakFromPosition(7, 6, " "); + TestFindWordBreakFromPosition(7, 7, "work"); +} + +// Test for StopAtPunctuation option. +TEST(WordBreak, TestFindBreakWithStopAtPunctuation) +{ + bool original = + mozilla::Preferences::GetBool("intl.icu4x.segmenter.enabled", true); + + // Not UAX#29 rule + mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", false); + + nsString fragText(u"one.two"); + + mozilla::intl::WordRange result1 = WordBreaker::FindWord(fragText, 0); + ASSERT_EQ(0u, result1.mBegin); + ASSERT_EQ(3u, result1.mEnd); + mozilla::intl::WordRange result2 = WordBreaker::FindWord(fragText, 3); + ASSERT_EQ(3u, result2.mBegin); + ASSERT_EQ(4u, result2.mEnd); + mozilla::intl::WordRange result3 = WordBreaker::FindWord(fragText, 4); + ASSERT_EQ(4u, result3.mBegin); + ASSERT_EQ(7u, result3.mEnd); + + // UAX#29 rule + mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + + mozilla::intl::WordRange result4 = WordBreaker::FindWord( + fragText, 0, WordBreaker::FindWordOptions::StopAtPunctuation); + ASSERT_EQ(0u, result4.mBegin); + ASSERT_EQ(3u, result4.mEnd); + mozilla::intl::WordRange result5 = WordBreaker::FindWord( + fragText, 3, WordBreaker::FindWordOptions::StopAtPunctuation); + ASSERT_EQ(3u, result5.mBegin); + ASSERT_EQ(4u, result5.mEnd); + mozilla::intl::WordRange result6 = WordBreaker::FindWord( + fragText, 4, WordBreaker::FindWordOptions::StopAtPunctuation); + ASSERT_EQ(4u, result6.mBegin); + ASSERT_EQ(7u, result6.mEnd); + + // Default (without StopAtPunctuation) + mozilla::intl::WordRange result7 = WordBreaker::FindWord(fragText, 0); + ASSERT_EQ(0u, result7.mBegin); + ASSERT_EQ(7u, result7.mEnd); + mozilla::intl::WordRange result8 = WordBreaker::FindWord(fragText, 3); + ASSERT_EQ(0u, result8.mBegin); + ASSERT_EQ(7u, result8.mEnd); + mozilla::intl::WordRange result9 = WordBreaker::FindWord(fragText, 4); + 
ASSERT_EQ(0u, result9.mBegin); + ASSERT_EQ(7u, result9.mEnd); + + mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", original); +} diff --git a/intl/lwbrk/gtest/TestSegmenter.cpp b/intl/lwbrk/gtest/TestSegmenter.cpp new file mode 100644 index 0000000000..42d04b8e03 --- /dev/null +++ b/intl/lwbrk/gtest/TestSegmenter.cpp @@ -0,0 +1,209 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "gtest/gtest.h" + +#include "mozilla/intl/Segmenter.h" +#include "mozilla/Preferences.h" + +namespace mozilla::intl { + +TEST(IntlSegmenter, TestLineBreakIteratorUtf16SeekOld) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false); + EXPECT_TRUE(rv == NS_OK); + + const SegmenterOptions options{SegmenterGranularity::Line}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto lineSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr<SegmentIteratorUtf16> segIter = + lineSegmenter->Segment(MakeStringSpan(text)); + + // Seek to space between "hello" and "world". + ASSERT_EQ(segIter->Seek(5u), Some(11u)); + + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). 
+ ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestLineBreakIteratorUtf16Seek) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + + const SegmenterOptions options{SegmenterGranularity::Line}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto lineSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr<SegmentIteratorUtf16> segIter = + lineSegmenter->Segment(MakeStringSpan(text)); + + // Seek to space between "hello" and "world". + // UAX#14 rule returns before "w". + ASSERT_EQ(segIter->Seek(5u), Some(6u)); + + ASSERT_EQ(segIter->Next(), Some(11u)); + + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). + ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestWordBreakIteratorUtf16Simple) +{ + const SegmenterOptions options{SegmenterGranularity::Word}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto wordSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr<SegmentIteratorUtf16> segIter = + wordSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(5u)); + ASSERT_EQ(segIter->Next(), Some(6u)); + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); +} + +TEST(IntlSegmenter, TestWordBreakIteratorUtf16Seek) +{ + const SegmenterOptions options{SegmenterGranularity::Word}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto wordSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr<SegmentIteratorUtf16> segIter = + wordSegmenter->Segment(MakeStringSpan(text)); + + // Seek to the space between "hello" and "world" + ASSERT_EQ(segIter->Seek(5u), Some(6u)); + + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). 
+ ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Simple) +{ + SegmenterOptions options{SegmenterGranularity::Grapheme}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto graphemeClusterSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr<SegmentIteratorUtf16> segIter = + graphemeClusterSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(1u)); + ASSERT_EQ(segIter->Next(), Some(2u)); + ASSERT_EQ(segIter->Next(), Some(3u)); + ASSERT_EQ(segIter->Next(), Some(4u)); + ASSERT_EQ(segIter->Next(), Some(5u)); + ASSERT_EQ(segIter->Next(), Some(6u)); + ASSERT_EQ(segIter->Next(), Some(7u)); + ASSERT_EQ(segIter->Next(), Some(8u)); + ASSERT_EQ(segIter->Next(), Some(9u)); + ASSERT_EQ(segIter->Next(), Some(10u)); + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); +} + +TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Seek) +{ + SegmenterOptions options{SegmenterGranularity::Grapheme}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto graphemeClusterSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr<SegmentIteratorUtf16> segIter = + graphemeClusterSegmenter->Segment(MakeStringSpan(text)); + + // Seek to the space between "hello" and "world" + ASSERT_EQ(segIter->Seek(5u), Some(6u)); + + ASSERT_EQ(segIter->Next(), Some(7u)); + ASSERT_EQ(segIter->Next(), Some(8u)); + ASSERT_EQ(segIter->Next(), Some(9u)); + ASSERT_EQ(segIter->Next(), Some(10u)); + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). 
+ ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16) +{ + const char16_t text[] = u"hello world"; + GraphemeClusterBreakReverseIteratorUtf16 segIter(MakeStringSpan(text)); + + // Seek to the space between "hello" and "world" + ASSERT_EQ(segIter.Seek(6u), Some(5u)); + + ASSERT_EQ(segIter.Next(), Some(4u)); + ASSERT_EQ(segIter.Next(), Some(3u)); + ASSERT_EQ(segIter.Next(), Some(2u)); + ASSERT_EQ(segIter.Next(), Some(1u)); + ASSERT_EQ(segIter.Next(), Some(0u)); + ASSERT_EQ(segIter.Next(), Nothing()); + + // Same as calling Next(). + ASSERT_EQ(segIter.Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + + SegmenterOptions options{SegmenterGranularity::Sentence}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto sentenceSegmenter = result.unwrap(); + + const char16_t text[] = u"Hello world. Hello world."; + UniquePtr<SegmentIteratorUtf16> segIter = + sentenceSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(13u)); + ASSERT_EQ(segIter->Next(), Some(25u)); + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). + ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16Seek) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + + SegmenterOptions options{SegmenterGranularity::Sentence}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto sentenceSegmenter = result.unwrap(); + + const char16_t text[] = u"Hello world. 
Hello world."; + UniquePtr<SegmentIteratorUtf16> segIter = + sentenceSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Seek(5u), Some(13u)); +} + +} // namespace mozilla::intl diff --git a/intl/lwbrk/gtest/TestSegmenterPerf.cpp b/intl/lwbrk/gtest/TestSegmenterPerf.cpp new file mode 100644 index 0000000000..772e284fa8 --- /dev/null +++ b/intl/lwbrk/gtest/TestSegmenterPerf.cpp @@ -0,0 +1,276 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include <fstream> + +#include "gtest/gtest.h" +#include "gtest/MozGTestBench.h" // For MOZ_GTEST_BENCH +#include "mozilla/intl/LineBreaker.h" +#include "mozilla/intl/Segmenter.h" +#include "mozilla/Preferences.h" +#include "nsAtom.h" +#include "nsLineBreaker.h" +#include "nsString.h" +#include "nsTArray.h" + +namespace mozilla::intl { + +using mozilla::intl::LineBreakRule; +using mozilla::intl::WordBreakRule; + +constexpr size_t kIterations = 100; + +static std::string ReadFileIntoString(const char* aPath) { + std::ifstream file(aPath); + std::stringstream sstr; + sstr << file.rdbuf(); + return sstr.str(); +} + +class SegmenterPerf : public ::testing::Test { + protected: + void SetUp() override { + // Test files are into xpcom/tests/gtest/wikipedia + mArUtf8 = ReadFileIntoString("ar.txt"); + mDeUtf8 = ReadFileIntoString("de.txt"); + mJaUtf8 = ReadFileIntoString("ja.txt"); + mRuUtf8 = ReadFileIntoString("ru.txt"); + mThUtf8 = ReadFileIntoString("th.txt"); + mTrUtf8 = ReadFileIntoString("tr.txt"); + mViUtf8 = ReadFileIntoString("vi.txt"); + + CopyUTF8toUTF16(mArUtf8, mArUtf16); + CopyUTF8toUTF16(mDeUtf8, mDeUtf16); + CopyUTF8toUTF16(mJaUtf8, mJaUtf16); + CopyUTF8toUTF16(mRuUtf8, mRuUtf16); + CopyUTF8toUTF16(mThUtf8, mThUtf16); + 
CopyUTF8toUTF16(mTrUtf8, mTrUtf16); + CopyUTF8toUTF16(mViUtf8, mViUtf16); + + mAr = NS_Atomize(u"ar"); + mDe = NS_Atomize(u"de"); + mJa = NS_Atomize(u"ja"); + mRu = NS_Atomize(u"ru"); + mTh = NS_Atomize(u"th"); + mTr = NS_Atomize(u"tr"); + mVi = NS_Atomize(u"vi"); + } + + public: + std::string mArUtf8; + std::string mDeUtf8; + std::string mJaUtf8; + std::string mRuUtf8; + std::string mThUtf8; + std::string mTrUtf8; + std::string mViUtf8; + + nsString mArUtf16; + nsString mDeUtf16; + nsString mJaUtf16; + nsString mRuUtf16; + nsString mThUtf16; + nsString mTrUtf16; + nsString mViUtf16; + + RefPtr<nsAtom> mAr; + RefPtr<nsAtom> mDe; + RefPtr<nsAtom> mJa; + RefPtr<nsAtom> mRu; + RefPtr<nsAtom> mTh; + RefPtr<nsAtom> mTr; + RefPtr<nsAtom> mVi; +}; + +class AutoSetSegmenter final { + public: + explicit AutoSetSegmenter(bool aValue) { + nsresult rv = + mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", aValue); + EXPECT_TRUE(rv == NS_OK); + } + + ~AutoSetSegmenter() { + mozilla::Preferences::ClearUser("intl.icu4x.segmenter.enabled"); + } +}; + +static void TestSegmenterBench(const nsString& aStr, bool aIsJaOrZh, + size_t aCount = kIterations) { + nsTArray<uint8_t> breakState; + breakState.SetLength(aStr.Length()); + + for (size_t i = 0; i < aCount; i++) { + LineBreaker::ComputeBreakPositions( + aStr.get(), aStr.Length(), WordBreakRule::Normal, LineBreakRule::Strict, + aIsJaOrZh, breakState.Elements()); + } +} + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakAROld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mArUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakDEOld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mDeUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakJAOld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mJaUtf16, true); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakRUOld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mRuUtf16, false); +}); + 
+// Remaining "Old" line-break benchmarks: AutoSetSegmenter(false) forces
+// intl.icu4x.segmenter.enabled to false for the duration of the run.
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTHOld, [this] {
+  AutoSetSegmenter set(false);
+  TestSegmenterBench(mThUtf16, false);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTROld, [this] {
+  AutoSetSegmenter set(false);
+  TestSegmenterBench(mTrUtf16, false);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakVIOld, [this] {
+  AutoSetSegmenter set(false);
+  TestSegmenterBench(mViUtf16, false);
+});
+
+// Same texts with intl.icu4x.segmenter.enabled forced to true. Each benchmark
+// here must differ from its "Old" twin only in the AutoSetSegmenter argument.
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakAR, [this] {
+  // Was set(false), which made this an exact duplicate of PerfLineBreakAROld.
+  AutoSetSegmenter set(true);
+  TestSegmenterBench(mArUtf16, false);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakDE, [this] {
+  AutoSetSegmenter set(true);
+  TestSegmenterBench(mDeUtf16, false);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakJA, [this] {
+  AutoSetSegmenter set(true);
+  TestSegmenterBench(mJaUtf16, true);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakRU, [this] {
+  AutoSetSegmenter set(true);
+  TestSegmenterBench(mRuUtf16, false);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTH, [this] {
+  AutoSetSegmenter set(true);
+  // LSTM segmenter is too slow, so run 3 iterations instead of kIterations.
+  TestSegmenterBench(mThUtf16, false, 3);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTR, [this] {
+  AutoSetSegmenter set(true);
+  TestSegmenterBench(mTrUtf16, false);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakVI, [this] {
+  AutoSetSegmenter set(true);
+  TestSegmenterBench(mViUtf16, false);
+});
+
+// nsILineBreakSink implementation whose callbacks do nothing: both overrides
+// have empty bodies and ignore their arguments.
+class LBSink final : public nsILineBreakSink {
+ public:
+  LBSink() = default;
+  ~LBSink() = default;
+
+  virtual void SetBreaks(uint32_t, uint32_t, uint8_t*) override {}
+  virtual void SetCapitalization(uint32_t, uint32_t, bool*) override {}
+};
+
+// Benchmark body for the nsLineBreaker ("DOM") path. Runs aCount iterations
+// (kIterations by default); each iteration constructs a fresh nsLineBreaker,
+// appends the whole of aStr tagged with aLang, and then calls Reset().
+static void TestDOMSegmenterBench(const nsString& aStr, nsAtom* aLang,
+                                  size_t aCount = kIterations) {
+  LBSink sink;
+  // Written by Reset(); the value is never read here.
+  bool trailingBreak;
+
+  for (size_t i = 0; i < aCount; i++) {
+    nsLineBreaker breaker;
+    breaker.AppendText(aLang, aStr.get(), aStr.Length(), 0, &sink);
+    breaker.Reset(&trailingBreak);
+  }
+}
+
+// "Old" DOM line-break benchmarks: AutoSetSegmenter(false) forces
+// intl.icu4x.segmenter.enabled to false while TestDOMSegmenterBench runs.
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakAROld, [this] {
+  AutoSetSegmenter set(false);
+  TestDOMSegmenterBench(mArUtf16, mAr);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakDEOld, [this] {
+  AutoSetSegmenter set(false);
+  TestDOMSegmenterBench(mDeUtf16, mDe);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakJAOld, [this] {
+  AutoSetSegmenter set(false);
+  TestDOMSegmenterBench(mJaUtf16, mJa);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakRUOld, [this] {
+  AutoSetSegmenter set(false);
+  TestDOMSegmenterBench(mRuUtf16, mRu);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTHOld, [this] {
+  AutoSetSegmenter set(false);
+  TestDOMSegmenterBench(mThUtf16, mTh);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTROld, [this] {
+  AutoSetSegmenter set(false);
+  TestDOMSegmenterBench(mTrUtf16, mTr);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakVIOld, [this] {
+  AutoSetSegmenter set(false);
+  TestDOMSegmenterBench(mViUtf16, mVi);
+});
+
+// Same texts and language atoms with intl.icu4x.segmenter.enabled forced to
+// true.
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakAR, [this] {
+  AutoSetSegmenter set(true);
+  TestDOMSegmenterBench(mArUtf16, mAr);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakDE, [this] {
+  AutoSetSegmenter set(true);
+  TestDOMSegmenterBench(mDeUtf16, mDe);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakJA, [this] {
+  AutoSetSegmenter set(true);
+  TestDOMSegmenterBench(mJaUtf16, mJa);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakRU, [this] {
+  AutoSetSegmenter set(true);
+  TestDOMSegmenterBench(mRuUtf16, mRu);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTH, [this] {
+  AutoSetSegmenter set(true);
+  // LSTM segmenter is too slow, so run 3 iterations instead of kIterations.
+  TestDOMSegmenterBench(mThUtf16, mTh, 3);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTR, [this] {
+  AutoSetSegmenter set(true);
+  TestDOMSegmenterBench(mTrUtf16, mTr);
+});
+
+MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakVI, [this] {
+  AutoSetSegmenter set(true);
+  TestDOMSegmenterBench(mViUtf16, mVi);
+});
+
+}  // namespace mozilla::intl
diff --git a/intl/lwbrk/gtest/moz.build b/intl/lwbrk/gtest/moz.build
new file mode 100644
index 0000000000..092a0f0a86
--- /dev/null
+++ b/intl/lwbrk/gtest/moz.build
@@ -0,0 +1,13 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+UNIFIED_SOURCES += [
+    "TestBreak.cpp",
+    "TestSegmenter.cpp",
+    "TestSegmenterPerf.cpp",
+]
+
+FINAL_LIBRARY = "xul-gtest" |