diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:06:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:06:44 +0000 |
commit | ed5640d8b587fbcfed7dd7967f3de04b37a76f26 (patch) | |
tree | 7a5f7c6c9d02226d7471cb3cc8fbbf631b415303 /i18npool/qa/cppunit/test_breakiterator.cxx | |
parent | Initial commit. (diff) | |
download | libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.tar.xz libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.zip |
Adding upstream version 4:7.4.7.upstream/4%7.4.7upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'i18npool/qa/cppunit/test_breakiterator.cxx')
-rw-r--r-- | i18npool/qa/cppunit/test_breakiterator.cxx | 1068 |
1 files changed, 1068 insertions, 0 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx new file mode 100644 index 000000000..cdcbff9be --- /dev/null +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -0,0 +1,1068 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include <com/sun/star/i18n/XBreakIterator.hpp> +#include <com/sun/star/i18n/CharacterIteratorMode.hpp> +#include <com/sun/star/i18n/ScriptType.hpp> +#include <com/sun/star/i18n/WordType.hpp> +#include <o3tl/cppunittraitshelper.hxx> +#include <unotest/bootstrapfixturebase.hxx> + +#include <unicode/uvernum.h> + +#include <string.h> + +#include <stack> +#include <string_view> + +using namespace ::com::sun::star; + +class TestBreakIterator : public test::BootstrapFixtureBase +{ +public: + virtual void setUp() override; + virtual void tearDown() override; + + void testLineBreaking(); + void testWordBoundaries(); + void testGraphemeIteration(); + void testWeak(); + void testAsian(); + void testThai(); +#if (U_ICU_VERSION_MAJOR_NUM > 51) + void testLao(); +#ifdef TODO + void testNorthernThai(); + void testKhmer(); +#endif +#endif + void testJapanese(); + void testChinese(); + + CPPUNIT_TEST_SUITE(TestBreakIterator); + CPPUNIT_TEST(testLineBreaking); + CPPUNIT_TEST(testWordBoundaries); + CPPUNIT_TEST(testGraphemeIteration); + CPPUNIT_TEST(testWeak); + CPPUNIT_TEST(testAsian); + CPPUNIT_TEST(testThai); +#if (U_ICU_VERSION_MAJOR_NUM > 51) + CPPUNIT_TEST(testLao); +#ifdef TODO + CPPUNIT_TEST(testKhmer); + CPPUNIT_TEST(testNorthernThai); +#endif +#endif + CPPUNIT_TEST(testJapanese); + CPPUNIT_TEST(testChinese); + CPPUNIT_TEST_SUITE_END(); + +private: + uno::Reference<i18n::XBreakIterator> m_xBreak; + void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak); +}; + +void TestBreakIterator::testLineBreaking() +{ + i18n::LineBreakHyphenationOptions aHyphOptions; + i18n::LineBreakUserOptions aUserOptions; + lang::Locale aLocale; + + //See https://bugs.libreoffice.org/show_bug.cgi?id=31271 + { + OUString aTest("(some text here)"); + + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //Here we want the line break to leave text here) on the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex); + } + + { + //Here we want the line break to leave "here)" on the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex); + } + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=49849 + { + static constexpr OUStringLiteral aWord = u"\u05DE\u05D9\u05DC\u05D9\u05DD"; + OUString aTest(aWord + " " + aWord); + + aLocale.Language = "he"; + aLocale.Country = "IL"; + + { + //Here we want the line break to happen at the whitespace + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=17155 + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //Here we want the line break to leave /bar/ba clumped together on the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak("foo /bar/baz", strlen("foo /bar/ba"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=19716 + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + OUString aTest("aaa]aaa"); + //Here we want the line break to move the whole lot to the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex); + } + } + + //this is an example sequence from tdf92993-1.docx caught by the load crashtesting + { + static constexpr OUStringLiteral aTest = u"\U0001f356\U0001f357\U0001f346" + "\U0001f364\u2668\ufe0f\U0001f3c6"; + + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //This must not assert/crash + (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions); + } + } + + //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197 + { + static constexpr OUStringLiteral aTest = u"\uc560\uad6D\uac00\uc758 \uac00" + "\uc0ac\ub294"; + + aLocale.Language = "ko"; + aLocale.Country = "KR"; + + { + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex); + } + } +} + +//See https://bugs.libreoffice.org/show_bug.cgi?id=49629 +void TestBreakIterator::testWordBoundaries() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + i18n::Boundary aBounds; + + //See https://bz.apache.org/ooo/show_bug.cgi?id=11993 + { + OUString aTest("abcd ef ghi??? KLM"); + + CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD)); + aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + + CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD)); + CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD)); + + //next word + aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); + + //previous word + aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD)); + aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD)); + CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD)); + aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=21907 + { + OUString aTest("b a?"); + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD)); + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES)); + + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD)); + + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES)); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=14904 + { + static constexpr OUStringLiteral aTest = + u"Working \u201CWords" + " starting wit" + "h quotes\u201D Work" + "ing \u2018Broken\u2019 " + "?Spanish? doe" + "sn\u2019t work. No" + "t even \u00BFreal? " + "Spanish"; + + aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(44), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(46), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(52), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(55), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(62), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(64), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(71), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(88), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(92), aBounds.endPos); + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=49629 + sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF }; + for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode) + { + //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary + for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i) + { + OUString aTest = "Word" + OUStringChar(aBreakTests[i]) + "Word"; + aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true); + switch (mode) + { + case i18n::WordType::ANY_WORD: + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + break; + case i18n::WordType::ANYWORD_IGNOREWHITESPACES: + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + break; + case i18n::WordType::DICTIONARY_WORD: + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + break; + case i18n::WordType::WORD_COUNT: + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos); + break; + } + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode)); + } + } + + sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB }; + for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode) + { + //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary + for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i) + { + OUString aTest = "Word" + OUStringChar(aJoinTests[i]) + "Word"; + aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true); + switch (mode) + { + case i18n::WordType::ANY_WORD: + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + break; + case i18n::WordType::ANYWORD_IGNOREWHITESPACES: + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + break; + case i18n::WordType::DICTIONARY_WORD: + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + break; + case i18n::WordType::WORD_COUNT: + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + break; + } + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode)); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=13494 + { + const OUString aBase("xxAAxxBBxxCCxx"); + const sal_Unicode aTests[] = + { + '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*', + '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/', + '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|' + }; + + const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14}; + for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j) + { + OUString aTest = aBase.replace('x', aTests[j]); + sal_Int32 nPos = -1; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions)); + nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos); + ++i; + } + while (nPos < aTest.getLength()); + nPos = aTest.getLength(); + i = SAL_N_ELEMENTS(aDoublePositions)-1; + do + { + nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + --i; + CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos); + } + while (nPos > 0); + } + + const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10}; + for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j) + { + OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j])); + sal_Int32 nPos = -1; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions)); + nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos); + ++i; + } + while (nPos < aTest.getLength()); + nPos = aTest.getLength(); + i = SAL_N_ELEMENTS(aSinglePositions)-1; + do + { + nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + --i; + CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos); + } + while (nPos > 0); + } + + const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10}; + CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]); + { + OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0])); + sal_Int32 nPos = -1; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions)); + nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos); + ++i; + } + while (nPos < aTest.getLength()); + nPos = aTest.getLength(); + i = SAL_N_ELEMENTS(aSingleQuotePositions)-1; + do + { + nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + --i; + CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos); + } + while (nPos > 0); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=13451 + { + aLocale.Language = "ca"; + aLocale.Country = "ES"; + + OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!"); + + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true).endPos; + CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + ++i; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=85411 + for (int j = 0; j < 3; ++j) + { + switch (j) + { + case 0: + aLocale.Language = "en"; + aLocale.Country = "US"; + break; + case 1: + aLocale.Language = "ca"; + aLocale.Country = "ES"; + break; + case 2: + aLocale.Language = "fi"; + aLocale.Country = "FI"; + break; + default: + CPPUNIT_ASSERT(false); + break; + } + + static constexpr OUStringLiteral aTest = + u"I\u200Bwant\u200Bto\u200Bgo"; + + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {1, 6, 9, 12}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true).endPos; + CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + ++i; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + + //https://bz.apache.org/ooo/show_bug.cgi?id=21290 + for (int j = 0; j < 2; ++j) + { + switch (j) + { + case 0: + aLocale.Language = "en"; + aLocale.Country = "US"; + break; + case 1: + aLocale.Language = "grc"; + aLocale.Country.clear(); + break; + default: + CPPUNIT_ASSERT(false); + break; + } + + static constexpr OUStringLiteral aTest = + u"\u1F0C\u03BD\u03B4\u03C1\u03B1 \u1F00" + "\u03C1\u03BD\u1F7B\u03BC\u03B5\u03BD\u03BF" + "\u03C2 \u1F00\u03BB\u03BB \u1F24" + "\u03C3\u03B8\u03B9\u03BF\u03BD"; + + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {5, 15, 19, 26}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true).endPos; + CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + ++i; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=58513 + //See https://bugs.libreoffice.org/show_bug.cgi?id=55707 + { + aLocale.Language = "fi"; + aLocale.Country = "FI"; + + OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n"); + + { + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::WORD_COUNT, true).endPos; + CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + ++i; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + + { + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37, + 40, 41, 42, 43, 45, 46, 47, 50, 51}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos); + ++i; + CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos); + ++i; + nPos = aBounds.endPos; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=107843 + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + static constexpr OUStringLiteral aTest = + u"ru\uFB00le \uFB01sh"; + + aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=113785 + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + static constexpr OUStringLiteral aTest = + u"a\u2013b\u2014c"; + + aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + } +} + +//See https://bugs.libreoffice.org/show_bug.cgi?id=40292 +//See https://bz.apache.org/ooo/show_bug.cgi?id=80412 +//See https://bz.apache.org/ooo/show_bug.cgi?id=111152 +//See https://bz.apache.org/ooo/show_bug.cgi?id=50172 +void TestBreakIterator::testGraphemeIteration() +{ + lang::Locale aLocale; + aLocale.Language = "bn"; + aLocale.Country = "IN"; + + { + static constexpr OUStringLiteral aTest = u"\u09AC\u09CD\u09AF"; // BA HALANT LA + + sal_Int32 nDone=0; + sal_Int32 nPos; + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos); + nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + { + static constexpr OUStringLiteral aTest = u"\u09B9\u09CD\u09A3\u09BF"; + // HA HALANT NA VOWELSIGNI + + sal_Int32 nDone=0; + sal_Int32 nPos; + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos); + nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + { + static constexpr OUStringLiteral aTest = u"\u09A4\u09CD\u09AE\u09CD\u09AF"; + // TA HALANT MA HALANT YA + + sal_Int32 nDone=0; + sal_Int32 nPos; + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos); + nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + aLocale.Language = "ta"; + aLocale.Country = "IN"; + + { + static constexpr OUStringLiteral aTest = u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"; // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI + + sal_Int32 nDone=0; + sal_Int32 nPos = 0; + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos); + nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos); + nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos); + nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos); + nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos); + nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos); + nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos); + nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + { + static constexpr OUStringLiteral aTest = u"\u0B95\u0BC1"; // KA VOWELSIGNU + + sal_Int32 nDone=0; + sal_Int32 nPos = 0; + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos); + nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + { + static constexpr OUStringLiteral aTest = + u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"; + // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI + + sal_Int32 nDone=0; + sal_Int32 nPos=0; + + for (sal_Int32 i = 0; i < 4; ++i) + { + sal_Int32 nOldPos = nPos; + nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos); + } + + for (sal_Int32 i = 0; i < 4; ++i) + { + sal_Int32 nOldPos = nPos; + nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos); + } + } + + { + static constexpr OUStringLiteral aText = u"\u05D0\u05B8"; // ALEF QAMATS + + sal_Int32 nGraphemeCount = 0; + + sal_Int32 nCurPos = 0; + while (nCurPos < aText.getLength()) + { + sal_Int32 nCount2 = 1; + nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(), + i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2); + ++nGraphemeCount; + } + + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount); + } + + aLocale.Language = "hi"; + aLocale.Country = "IN"; + + { + static constexpr OUStringLiteral aTest = u"\u0936\u0940"; // SHA VOWELSIGNII + + sal_Int32 nDone=0; + sal_Int32 nPos = 0; + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos); + nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } +} + +//A test to ensure that certain ranges and codepoints that are categorized as +//weak remain as weak, so that existing docs that depend on this don't silently +//change font for those weak chars +void TestBreakIterator::testWeak() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + static constexpr OUStringLiteral aWeaks = + u"\u0001\u0002" + " \u00A0" + "\u0300\u036F" //Combining Diacritical Marks + "\u1AB0\u1AFF" //Combining Diacritical Marks Extended + "\u1DC0\u1DFF" //Combining Diacritical Marks Supplement + "\u20D0\u20FF" //Combining Diacritical Marks for Symbols + "\u2150\u215F" //Number Forms, fractions + "\u2160\u2180" //Number Forms, roman numerals + "\u2200\u22FF" //Mathematical Operators + "\u27C0\u27EF" //Miscellaneous Mathematical Symbols-A + "\u2980\u29FF" //Miscellaneous Mathematical Symbols-B + "\u2A00\u2AFF" //Supplemental Mathematical Operators + "\u2100\u214F" //Letterlike Symbols + "\u2308\u230B" //Miscellaneous technical + "\u25A0\u25FF" //Geometric Shapes + "\u2B30\u2B4C"; //Miscellaneous Symbols and Arrows + + for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i) + { + sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i); + OString aMsg = + "Char 0x" + + OString::number(static_cast<sal_Int32>(std::u16string_view(aWeaks)[i]), 16) + + " should have been weak"; + CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(), + i18n::ScriptType::WEAK, nScript); + } + } +} + +//A test to ensure that certain ranges and codepoints that are categorized as +//asian remain as asian, so that existing docs that depend on this don't silently +//change font for those asian chars. +//See https://bugs.libreoffice.org/show_bug.cgi?id=38095 +void TestBreakIterator::testAsian() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + static constexpr OUStringLiteral aAsians = + //some typical CJK chars + u"\u4E00\u62FF" + //The full HalfWidth and FullWidth block has historically been + //designated as taking the CJK font :-( + //HalfWidth and FullWidth forms of ASCII 0-9, categorized under + //UAX24 as "Common" i.e. by that logic WEAK + "\uFF10\uFF19" + //HalfWidth and FullWidth forms of ASCII A-z, categorized under + //UAX25 as "Latin", i.e. by that logic LATIN + "\uFF21\uFF5A"; + + for (sal_Int32 i = 0; i < aAsians.getLength(); ++i) + { + sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i); + OString aMsg = + "Char 0x" + + OString::number(static_cast<sal_Int32>(std::u16string_view(aAsians)[i]), 16) + + " should have been asian"; + CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(), + i18n::ScriptType::ASIAN, nScript); + } + } +} + +#if (U_ICU_VERSION_MAJOR_NUM > 51) +//A test to ensure that our Lao word boundary detection is useful +void TestBreakIterator::testLao() +{ + lang::Locale aLocale; + aLocale.Language = "lo"; + aLocale.Country = "LA"; + + static constexpr OUStringLiteral aTest = u"\u0e8d\u0eb4\u0e99\u0e94\u0eb5\u0e95\u0ec9\u0ead\u0e99\u0eae\u0eb1\u0e9a"; + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); +#if (U_ICU_VERSION_MAJOR_NUM < 70) + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); +#else + // FIXME: + // In ICU 70/71 for yet unknown reason the word boundary 9 is not detected and + // instead the length 12 is returned as endpos. + // Deep in + // icu_70::RuleBasedBreakIterator::BreakCache::next() + // icu_70::RuleBasedBreakIterator::BreakCache::following() + // icu_70::RuleBasedBreakIterator::following() + // i18npool::BreakIterator_Unicode::getWordBoundary() + CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); +#endif +} +#endif + +//A test to ensure that our thai word boundary detection is useful +void TestBreakIterator::testThai() +{ + lang::Locale aLocale; + aLocale.Language = "th"; + aLocale.Country = "TH"; + + //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html + { + static constexpr OUStringLiteral aTest = u"\u0E01\u0E38\u0E2B\u0E25\u0E32\u0E1A"; + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word", + sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word", + aTest.getLength(), aBounds.endPos); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=29548 + //make sure forwards and back are consistent + { + static constexpr OUStringLiteral aTest = + u"\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41" + "\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34" + "\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27" + "\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41" + "\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34" + "\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"; + + std::stack<sal_Int32> aPositions; + sal_Int32 nPos = -1; + do + { + nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + aPositions.push(nPos); + } + while (nPos < aTest.getLength()); + nPos = aTest.getLength(); + CPPUNIT_ASSERT(!aPositions.empty()); + aPositions.pop(); + do + { + CPPUNIT_ASSERT(!aPositions.empty()); + nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos); + aPositions.pop(); + } + while (nPos > 0); + } + + // tdf#113694 + { + static constexpr OUStringLiteral aTest = u"\U00010000"; + + sal_Int32 nDone=0; + sal_Int32 nPos; + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest.getLength(), nPos); + nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos); + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest.getLength(), nPos); + nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale, + i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos); + } +} + +#ifdef TODO +void TestBreakIterator::testNorthernThai() +{ + lang::Locale aLocale; + aLocale.Language = "nod"; + aLocale.Country = "TH"; + + const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A }; + OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1)); + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_MESSAGE("Should skip full word", + aBounds.startPos == 0 && aBounds.endPos == aTest.getLength()); +} + +// Not sure if any version earlier than 49 did have Khmer word boundary +// dictionaries, 4.6 does not. + +// As of icu 54, word boundary detection for Khmer is still considered +// insufficient, so icu khmer stuff is disabled + +//A test to ensure that our khmer word boundary detection is useful +//https://bugs.libreoffice.org/show_bug.cgi?id=52020 +void TestBreakIterator::testKhmer() +{ + lang::Locale aLocale; + aLocale.Language = "km"; + aLocale.Country = "KH"; + + const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 }; + + OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER)); + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3); + + aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5); +} +#endif + +void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak) +{ + lang::Locale aLocale; + aLocale.Language = "ja"; + aLocale.Country = "JP"; + i18n::Boundary aBounds; + + { + static constexpr OUStringLiteral aTest = u"\u30B7\u30E3\u30C3\u30C8\u30C0\u30A6\u30F3"; + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + } + + { + static constexpr OUStringLiteral aTest = u"\u9EBB\u306E\u8449\u9EBB\u306E\u8449"; + + aBounds = xBreak->getWordBoundary(aTest, 1, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } +} + +void TestBreakIterator::testJapanese() +{ + doTestJapanese(m_xBreak); + + // fdo#78479 - test second / cached instantiation of xdictionary + uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance( + "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW); + + doTestJapanese(xTmpBreak); +} + +void TestBreakIterator::testChinese() +{ + lang::Locale aLocale; + aLocale.Language = "zh"; + aLocale.Country = "CN"; + + { + static constexpr OUStringLiteral aTest = u"\u6A35\u6A30\u69FE\u8919\U00029EDB"; + + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } +} +void TestBreakIterator::setUp() +{ + BootstrapFixtureBase::setUp(); + m_xBreak.set(m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW); +} + +void TestBreakIterator::tearDown() +{ + m_xBreak.clear(); + BootstrapFixtureBase::tearDown(); +} + +CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator); + +CPPUNIT_PLUGIN_IMPLEMENT(); + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |