summary refs log tree commit diff stats
path: root/i18npool/qa
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- i18npool/qa/cppunit/test_breakiterator.cxx 1065
-rw-r--r-- i18npool/qa/cppunit/test_characterclassification.cxx 105
-rw-r--r-- i18npool/qa/cppunit/test_defaultnumberingprovider.cxx 131
-rw-r--r-- i18npool/qa/cppunit/test_ordinalsuffix.cxx 100
-rw-r--r-- i18npool/qa/cppunit/test_textsearch.cxx 402
5 files changed, 1803 insertions, 0 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
new file mode 100644
index 000000000..a41cd8218
--- /dev/null
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -0,0 +1,1065 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <com/sun/star/i18n/XBreakIterator.hpp>
+#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
+#include <com/sun/star/i18n/ScriptType.hpp>
+#include <com/sun/star/i18n/WordType.hpp>
+#include <o3tl/cppunittraitshelper.hxx>
+#include <unotest/bootstrapfixturebase.hxx>
+
+#include <unicode/uvernum.h>
+
+#include <rtl/strbuf.hxx>
+
+#include <string.h>
+
+#include <stack>
+
+using namespace ::com::sun::star;
+
+class TestBreakIterator : public test::BootstrapFixtureBase //CppUnit fixture for the i18n::XBreakIterator tests below
+{
+public:
+ virtual void setUp() override;
+ virtual void tearDown() override;
+ //One method per test case; each is registered in the suite further down
+ void testLineBreaking();
+ void testWordBoundaries();
+ void testGraphemeIteration();
+ void testWeak();
+ void testAsian();
+ void testThai();
+#if (U_ICU_VERSION_MAJOR_NUM > 51) //only declared when the ICU major version is above 51
+ void testLao();
+#ifdef TODO //these two stay disabled unless TODO is defined
+ void testNorthernThai();
+ void testKhmer();
+#endif
+#endif
+ void testJapanese();
+ void testChinese();
+ //Suite registration; the preprocessor guards mirror the declarations above
+ CPPUNIT_TEST_SUITE(TestBreakIterator);
+ CPPUNIT_TEST(testLineBreaking);
+ CPPUNIT_TEST(testWordBoundaries);
+ CPPUNIT_TEST(testGraphemeIteration);
+ CPPUNIT_TEST(testWeak);
+ CPPUNIT_TEST(testAsian);
+ CPPUNIT_TEST(testThai);
+#if (U_ICU_VERSION_MAJOR_NUM > 51)
+ CPPUNIT_TEST(testLao);
+#ifdef TODO
+ CPPUNIT_TEST(testKhmer);
+ CPPUNIT_TEST(testNorthernThai);
+#endif
+#endif
+ CPPUNIT_TEST(testJapanese);
+ CPPUNIT_TEST(testChinese);
+ CPPUNIT_TEST_SUITE_END();
+ //m_xBreak is the break iterator the test bodies call into
+private:
+ uno::Reference<i18n::XBreakIterator> m_xBreak;
+ void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
+};
+
+void TestBreakIterator::testLineBreaking() //getLineBreak regression cases, each run under its own locale
+{
+ i18n::LineBreakHyphenationOptions aHyphOptions;
+ i18n::LineBreakUserOptions aUserOptions;
+ lang::Locale aLocale;
+
+ //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
+ {
+ OUString aTest("(some text here)");
+
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+
+ {
+ //Here we want the line break to leave "text here)" on the next line
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
+ }
+
+ {
+ //Here we want the line break to leave "here)" on the next line
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
+ }
+ }
+
+ //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
+ {
+ const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
+ OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
+ OUString aTest(aWord + " " + aWord);
+
+ aLocale.Language = "he";
+ aLocale.Country = "IL";
+
+ {
+ //Here we want the line break to happen at the whitespace
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
+ }
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
+ {
+ OUString const aTest("foo /bar/baz");
+
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+
+ {
+ //Here we want the line break to leave /bar/ba clumped together on the next line
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
+ aHyphOptions, aUserOptions);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
+ }
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
+ {
+ OUString aTest("aaa]aaa");
+
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+
+ {
+ //Here we want the line break to move the whole lot to the next line
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
+ aHyphOptions, aUserOptions);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
+ }
+ }
+
+ //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
+ {
+ const sal_Unicode WEIRD1[] = { 0xd83c, 0xdf56, 0xd83c, 0xdf57, 0xd83c, 0xdf46,
+ 0xd83c, 0xdf64, 0x2668, 0xfe0f, 0xd83c, 0xdfc6};
+
+ OUString aTest(WEIRD1, SAL_N_ELEMENTS(WEIRD1));
+
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+
+ {
+ //This must not assert/crash
+ (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
+ }
+ }
+
+ //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
+ {
+ const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 0xac00,
+ 0xc0ac, 0xb294};
+ OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL));
+
+ aLocale.Language = "ko";
+ aLocale.Country = "KR";
+ //The space is at index 4, so the expected break (5) is right after it, not inside the second word
+ {
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
+ aHyphOptions, aUserOptions);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
+ }
+ }
+}
+
+//Word boundary checks (isBeginWord, isEndWord, getWordBoundary, nextWord, previousWord). See https://bugs.libreoffice.org/show_bug.cgi?id=49629
+void TestBreakIterator::testWordBoundaries()
+{
+ lang::Locale aLocale;
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+
+ i18n::Boundary aBounds;
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
+ {
+ OUString aTest("abcd ef ghi??? KLM");
+
+ CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
+ CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
+ aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
+
+ CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
+ CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
+
+ //next word
+ aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
+
+ //previous word
+ aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
+
+ CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
+ CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
+ aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
+
+ CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
+ CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
+ aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
+ {
+ OUString aTest("b a?");
+
+ CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
+ CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
+ CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
+
+ CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
+
+ CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
+ CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
+ CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
+
+ CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
+ {
+ const sal_Unicode TEST[] =
+ {
+ 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
+ ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
+ 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
+ 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
+ '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
+ 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
+ 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
+ 'S', 'p', 'a', 'n', 'i', 's', 'h'
+ };
+ OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
+ }
+
+ //See https://bugs.libreoffice.org/show_bug.cgi?id=49629 - each of these characters must end the first "Word" at offset 4
+ sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
+ for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
+ {
+ //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
+ for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
+ {
+#if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
+ //Note the breakiterator test is known to fail on older icu
+ //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
+ if (aBreakTests[i] == 0x200B)
+ continue;
+#endif
+ OUString aTest = "Word" + OUStringChar(aBreakTests[i]) + "Word";
+ aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
+ switch (mode)
+ {
+ case i18n::WordType::ANY_WORD:
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
+ break;
+ case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
+ break;
+ case i18n::WordType::DICTIONARY_WORD:
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
+ break;
+ case i18n::WordType::WORD_COUNT:
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
+ break;
+ }
+
+ CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
+ CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
+ }
+ }
+ //These characters must not split the word: "Word<c>Word" has to come back as one word spanning 0 to 9
+ sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
+ for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
+ {
+ //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
+ for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
+ {
+ OUString aTest = "Word" + OUStringChar(aJoinTests[i]) + "Word";
+ aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
+ switch (mode)
+ {
+ case i18n::WordType::ANY_WORD:
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
+ break;
+ case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
+ break;
+ case i18n::WordType::DICTIONARY_WORD:
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
+ break;
+ case i18n::WordType::WORD_COUNT:
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
+ break;
+ }
+
+ CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
+ CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
+ }
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
+ {
+ const OUString aBase("xxAAxxBBxxCCxx");
+ const sal_Unicode aTests[] =
+ {
+ '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
+ '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
+ '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
+ };
+ //Expected nextWord/previousWord start positions when every 'x' in aBase is replaced by the test character
+ const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
+ for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
+ {
+ OUString aTest = aBase.replace('x', aTests[j]);
+ sal_Int32 nPos = -1;
+ size_t i = 0;
+ do
+ {
+ CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
+ nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
+ CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
+ ++i;
+ }
+ while (nPos < aTest.getLength());
+ nPos = aTest.getLength();
+ i = SAL_N_ELEMENTS(aDoublePositions)-1;
+ do
+ {
+ nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
+ --i;
+ CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
+ }
+ while (nPos > 0);
+ }
+ //Same walk with each "xx" collapsed to a single character; j starts at 1 because aTests[0] is handled separately below
+ const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
+ for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
+ {
+ OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j]));
+ sal_Int32 nPos = -1;
+ size_t i = 0;
+ do
+ {
+ CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
+ nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
+ CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
+ ++i;
+ }
+ while (nPos < aTest.getLength());
+ nPos = aTest.getLength();
+ i = SAL_N_ELEMENTS(aSinglePositions)-1;
+ do
+ {
+ nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
+ --i;
+ CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
+ }
+ while (nPos > 0);
+ }
+ //The apostrophe (aTests[0]) case: only 0, 1, 9 and 10 are expected as word starts
+ const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
+ CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
+ {
+ OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0]));
+ sal_Int32 nPos = -1;
+ size_t i = 0;
+ do
+ {
+ CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
+ nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
+ CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
+ ++i;
+ }
+ while (nPos < aTest.getLength());
+ nPos = aTest.getLength();
+ i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
+ do
+ {
+ nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
+ --i;
+ CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
+ }
+ while (nPos > 0);
+ }
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
+ {
+ aLocale.Language = "ca";
+ aLocale.Country = "ES";
+
+ OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
+
+ sal_Int32 nPos = 0;
+ sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
+ size_t i = 0;
+ do
+ {
+ CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
+ nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+ i18n::WordType::DICTIONARY_WORD, true).endPos;
+ CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
+ ++i;
+ }
+ while (nPos++ < aTest.getLength()); //the post-increment moves nPos past the end just found before the next query
+ CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
+ for (int j = 0; j < 3; ++j)
+ {
+ switch (j)
+ {
+ case 0:
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+ break;
+ case 1:
+ aLocale.Language = "ca";
+ aLocale.Country = "ES";
+ break;
+ case 2:
+ aLocale.Language = "fi";
+ aLocale.Country = "FI";
+ break;
+ default:
+ CPPUNIT_ASSERT(false);
+ break;
+ }
+
+ const sal_Unicode TEST[] =
+ {
+ 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
+ };
+ OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
+
+ sal_Int32 nPos = 0;
+ sal_Int32 aExpected[] = {1, 6, 9, 12};
+ size_t i = 0;
+ do
+ {
+ CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
+ nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+ i18n::WordType::DICTIONARY_WORD, true).endPos;
+ CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
+ ++i;
+ }
+ while (nPos++ < aTest.getLength());
+ CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
+ }
+
+ //https://bz.apache.org/ooo/show_bug.cgi?id=21290
+ for (int j = 0; j < 2; ++j)
+ {
+ switch (j)
+ {
+ case 0:
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+ break;
+ case 1:
+ aLocale.Language = "grc";
+ aLocale.Country.clear();
+ break;
+ default:
+ CPPUNIT_ASSERT(false);
+ break;
+ }
+
+ const sal_Unicode TEST[] =
+ {
+ 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
+ 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
+ 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
+ 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
+ };
+ OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
+
+ sal_Int32 nPos = 0;
+ sal_Int32 aExpected[] = {5, 15, 19, 26};
+ size_t i = 0;
+ do
+ {
+ CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
+ nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+ i18n::WordType::DICTIONARY_WORD, true).endPos;
+ CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
+ ++i;
+ }
+ while (nPos++ < aTest.getLength());
+ CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
+ //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
+ {
+ aLocale.Language = "fi";
+ aLocale.Country = "FI";
+
+ OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
+
+ {
+ sal_Int32 nPos = 0;
+ sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
+ size_t i = 0;
+ do
+ {
+ CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
+ nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+ i18n::WordType::WORD_COUNT, true).endPos;
+ CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
+ ++i;
+ }
+ while (nPos++ < aTest.getLength());
+ CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
+ }
+ //Here aExpected holds startPos,endPos pairs, so i advances twice per DICTIONARY_WORD boundary
+ {
+ sal_Int32 nPos = 0;
+ sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
+ 40, 41, 42, 43, 45, 46, 47, 50, 51};
+ size_t i = 0;
+ do
+ {
+ CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
+ aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
+ i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
+ ++i;
+ CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
+ ++i;
+ nPos = aBounds.endPos;
+ }
+ while (nPos++ < aTest.getLength());
+ CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
+ }
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
+ {
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+
+ const sal_Unicode TEST[] =
+ {
+ 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
+ };
+ OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
+ CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
+ {
+ aLocale.Language = "en";
+ aLocale.Country = "US";
+
+ const sal_Unicode TEST[] =
+ {
+ 'a', 0x2013, 'b', 0x2014, 'c'
+ };
+ OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
+
+ aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
+
+ aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
+ CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
+
+ aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
+ CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
+ }
+}
+
+//nextCharacters/previousCharacters in SKIPCELL mode must step over whole cells. See https://bugs.libreoffice.org/show_bug.cgi?id=40292
+//See https://bz.apache.org/ooo/show_bug.cgi?id=80412
+//See https://bz.apache.org/ooo/show_bug.cgi?id=111152
+//See https://bz.apache.org/ooo/show_bug.cgi?id=50172
+void TestBreakIterator::testGraphemeIteration()
+{
+ lang::Locale aLocale;
+ aLocale.Language = "bn";
+ aLocale.Country = "IN";
+ //Each sequence in the next blocks must be crossed in a single step, forwards and backwards
+ {
+ const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
+ OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
+
+ sal_Int32 nDone=0;
+ sal_Int32 nPos;
+ nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(BA_HALANT_LA)), nPos);
+ nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
+ }
+
+ {
+ const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
+ OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
+
+ sal_Int32 nDone=0;
+ sal_Int32 nPos;
+ nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)), nPos);
+ nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
+ }
+
+ {
+ const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
+ OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
+
+ sal_Int32 nDone=0;
+ sal_Int32 nPos;
+ nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)), nPos);
+ nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
+ }
+
+ aLocale.Language = "ta";
+ aLocale.Country = "IN";
+
+ {
+ const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
+ OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
+
+ sal_Int32 nDone=0;
+ sal_Int32 nPos = 0;
+
+ nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VIRAMA_SSA)), nPos);
+ nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
+ }
+
+ {
+ const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
+ OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
+
+ sal_Int32 nDone=0;
+ sal_Int32 nPos = 0;
+
+ nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VOWELSIGNU)), nPos);
+ nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
+ }
+
+ {
+ const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
+ { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
+ OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
+ SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
+
+ sal_Int32 nDone=0;
+ sal_Int32 nPos=0;
+ //Eight code units expected to form four cells of two units each: walk forward over all of them, then back
+ for (sal_Int32 i = 0; i < 4; ++i)
+ {
+ sal_Int32 nOldPos = nPos;
+ nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
+ }
+
+ for (sal_Int32 i = 0; i < 4; ++i)
+ {
+ sal_Int32 nOldPos = nPos;
+ nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
+ }
+ }
+ //This case counts cells in a loop and passes a default-constructed Locale instead of aLocale
+ {
+ const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
+ OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
+
+ sal_Int32 nGraphemeCount = 0;
+
+ sal_Int32 nCurPos = 0;
+ while (nCurPos < aText.getLength())
+ {
+ sal_Int32 nCount2 = 1; //passed below as both the count argument and the nDone argument
+ nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
+ i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
+ ++nGraphemeCount;
+ }
+
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
+ }
+
+ aLocale.Language = "hi";
+ aLocale.Country = "IN";
+
+ {
+ const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
+ OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
+
+ sal_Int32 nDone=0;
+ sal_Int32 nPos = 0;
+
+ nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(SHA_VOWELSIGNII)), nPos);
+ nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
+ }
+}
+
+//A test to ensure that certain ranges and codepoints that are categorized as
+//weak remain as weak, so that existing docs that depend on this don't silently
+//change font for those weak chars
+void TestBreakIterator::testWeak()
+{
+ //getScriptType() is called with just the text and an index, so no
+ //lang::Locale is set up here. Only the code points listed in WEAKS are
+ //checked, not the characters lying between the two values of a pair.
+
+ {
+ const sal_Unicode WEAKS[] =
+ {
+ 0x0001, 0x0002,
+ 0x0020, 0x00A0,
+ 0x0300, 0x036F, //Combining Diacritical Marks
+ 0x1AB0, 0x1AFF, //Combining Diacritical Marks Extended
+ 0x1DC0, 0x1DFF, //Combining Diacritical Marks Supplement
+ 0x20D0, 0x20FF, //Combining Diacritical Marks for Symbols
+ 0x2150, 0x215F, //Number Forms, fractions
+ 0x2160, 0x2180, //Number Forms, roman numerals
+ 0x2200, 0x22FF, //Mathematical Operators
+ 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
+ 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
+ 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
+ 0x2100, 0x214F, //Letterlike Symbols
+ 0x2308, 0x230B, //Miscellaneous technical
+ 0x25A0, 0x25FF, //Geometric Shapes
+ 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
+ };
+ OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
+
+ for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
+ {
+ sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
+ //Name the offending code point (in hex) in the failure message
+ OString aMsg =
+ "Char 0x" +
+ OString::number(static_cast<sal_Int32>(aWeaks[i]), 16) +
+ " should have been weak";
+ CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
+ i18n::ScriptType::WEAK, nScript);
+ }
+ }
+}
+
+//A test to ensure that certain ranges and codepoints that are categorized as
+//asian remain as asian, so that existing docs that depend on this don't silently
+//change font for those asian chars.
+//See https://bugs.libreoffice.org/show_bug.cgi?id=38095
+void TestBreakIterator::testAsian()
+{
+ //getScriptType() is called with just the text and an index, so no
+ //lang::Locale is set up here. Only the code points listed in ASIANS are
+ //checked, not the characters lying between the two values of a pair.
+
+ {
+ const sal_Unicode ASIANS[] =
+ {
+ //some typical CJK chars
+ 0x4E00, 0x62FF,
+ //The full HalfWidth and FullWidth block has historically been
+ //designated as taking the CJK font :-(
+ //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
+ //UAX24 as "Common" i.e. by that logic WEAK
+ 0xFF10, 0xFF19,
+ //HalfWidth and FullWidth forms of ASCII A-z, categorized under
+ //UAX24 as "Latin", i.e. by that logic LATIN
+ 0xFF21, 0xFF5A
+ };
+ OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
+
+ for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
+ {
+ sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
+ //Name the offending code point (in hex) in the failure message
+ OString aMsg =
+ "Char 0x" +
+ OString::number(static_cast<sal_Int32>(aAsians[i]), 16) +
+ " should have been asian";
+ CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
+ i18n::ScriptType::ASIAN, nScript);
+ }
+ }
+}
+
+#if (U_ICU_VERSION_MAJOR_NUM > 51)
+//Checks that the Lao sample is split into dictionary words at the expected offsets
+void TestBreakIterator::testLao()
+{
+ const sal_Unicode aLaoUnits[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
+ const OUString aText(aLaoUnits, SAL_N_ELEMENTS(aLaoUnits));
+
+ lang::Locale aLaoLocale;
+ aLaoLocale.Language = "lo";
+ aLaoLocale.Country = "LA";
+
+ //First word, queried from the start of the text: expected to span 0 to 5
+ i18n::Boundary aWord = m_xBreak->getWordBoundary(aText, 0, aLaoLocale,
+ i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aWord.startPos);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aWord.endPos);
+ //Second word, queried from where the first one ended: expected to span 5 to 9
+ aWord = m_xBreak->getWordBoundary(aText, aWord.endPos, aLaoLocale,
+ i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aWord.startPos);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aWord.endPos);
+}
+#endif
+
+//A test to ensure that our thai word boundary detection is useful (the last section also checks surrogate pair stepping)
+void TestBreakIterator::testThai()
+{
+ lang::Locale aLocale;
+ aLocale.Language = "th";
+ aLocale.Country = "TH";
+
+ //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
+ {
+ const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
+ OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
+ i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
+ i18n::WordType::DICTIONARY_WORD, true);
+ CPPUNIT_ASSERT_MESSAGE("Should skip full word",
+ aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
+ }
+
+ //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
+ //make sure forwards and back are consistent
+ {
+ const sal_Unicode THAI[] =
+ {
+ 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
+ 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
+ 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
+ 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
+ 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
+ 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
+ };
+ OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
+ //Walk forwards recording every nextWord start, then walk backwards and expect the same positions in reverse
+ std::stack<sal_Int32> aPositions;
+ sal_Int32 nPos = -1;
+ do
+ {
+ nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
+ aPositions.push(nPos);
+ }
+ while (nPos < aTest.getLength());
+ nPos = aTest.getLength();
+ CPPUNIT_ASSERT(!aPositions.empty());
+ aPositions.pop(); //discard the last pushed position, i.e. the one that ended the loop above
+ do
+ {
+ CPPUNIT_ASSERT(!aPositions.empty());
+ nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
+ CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
+ aPositions.pop();
+ }
+ while (nPos > 0);
+ }
+
+ // tdf#113694: a surrogate pair must be crossed in one step in both SKIPCELL and SKIPCHARACTER modes
+ {
+ const sal_Unicode NON_BMP[] = { 0xD800, 0xDC00 };
+ OUString aTest(NON_BMP, SAL_N_ELEMENTS(NON_BMP));
+
+ sal_Int32 nDone=0;
+ sal_Int32 nPos;
+
+ nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
+ nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
+ i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
+
+ nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
+ i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
+ nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
+ i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
+ CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
+ }
+}
+
+#ifdef TODO
+// Dictionary word lookup for the "nod"/"TH" locale: a lookup at index 0 has
+// to return the complete test string as one word.
+void TestBreakIterator::testNorthernThai()
+{
+    const lang::Locale aNorthernThai("nod", "TH", "");
+
+    const sal_Unicode aWord[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
+    const OUString aText(aWord, SAL_N_ELEMENTS(aWord));
+
+    const i18n::Boundary aWordBounds = m_xBreak->getWordBoundary(aText, 0, aNorthernThai,
+        i18n::WordType::DICTIONARY_WORD, true);
+
+    CPPUNIT_ASSERT_MESSAGE("Should skip full word",
+        aWordBounds.startPos == 0 && aWordBounds.endPos == aText.getLength());
+}
+
+// Not sure if any version earlier than 49 did have Khmer word boundary
+// dictionaries, 4.6 does not.
+
+// As of icu 54, word boundary detection for Khmer is still considered
+// insufficient, so icu khmer stuff is disabled
+
+//A test to ensure that our khmer word boundary detection is useful
+//https://bugs.libreoffice.org/show_bug.cgi?id=52020
+//The five-character sample is expected to split into [0,3) and [3,5).
+void TestBreakIterator::testKhmer()
+{
+    const lang::Locale aKhmer("km", "KH", "");
+
+    const sal_Unicode aSample[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
+    const OUString aText(aSample, SAL_N_ELEMENTS(aSample));
+
+    // first word, looked up from the start of the text
+    const i18n::Boundary aFirst = m_xBreak->getWordBoundary(aText, 0, aKhmer,
+        i18n::WordType::DICTIONARY_WORD, true);
+    CPPUNIT_ASSERT(aFirst.startPos == 0 && aFirst.endPos == 3);
+
+    // second word, looked up from where the first one ended
+    const i18n::Boundary aSecond = m_xBreak->getWordBoundary(aText, aFirst.endPos, aKhmer,
+        i18n::WordType::DICTIONARY_WORD, true);
+    CPPUNIT_ASSERT(aSecond.startPos == 3 && aSecond.endPos == 5);
+}
+#endif
+
+// Runs the Japanese dictionary word checks against the given break iterator,
+// so the same expectations can be applied to more than one instance.
+void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
+{
+    const lang::Locale aJapanese("ja", "JP", "");
+
+    // Seven characters: a lookup at index 5 must report the whole text [0,7).
+    {
+        const sal_Unicode aChars[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
+        const OUString aText(aChars, SAL_N_ELEMENTS(aChars));
+
+        const i18n::Boundary aWord = xBreak->getWordBoundary(aText, 5, aJapanese,
+            i18n::WordType::DICTIONARY_WORD, true);
+
+        CPPUNIT_ASSERT(aWord.startPos == 0 && aWord.endPos == 7);
+    }
+
+    // The same three characters twice: each half is expected as its own word.
+    {
+        const sal_Unicode aChars[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
+        const OUString aText(aChars, SAL_N_ELEMENTS(aChars));
+
+        const i18n::Boundary aFirstHalf = xBreak->getWordBoundary(aText, 1, aJapanese,
+            i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT(aFirstHalf.startPos == 0 && aFirstHalf.endPos == 3);
+
+        const i18n::Boundary aSecondHalf = xBreak->getWordBoundary(aText, 5, aJapanese,
+            i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT(aSecondHalf.startPos == 3 && aSecondHalf.endPos == 6);
+    }
+}
+
+// Applies the Japanese checks to the fixture's break iterator and then to a
+// freshly created second instance.
+void TestBreakIterator::testJapanese()
+{
+    doTestJapanese(m_xBreak);
+
+    // fdo#78479 - test second / cached instantiation of xdictionary
+    const uno::Reference< uno::XInterface > xInstance(
+        m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"));
+    const uno::Reference< i18n::XBreakIterator > xSecondBreak(xInstance, uno::UNO_QUERY_THROW);
+
+    doTestJapanese(xSecondBreak);
+}
+
+// Dictionary word lookup for zh-CN on a text whose last two code units are a
+// surrogate pair: a lookup at index 4 must report exactly [4,6).
+void TestBreakIterator::testChinese()
+{
+    const lang::Locale aChinese("zh", "CN", "");
+
+    const sal_Unicode aChars[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB };
+    const OUString aText(aChars, SAL_N_ELEMENTS(aChars));
+
+    const i18n::Boundary aWord = m_xBreak->getWordBoundary(aText, 4, aChinese,
+        i18n::WordType::DICTIONARY_WORD, true);
+
+    CPPUNIT_ASSERT(aWord.startPos == 4 && aWord.endPos == 6);
+}
+// Sets up the base fixture, then instantiates the break iterator service
+// used by every test; throws if the service lacks the expected interface.
+void TestBreakIterator::setUp()
+{
+    BootstrapFixtureBase::setUp();
+
+    const uno::Reference< uno::XInterface > xInstance(
+        m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"));
+    m_xBreak.set(xInstance, uno::UNO_QUERY_THROW);
+}
+
+// Releases the break iterator reference acquired in setUp() and then lets
+// the base fixture tear itself down.
+void TestBreakIterator::tearDown()
+{
+ // drop our reference first, before the base fixture's own teardown runs
+ m_xBreak.clear();
+ BootstrapFixtureBase::tearDown();
+}
+
+CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
+
+CPPUNIT_PLUGIN_IMPLEMENT();
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/i18npool/qa/cppunit/test_characterclassification.cxx b/i18npool/qa/cppunit/test_characterclassification.cxx
new file mode 100644
index 000000000..4af398440
--- /dev/null
+++ b/i18npool/qa/cppunit/test_characterclassification.cxx
@@ -0,0 +1,105 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <com/sun/star/i18n/XCharacterClassification.hpp>
+#include <unotest/bootstrapfixturebase.hxx>
+
+using namespace ::com::sun::star;
+
+// CppUnit fixture for the com.sun.star.i18n.CharacterClassification service:
+// setUp() creates the service, the tests call it through m_xCC.
+class TestCharacterClassification : public test::BootstrapFixtureBase
+{
+public:
+ virtual void setUp() override;
+ virtual void tearDown() override;
+
+ // toTitle()/toUpper()/toLower() on plain and on digraph input
+ void testTitleCase();
+ // getStringType() on plain text and on a surrogate pair
+ void testStringType();
+
+ CPPUNIT_TEST_SUITE(TestCharacterClassification);
+ CPPUNIT_TEST(testTitleCase);
+ CPPUNIT_TEST(testStringType);
+ CPPUNIT_TEST_SUITE_END();
+private:
+ // service under test, set in setUp() and cleared in tearDown()
+ uno::Reference<i18n::XCharacterClassification> m_xCC;
+};
+
+//A test to ensure that our Title Case functionality is working
+//http://lists.freedesktop.org/archives/libreoffice/2012-June/032767.html
+//https://bz.apache.org/ooo/show_bug.cgi?id=30863
+void TestCharacterClassification::testTitleCase()
+{
+    const lang::Locale aEnglishUS("en", "US", "");
+
+    {
+        //basic example
+        const OUString aInput("Some text");
+        const sal_Int32 nLen = aInput.getLength();
+
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be title", OUString("Some Text"),
+            m_xCC->toTitle(aInput, 0, nLen, aEnglishUS));
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be upper", OUString("SOME TEXT"),
+            m_xCC->toUpper(aInput, 0, nLen, aEnglishUS));
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be lower ", OUString("some text"),
+            m_xCC->toLower(aInput, 0, nLen, aEnglishUS));
+    }
+
+    {
+        //tricky one: U+01F3 is expected to map to three distinct code points,
+        //U+01F2 (title), U+01F1 (upper) and itself (lower)
+        const sal_Unicode aSmallDz[] = { 0x01F3 };
+        const OUString aInput(aSmallDz, SAL_N_ELEMENTS(aSmallDz));
+        const sal_Int32 nLen = aInput.getLength();
+
+        const OUString aTitle = m_xCC->toTitle(aInput, 0, nLen, aEnglishUS);
+        CPPUNIT_ASSERT_MESSAGE("Should be title", aTitle.getLength() == 1 && aTitle[0] == 0x01F2);
+
+        const OUString aUpper = m_xCC->toUpper(aInput, 0, nLen, aEnglishUS);
+        CPPUNIT_ASSERT_MESSAGE("Should be upper", aUpper.getLength() == 1 && aUpper[0] == 0x01F1);
+
+        const OUString aLower = m_xCC->toLower(aInput, 0, nLen, aEnglishUS);
+        CPPUNIT_ASSERT_MESSAGE("Should be lower ", aLower.getLength() == 1 && aLower[0] == 0x01F3);
+    }
+}
+
+//https://bugs.libreoffice.org/show_bug.cgi?id=69641
+//Pins the values getStringType() reports for a plain string and for a
+//string consisting of one surrogate pair.
+void TestCharacterClassification::testStringType()
+{
+    const lang::Locale aEnglishUS("en", "US", "");
+
+    {
+        //simple case
+        const OUString aPlain("Some text");
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(230),
+            m_xCC->getStringType(aPlain, 0, aPlain.getLength(), aEnglishUS));
+    }
+
+    {
+        //tricky case
+        const sal_Unicode aItalicTheta[] = { 0xD835, 0xDF03 };
+        const OUString aNonBmp(aItalicTheta, SAL_N_ELEMENTS(aItalicTheta));
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(228),
+            m_xCC->getStringType(aNonBmp, 0, aNonBmp.getLength(), aEnglishUS));
+    }
+}
+
+// Sets up the base fixture, then instantiates the character classification
+// service; throws if the service lacks the expected interface.
+void TestCharacterClassification::setUp()
+{
+    BootstrapFixtureBase::setUp();
+
+    const uno::Reference<uno::XInterface> xInstance(
+        m_xSFactory->createInstance("com.sun.star.i18n.CharacterClassification"));
+    m_xCC.set(xInstance, uno::UNO_QUERY_THROW);
+}
+
+// Releases the character classification service and then tears down the
+// base fixture.
+void TestCharacterClassification::tearDown()
+{
+    // Release our service reference before the base fixture shuts down, so
+    // teardown mirrors setUp() in reverse, the same order
+    // TestBreakIterator::tearDown() and TestOrdinalSuffix::tearDown() use.
+    m_xCC.clear();
+    BootstrapFixtureBase::tearDown();
+}
+
+CPPUNIT_TEST_SUITE_REGISTRATION(TestCharacterClassification);
+
+CPPUNIT_PLUGIN_IMPLEMENT();
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/i18npool/qa/cppunit/test_defaultnumberingprovider.cxx b/i18npool/qa/cppunit/test_defaultnumberingprovider.cxx
new file mode 100644
index 000000000..dbe55050a
--- /dev/null
+++ b/i18npool/qa/cppunit/test_defaultnumberingprovider.cxx
@@ -0,0 +1,131 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <test/bootstrapfixture.hxx>
+
+#include <com/sun/star/style/NumberingType.hpp>
+#include <com/sun/star/text/DefaultNumberingProvider.hpp>
+#include <com/sun/star/text/XNumberingFormatter.hpp>
+
+#include <comphelper/propertyvalue.hxx>
+
+using namespace ::com::sun::star;
+
+/// i18npool defaultnumberingprovider tests.
+///
+/// Adds nothing to test::BootstrapFixture; it only serves as the fixture
+/// type named by the CPPUNIT_TEST_FIXTURE tests in this file.
+class I18npoolDefaultnumberingproviderTest : public test::BootstrapFixture
+{
+};
+
+// ARABIC_ZERO numbering: 1 is expected to be formatted as "01", while 10
+// stays "10".
+CPPUNIT_TEST_FIXTURE(I18npoolDefaultnumberingproviderTest, testArabicZero)
+{
+    // UNO_QUERY_THROW instead of UNO_QUERY: should the provider not offer
+    // XNumberingFormatter, fail right here with an exception instead of
+    // dereferencing an empty reference below.
+    uno::Reference<text::XNumberingFormatter> xFormatter(
+        text::DefaultNumberingProvider::create(mxComponentContext), uno::UNO_QUERY_THROW);
+
+    // 1 -> "01"
+    uno::Sequence<beans::PropertyValue> aProperties = {
+        comphelper::makePropertyValue("NumberingType",
+                                      static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO)),
+        comphelper::makePropertyValue("Value", static_cast<sal_Int32>(1)),
+    };
+    lang::Locale aLocale;
+    OUString aActual = xFormatter->makeNumberingString(aProperties, aLocale);
+    // Without the accompanying fix in place, this test would have failed with a
+    // lang.IllegalArgumentException, support for ARABIC_ZERO was missing.
+    CPPUNIT_ASSERT_EQUAL(OUString("01"), aActual);
+
+    // 10 -> "10"
+    aProperties = {
+        comphelper::makePropertyValue("NumberingType",
+                                      static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO)),
+        comphelper::makePropertyValue("Value", static_cast<sal_Int32>(10)),
+    };
+    aActual = xFormatter->makeNumberingString(aProperties, aLocale);
+    CPPUNIT_ASSERT_EQUAL(OUString("10"), aActual);
+}
+
+// ARABIC_ZERO3 numbering: 10 is expected to be formatted as "010", while 100
+// stays "100".
+CPPUNIT_TEST_FIXTURE(I18npoolDefaultnumberingproviderTest, testArabicZero3)
+{
+    // UNO_QUERY_THROW instead of UNO_QUERY: should the provider not offer
+    // XNumberingFormatter, fail right here with an exception instead of
+    // dereferencing an empty reference below.
+    uno::Reference<text::XNumberingFormatter> xFormatter(
+        text::DefaultNumberingProvider::create(mxComponentContext), uno::UNO_QUERY_THROW);
+
+    // 10 -> "010"
+    uno::Sequence<beans::PropertyValue> aProperties = {
+        comphelper::makePropertyValue("NumberingType",
+                                      static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO3)),
+        comphelper::makePropertyValue("Value", static_cast<sal_Int32>(10)),
+    };
+    lang::Locale aLocale;
+    OUString aActual = xFormatter->makeNumberingString(aProperties, aLocale);
+    // Without the accompanying fix in place, this test would have failed with a
+    // lang.IllegalArgumentException, support for ARABIC_ZERO3 was missing.
+    CPPUNIT_ASSERT_EQUAL(OUString("010"), aActual);
+
+    // 100 -> "100"
+    aProperties = {
+        comphelper::makePropertyValue("NumberingType",
+                                      static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO3)),
+        comphelper::makePropertyValue("Value", static_cast<sal_Int32>(100)),
+    };
+    aActual = xFormatter->makeNumberingString(aProperties, aLocale);
+    CPPUNIT_ASSERT_EQUAL(OUString("100"), aActual);
+}
+
+// ARABIC_ZERO4 numbering: 100 is expected to be formatted as "0100", while
+// 1000 stays "1000".
+CPPUNIT_TEST_FIXTURE(I18npoolDefaultnumberingproviderTest, testArabicZero4)
+{
+    // UNO_QUERY_THROW instead of UNO_QUERY: should the provider not offer
+    // XNumberingFormatter, fail right here with an exception instead of
+    // dereferencing an empty reference below.
+    uno::Reference<text::XNumberingFormatter> xFormatter(
+        text::DefaultNumberingProvider::create(mxComponentContext), uno::UNO_QUERY_THROW);
+
+    // 100 -> "0100"
+    uno::Sequence<beans::PropertyValue> aProperties = {
+        comphelper::makePropertyValue("NumberingType",
+                                      static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO4)),
+        comphelper::makePropertyValue("Value", static_cast<sal_Int32>(100)),
+    };
+    lang::Locale aLocale;
+    OUString aActual = xFormatter->makeNumberingString(aProperties, aLocale);
+    // Without the accompanying fix in place, this test would have failed with a
+    // lang.IllegalArgumentException, support for ARABIC_ZERO4 was missing.
+    CPPUNIT_ASSERT_EQUAL(OUString("0100"), aActual);
+
+    // 1000 -> "1000"
+    aProperties = {
+        comphelper::makePropertyValue("NumberingType",
+                                      static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO4)),
+        comphelper::makePropertyValue("Value", static_cast<sal_Int32>(1000)),
+    };
+    aActual = xFormatter->makeNumberingString(aProperties, aLocale);
+    CPPUNIT_ASSERT_EQUAL(OUString("1000"), aActual);
+}
+
+// ARABIC_ZERO5 numbering: 1000 is expected to be formatted as "01000", while
+// 10000 stays "10000".
+CPPUNIT_TEST_FIXTURE(I18npoolDefaultnumberingproviderTest, testArabicZero5)
+{
+    // UNO_QUERY_THROW instead of UNO_QUERY: should the provider not offer
+    // XNumberingFormatter, fail right here with an exception instead of
+    // dereferencing an empty reference below.
+    uno::Reference<text::XNumberingFormatter> xFormatter(
+        text::DefaultNumberingProvider::create(mxComponentContext), uno::UNO_QUERY_THROW);
+
+    // 1000 -> "01000"
+    uno::Sequence<beans::PropertyValue> aProperties = {
+        comphelper::makePropertyValue("NumberingType",
+                                      static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO5)),
+        comphelper::makePropertyValue("Value", static_cast<sal_Int32>(1000)),
+    };
+    lang::Locale aLocale;
+    OUString aActual = xFormatter->makeNumberingString(aProperties, aLocale);
+    // Without the accompanying fix in place, this test would have failed with a
+    // lang.IllegalArgumentException, support for ARABIC_ZERO5 was missing.
+    CPPUNIT_ASSERT_EQUAL(OUString("01000"), aActual);
+
+    // 10000 -> "10000"
+    aProperties = {
+        comphelper::makePropertyValue("NumberingType",
+                                      static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO5)),
+        comphelper::makePropertyValue("Value", static_cast<sal_Int32>(10000)),
+    };
+    aActual = xFormatter->makeNumberingString(aProperties, aLocale);
+    CPPUNIT_ASSERT_EQUAL(OUString("10000"), aActual);
+}
+
+CPPUNIT_PLUGIN_IMPLEMENT();
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/i18npool/qa/cppunit/test_ordinalsuffix.cxx b/i18npool/qa/cppunit/test_ordinalsuffix.cxx
new file mode 100644
index 000000000..fb06a41fa
--- /dev/null
+++ b/i18npool/qa/cppunit/test_ordinalsuffix.cxx
@@ -0,0 +1,100 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+#include <algorithm>
+#include <com/sun/star/i18n/XOrdinalSuffix.hpp>
+#include <com/sun/star/lang/Locale.hpp>
+#include <unotest/bootstrapfixturebase.hxx>
+
+using namespace com::sun::star;
+
+// CppUnit fixture for the com.sun.star.i18n.OrdinalSuffix service: setUp()
+// creates the service, the tests query it through m_xOrdinal.
+class TestOrdinalSuffix : public test::BootstrapFixtureBase
+{
+private:
+ // service under test, set in setUp() and cleared in tearDown()
+ uno::Reference<i18n::XOrdinalSuffix> m_xOrdinal;
+
+public:
+ virtual void setUp() override;
+ virtual void tearDown() override;
+
+ // ordinal suffixes for the fr-LU locale
+ void testFrench();
+ // ordinal suffixes for the en-US locale
+ void testEnglish();
+
+ CPPUNIT_TEST_SUITE(TestOrdinalSuffix);
+ CPPUNIT_TEST(testFrench);
+ CPPUNIT_TEST(testEnglish);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+// Sets up the base fixture, then instantiates the ordinal suffix service;
+// throws if the service lacks the expected interface.
+void TestOrdinalSuffix::setUp()
+{
+    BootstrapFixtureBase::setUp();
+
+    const uno::Reference<uno::XInterface> xInstance(
+        m_xSFactory->createInstance("com.sun.star.i18n.OrdinalSuffix"));
+    m_xOrdinal.set(xInstance, uno::UNO_QUERY_THROW);
+}
+
+// Releases the ordinal suffix service acquired in setUp() and then lets the
+// base fixture tear itself down.
+void TestOrdinalSuffix::tearDown()
+{
+ // drop our reference first, before the base fixture's own teardown runs
+ m_xOrdinal.clear();
+ BootstrapFixtureBase::tearDown();
+}
+
+// For fr-LU the suffix list for 1 must contain "er" and the list for 2 must
+// contain "e".
+void TestOrdinalSuffix::testFrench()
+{
+    const lang::Locale aFrench("fr", "LU", "");
+
+    //1er
+    uno::Sequence< OUString > aForOne = m_xOrdinal->getOrdinalSuffix(1, aFrench);
+    OUString* const pOneBegin = aForOne.begin();
+    OUString* const pOneEnd = aForOne.end();
+    CPPUNIT_ASSERT(std::find(pOneBegin, pOneEnd, OUString("er")) != pOneEnd);
+
+    //2e, 3e, etc.
+    uno::Sequence< OUString > aForTwo = m_xOrdinal->getOrdinalSuffix(2, aFrench);
+    OUString* const pTwoBegin = aForTwo.begin();
+    OUString* const pTwoEnd = aForTwo.end();
+    CPPUNIT_ASSERT(std::find(pTwoBegin, pTwoEnd, OUString("e")) != pTwoEnd);
+}
+
+// For en-US the suffix lists for 1, 2 and 3 must contain "st", "nd" and "rd"
+// respectively.
+void TestOrdinalSuffix::testEnglish()
+{
+    const lang::Locale aEnglish("en", "US", "");
+
+    //1st
+    uno::Sequence< OUString > aForOne = m_xOrdinal->getOrdinalSuffix(1, aEnglish);
+    OUString* const pOneBegin = aForOne.begin();
+    OUString* const pOneEnd = aForOne.end();
+    CPPUNIT_ASSERT(std::find(pOneBegin, pOneEnd, OUString("st")) != pOneEnd);
+
+    //2nd
+    uno::Sequence< OUString > aForTwo = m_xOrdinal->getOrdinalSuffix(2, aEnglish);
+    OUString* const pTwoBegin = aForTwo.begin();
+    OUString* const pTwoEnd = aForTwo.end();
+    CPPUNIT_ASSERT(std::find(pTwoBegin, pTwoEnd, OUString("nd")) != pTwoEnd);
+
+    //3rd
+    uno::Sequence< OUString > aForThree = m_xOrdinal->getOrdinalSuffix(3, aEnglish);
+    OUString* const pThreeBegin = aForThree.begin();
+    OUString* const pThreeEnd = aForThree.end();
+    CPPUNIT_ASSERT(std::find(pThreeBegin, pThreeEnd, OUString("rd")) != pThreeEnd);
+}
+
+
+CPPUNIT_TEST_SUITE_REGISTRATION( TestOrdinalSuffix );
+
+CPPUNIT_PLUGIN_IMPLEMENT();
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/i18npool/qa/cppunit/test_textsearch.cxx b/i18npool/qa/cppunit/test_textsearch.cxx
new file mode 100644
index 000000000..22ded53cd
--- /dev/null
+++ b/i18npool/qa/cppunit/test_textsearch.cxx
@@ -0,0 +1,402 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <com/sun/star/util/SearchFlags.hpp>
+#include <com/sun/star/util/SearchOptions.hpp>
+#include <com/sun/star/util/SearchAlgorithms2.hpp>
+#include <com/sun/star/util/XTextSearch2.hpp>
+#include <unotest/bootstrapfixturebase.hxx>
+#include <i18nutil/transliteration.hxx>
+
+#include <unicode/regex.h>
+
+using namespace ::com::sun::star;
+typedef U_ICU_NAMESPACE::UnicodeString IcuUniString;
+
+// CppUnit fixture for text searching: one test talks to ICU's regex matcher
+// directly, the others go through the XTextSearch / XTextSearch2 references
+// held below.
+class TestTextSearch : public test::BootstrapFixtureBase
+{
+public:
+ virtual void setUp() override;
+ virtual void tearDown() override;
+
+ // direct icu::RegexMatcher sanity check
+ void testICU();
+ // regular expression search, forwards and backwards
+ void testSearches();
+ // wildcard search with '?', '*' and the '~' escape character
+ void testWildcardSearch();
+ // ASCII versus typographic apostrophe matching
+ void testApostropheSearch();
+
+ CPPUNIT_TEST_SUITE(TestTextSearch);
+ CPPUNIT_TEST(testICU);
+ CPPUNIT_TEST(testSearches);
+ CPPUNIT_TEST(testWildcardSearch);
+ CPPUNIT_TEST(testApostropheSearch);
+ CPPUNIT_TEST_SUITE_END();
+private:
+ // search interface used by testSearches() and testApostropheSearch()
+ uno::Reference<util::XTextSearch> m_xSearch;
+ // search interface used by testWildcardSearch() (setOptions2)
+ uno::Reference<util::XTextSearch2> m_xSearch2;
+};
+
+// Sanity check our ICU first ...
+// Compiles two trivial patterns with icu::RegexMatcher and checks where the
+// first match is found, verifying the ICU error code after every step.
+void TestTextSearch::testICU()
+{
+    UErrorCode nErr = U_ZERO_ERROR;
+    sal_uInt32 nSearchFlags = UREGEX_UWORD | UREGEX_CASE_INSENSITIVE;
+
+    OUString aString( "abcdefgh" );
+    OUString aPattern( "e" );
+    IcuUniString aSearchPat( reinterpret_cast<const UChar*>(aPattern.getStr()), aPattern.getLength() );
+
+    std::unique_ptr<icu::RegexMatcher> pRegexMatcher(new icu::RegexMatcher( aSearchPat, nSearchFlags, nErr ));
+    // Pattern compilation reports problems through nErr only; check it here
+    // so a broken matcher is flagged at its source and not via find() below.
+    CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr );
+
+    IcuUniString aSource( reinterpret_cast<const UChar*>(aString.getStr()), aString.getLength() );
+    pRegexMatcher->reset( aSource );
+
+    // "e" is expected at [4,5) in "abcdefgh"
+    CPPUNIT_ASSERT( pRegexMatcher->find( 0, nErr ) );
+    CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr );
+    CPPUNIT_ASSERT_EQUAL( static_cast<int32_t>(4), pRegexMatcher->start( nErr ) );
+    CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr );
+    CPPUNIT_ASSERT_EQUAL( static_cast<int32_t>(5), pRegexMatcher->end( nErr ) );
+    CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr );
+
+    OUString aString2( "acababaabcababadcdaa" );
+    OUString aPattern2( "a" );
+
+    IcuUniString aSearchPat2( reinterpret_cast<const UChar*>(aPattern2.getStr()), aPattern2.getLength() );
+    pRegexMatcher.reset(new icu::RegexMatcher( aSearchPat2, nSearchFlags, nErr ));
+    // same check for the second matcher
+    CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr );
+
+    IcuUniString aSource2( reinterpret_cast<const UChar*>(aString2.getStr()), aString2.getLength() );
+    pRegexMatcher->reset( aSource2 );
+
+    // the first "a" is expected at [0,1)
+    CPPUNIT_ASSERT( pRegexMatcher->find( 0, nErr ) );
+    CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr );
+    CPPUNIT_ASSERT_EQUAL( static_cast<int32_t>(0), pRegexMatcher->start( nErr ) );
+    CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr );
+    CPPUNIT_ASSERT_EQUAL( static_cast<int32_t>(1), pRegexMatcher->end( nErr ) );
+    CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr );
+}
+
+// Regular expression search through XTextSearch: a forward and a backward
+// search for the same pattern, then a pattern with two capture groups.
+void TestTextSearch::testSearches()
+{
+    const OUString aHaystack( "acababaabcababadcdaa" );
+    const sal_Int32 nRangeStart = 2;
+    const sal_Int32 nRangeEnd = 20;
+
+    // set options
+    util::SearchOptions aOptions;
+    aOptions.algorithmType = util::SearchAlgorithms_REGEXP;
+    aOptions.searchFlag = util::SearchFlags::ALL_IGNORE_CASE;
+    aOptions.searchString = "(ab)*a(c|d)+";
+    m_xSearch->setOptions( aOptions );
+
+    // search forward: the match is expected at [10,18)
+    util::SearchResult aForward = m_xSearch->searchForward( aHaystack, nRangeStart, nRangeEnd );
+    CPPUNIT_ASSERT( aForward.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( sal_Int32(10), aForward.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( sal_Int32(18), aForward.endOffset[0] );
+
+    // search backwards: same span, reported with start and end swapped
+    util::SearchResult aBackward = m_xSearch->searchBackward( aHaystack, nRangeEnd, nRangeStart );
+    CPPUNIT_ASSERT( aBackward.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( sal_Int32(18), aBackward.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( sal_Int32(10), aBackward.endOffset[0] );
+
+    // two capture groups, searched with transliteration enabled
+    aOptions.transliterateFlags = static_cast<int>(TransliterationFlags::IGNORE_CASE
+                                                   | TransliterationFlags::IGNORE_WIDTH);
+    aOptions.searchString = "([^ ]*)[ ]*([^ ]*)";
+    m_xSearch->setOptions(aOptions);
+
+    util::SearchResult aGroups = m_xSearch->searchForward("11 22 33", 2, 7);
+    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(3), aGroups.subRegExpressions);
+    // whole match, then the (empty) first group, then the second group
+    CPPUNIT_ASSERT((aGroups.startOffset[0] == 2) && (aGroups.endOffset[0] == 5));
+    CPPUNIT_ASSERT((aGroups.startOffset[1] == 2) && (aGroups.endOffset[1] == 2));
+    CPPUNIT_ASSERT((aGroups.startOffset[2] == 3) && (aGroups.endOffset[2] == 5));
+}
+
+// Wildcard search through XTextSearch2 with '~' as escape character and
+// case-insensitive transliteration. Each section sets a new search string,
+// then checks the forward result as [start,end) and the backward result as
+// (start,end], i.e. with the two offsets swapped.
+void TestTextSearch::testWildcardSearch()
+{
+ util::SearchOptions2 aOptions;
+ OUString aText;
+ util::SearchResult aRes;
+
+ aOptions.AlgorithmType2 = util::SearchAlgorithms2::WILDCARD ;
+ aOptions.WildcardEscapeCharacter = '~';
+ // aOptions.searchFlag = ::css::util::SearchFlags::WILD_MATCH_SELECTION;
+ // is not set, so substring match is allowed.
+ aOptions.transliterateFlags = sal_Int32(::css::i18n::TransliterationModules::TransliterationModules_IGNORE_CASE);
+ aText = "abAca";
+
+ aOptions.searchString = "a";
+ m_xSearch2->setOptions2( aOptions );
+ // match first "a", [0,1)
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 1));
+ // match last "a", (5,4]
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 5) && (aRes.endOffset[0] == 4));
+
+ aOptions.searchString = "a?";
+ m_xSearch2->setOptions2( aOptions );
+ // match "ab", [0,2)
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 2));
+ // match "ac", (4,2]
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 4) && (aRes.endOffset[0] == 2));
+
+ aOptions.searchString = "a*c";
+ m_xSearch2->setOptions2( aOptions );
+ // match "abac", [0,4) XXX NOTE: first match forward
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 4));
+ // match "ac", (4,2] XXX NOTE: first match backward, not greedy
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 4) && (aRes.endOffset[0] == 2));
+
+ aOptions.searchString = "b*a";
+ m_xSearch2->setOptions2( aOptions );
+ // match "ba", [1,3) XXX NOTE: first match forward, not greedy
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 1) && (aRes.endOffset[0] == 3));
+ // match "baca", (5,1] XXX NOTE: first match backward
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 5) && (aRes.endOffset[0] == 1));
+
+ aText = "ab?ca";
+
+ // "~?" is an escaped, i.e. literal, question mark
+ aOptions.searchString = "?~??";
+ m_xSearch2->setOptions2( aOptions );
+ // match "b?c", [1,4)
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 1) && (aRes.endOffset[0] == 4));
+ // match "b?c", (4,1]
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 4) && (aRes.endOffset[0] == 1));
+
+ aText = "ab*ca";
+
+ // "~*" is an escaped, i.e. literal, asterisk
+ aOptions.searchString = "?~*?";
+ m_xSearch2->setOptions2( aOptions );
+ // match "b*c", [1,4)
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 1) && (aRes.endOffset[0] == 4));
+ // match "b*c", (4,1]
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 4) && (aRes.endOffset[0] == 1));
+
+ aOptions.searchString = "ca?";
+ m_xSearch2->setOptions2( aOptions );
+ // no match
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions);
+ // no match
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions);
+
+ aOptions.searchString = "ca*";
+ m_xSearch2->setOptions2( aOptions );
+ // match "ca", [3,5)
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 3) && (aRes.endOffset[0] == 5));
+ // match "ca", (5,3]
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 5) && (aRes.endOffset[0] == 3));
+
+ aOptions.searchString = "*ca*";
+ m_xSearch2->setOptions2( aOptions );
+ // match "ab*ca", [0,5)
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 5));
+ // match "ab*ca", (5,0]
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 5) && (aRes.endOffset[0] == 0));
+
+ aText = "123123";
+ aOptions.searchString = "*2?";
+ m_xSearch2->setOptions2( aOptions );
+ // match first "123", [0,3)
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 3));
+ // match "123123", (6,0] Yes this looks odd, but it is as searching "?2*" forward.
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 6) && (aRes.endOffset[0] == 0));
+
+ // same search string "*2?", now with WILD_MATCH_SELECTION added
+ aOptions.searchFlag |= util::SearchFlags::WILD_MATCH_SELECTION;
+ m_xSearch2->setOptions2( aOptions );
+ // match "123123", [0,6) with greedy '*'
+ aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 6));
+ // match "123123", (6,0]
+ aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+ CPPUNIT_ASSERT((aRes.startOffset[0] == 6) && (aRes.endOffset[0] == 0));
+}
+
+// Checks how the ASCII apostrophe (') and the typographic apostrophe (U+2019)
+// match each other in an ABSOLUTE (literal) search. Sections A-F vary the
+// apostrophe used in the search string and in the text; every section
+// searches both forwards and backwards.
+void TestTextSearch::testApostropheSearch()
+{
+ // A) find typographic apostrophes also by using ASCII apostrophe in searchString
+ OUString str( u"It\u2019s an apostrophe." );
+ sal_Int32 startPos = 0, endPos = str.getLength();
+
+ // set options
+ util::SearchOptions aOptions;
+ aOptions.algorithmType = util::SearchAlgorithms_ABSOLUTE;
+ aOptions.searchFlag = util::SearchFlags::ALL_IGNORE_CASE;
+ aOptions.searchString = "'";
+ m_xSearch->setOptions( aOptions );
+
+ util::SearchResult aRes;
+
+ // search forward
+ aRes = m_xSearch->searchForward( str, startPos, endPos );
+ // This was 0.
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.startOffset[0] );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.endOffset[0] );
+
+ // search backwards
+ aRes = m_xSearch->searchBackward( str, endPos, startPos );
+ // This was 0.
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.startOffset[0] );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.endOffset[0] );
+
+ // check with transliteration
+ aOptions.transliterateFlags = static_cast<int>(TransliterationFlags::IGNORE_CASE
+ | TransliterationFlags::IGNORE_WIDTH);
+ m_xSearch->setOptions(aOptions);
+
+ // search forward
+ aRes = m_xSearch->searchForward( str, startPos, endPos );
+ // This was 0.
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.startOffset[0] );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.endOffset[0] );
+
+ // search backwards
+ aRes = m_xSearch->searchBackward( str, endPos, startPos );
+ // This was 0.
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.startOffset[0] );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.endOffset[0] );
+
+ // B) search ASCII apostrophe in a text with ASCII apostrophes
+ // (one character replaces another, so endPos computed above stays valid)
+ str = str.replace(u'\u2019', '\'');
+
+ // search forward
+ aRes = m_xSearch->searchForward( str, startPos, endPos );
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.startOffset[0] );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.endOffset[0] );
+
+ // search backwards
+ aRes = m_xSearch->searchBackward( str, endPos, startPos );
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.startOffset[0] );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.endOffset[0] );
+
+ // C) search typographic apostrophe in a text with ASCII apostrophes (no result)
+ aOptions.searchString = OUString(u"\u2019");
+ m_xSearch->setOptions( aOptions );
+
+ aRes = m_xSearch->searchForward( str, startPos, endPos );
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions);
+
+ aRes = m_xSearch->searchBackward( str, endPos, startPos );
+ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions);
+
+ // D) search typographic apostrophe in a text with typographic apostrophes
+ str = str.replace('\'', u'\u2019');
+
+ // search forward
+ aRes = m_xSearch->searchForward( str, startPos, endPos );
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.startOffset[0] );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.endOffset[0] );
+
+ // search backwards
+ aRes = m_xSearch->searchBackward( str, endPos, startPos );
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.startOffset[0] );
+ CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.endOffset[0] );
+
+ // E) search mixed apostrophes in a text with mixed apostrophes:
+ aOptions.searchString = OUString(u"'\u2019");
+ m_xSearch->setOptions( aOptions );
+ // new, shorter text: from here on the end of the range is str.getLength(),
+ // not the endPos computed for the first text
+ str = u"test: \u2019'";
+
+ // search forward
+ aRes = m_xSearch->searchForward( str, startPos, str.getLength());
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+
+ // search backwards
+ aRes = m_xSearch->searchBackward( str, str.getLength(), startPos );
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+
+ // F) search mixed apostrophes in a text with ASCII apostrophes:
+ str = u"test: ''";
+
+ // search forward
+ aRes = m_xSearch->searchForward( str, startPos, str.getLength());
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+
+ // search backwards
+ aRes = m_xSearch->searchBackward( str, str.getLength(), startPos );
+ CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+}
+
+void TestTextSearch::setUp() // per-test fixture setup: obtains the TextSearch and TextSearch2 service instances
+{
+ BootstrapFixtureBase::setUp(); // base fixture setup runs first; m_xSFactory is used right below (presumably initialised by the base - confirm in unotest)
+ m_xSearch.set(m_xSFactory->createInstance("com.sun.star.util.TextSearch"), uno::UNO_QUERY_THROW); // UNO_QUERY_THROW: a failed interface query is expected to throw rather than leave m_xSearch empty
+ m_xSearch2.set(m_xSFactory->createInstance("com.sun.star.util.TextSearch2"), uno::UNO_QUERY_THROW); // same pattern for the TextSearch2 service
+}
+
+void TestTextSearch::tearDown() // per-test fixture teardown: releases what setUp() acquired, then tears down the base fixture
+{
+ m_xSearch.clear(); // drop the service references obtained in setUp() ...
+ m_xSearch2.clear();
+ BootstrapFixtureBase::tearDown(); // ... before the base fixture itself is torn down
+}
+
+CPPUNIT_TEST_SUITE_REGISTRATION(TestTextSearch); // registers the TestTextSearch suite with CppUnit's test factory registry
+
+CPPUNIT_PLUGIN_IMPLEMENT(); // CppUnit test plug-in boilerplate for this test library (expansion not visible here - see cppunit/plugin/TestPlugIn.h)
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */