diff options
Diffstat (limited to '')
-rw-r--r-- | i18npool/qa/cppunit/test_breakiterator.cxx | 1065 | ||||
-rw-r--r-- | i18npool/qa/cppunit/test_characterclassification.cxx | 105 | ||||
-rw-r--r-- | i18npool/qa/cppunit/test_defaultnumberingprovider.cxx | 131 | ||||
-rw-r--r-- | i18npool/qa/cppunit/test_ordinalsuffix.cxx | 100 | ||||
-rw-r--r-- | i18npool/qa/cppunit/test_textsearch.cxx | 402 |
5 files changed, 1803 insertions, 0 deletions
diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx new file mode 100644 index 000000000..a41cd8218 --- /dev/null +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -0,0 +1,1065 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include <com/sun/star/i18n/XBreakIterator.hpp> +#include <com/sun/star/i18n/CharacterIteratorMode.hpp> +#include <com/sun/star/i18n/ScriptType.hpp> +#include <com/sun/star/i18n/WordType.hpp> +#include <o3tl/cppunittraitshelper.hxx> +#include <unotest/bootstrapfixturebase.hxx> + +#include <unicode/uvernum.h> + +#include <rtl/strbuf.hxx> + +#include <string.h> + +#include <stack> + +using namespace ::com::sun::star; + +class TestBreakIterator : public test::BootstrapFixtureBase +{ +public: + virtual void setUp() override; + virtual void tearDown() override; + + void testLineBreaking(); + void testWordBoundaries(); + void testGraphemeIteration(); + void testWeak(); + void testAsian(); + void testThai(); +#if (U_ICU_VERSION_MAJOR_NUM > 51) + void testLao(); +#ifdef TODO + void testNorthernThai(); + void testKhmer(); +#endif +#endif + void testJapanese(); + void testChinese(); + + CPPUNIT_TEST_SUITE(TestBreakIterator); + CPPUNIT_TEST(testLineBreaking); + CPPUNIT_TEST(testWordBoundaries); + CPPUNIT_TEST(testGraphemeIteration); + CPPUNIT_TEST(testWeak); + CPPUNIT_TEST(testAsian); + CPPUNIT_TEST(testThai); +#if (U_ICU_VERSION_MAJOR_NUM > 51) + CPPUNIT_TEST(testLao); +#ifdef TODO + CPPUNIT_TEST(testKhmer); + CPPUNIT_TEST(testNorthernThai); +#endif +#endif + CPPUNIT_TEST(testJapanese); + CPPUNIT_TEST(testChinese); + CPPUNIT_TEST_SUITE_END(); + +private: + uno::Reference<i18n::XBreakIterator> m_xBreak; 
+ void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak); +}; + +void TestBreakIterator::testLineBreaking() +{ + i18n::LineBreakHyphenationOptions aHyphOptions; + i18n::LineBreakUserOptions aUserOptions; + lang::Locale aLocale; + + //See https://bugs.libreoffice.org/show_bug.cgi?id=31271 + { + OUString aTest("(some text here)"); + + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //Here we want the line break to leave text here) on the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex); + } + + { + //Here we want the line break to leave "here)" on the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex); + } + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=49849 + { + const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD }; + OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1)); + OUString aTest(aWord + " " + aWord); + + aLocale.Language = "he"; + aLocale.Country = "IL"; + + { + //Here we want the line break to happen at the whitespace + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=17155 + { + OUString const aTest("foo /bar/baz"); + + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //Here we want the line break to leave /bar/ba clumped together on the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, 
strlen("foo /bar/ba"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=19716 + { + OUString aTest("aaa]aaa"); + + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //Here we want the line break to move the whole lot to the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex); + } + } + + //this is an example sequence from tdf92993-1.docx caught by the load crashtesting + { + const sal_Unicode WEIRD1[] = { 0xd83c, 0xdf56, 0xd83c, 0xdf57, 0xd83c, 0xdf46, + 0xd83c, 0xdf64, 0x2668, 0xfe0f, 0xd83c, 0xdfc6}; + + OUString aTest(WEIRD1, SAL_N_ELEMENTS(WEIRD1)); + + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //This must not assert/crash + (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions); + } + } + + //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197 + { + const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 0xac00, + 0xc0ac, 0xb294}; + OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL)); + + aLocale.Language = "ko"; + aLocale.Country = "KR"; + + { + i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex); + } + } +} + +//See https://bugs.libreoffice.org/show_bug.cgi?id=49629 +void TestBreakIterator::testWordBoundaries() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + i18n::Boundary aBounds; + + //See https://bz.apache.org/ooo/show_bug.cgi?id=11993 + { + OUString aTest("abcd ef 
ghi??? KLM"); + + CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD)); + aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4); + + CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD)); + CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD)); + + //next word + aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12); + + //previous word + aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7); + + CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD)); + aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12); + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD)); + CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD)); + aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=21907 + { + OUString aTest("b a?"); + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD)); + + 
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES)); + + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD)); + + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES)); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=14904 + { + const sal_Unicode TEST[] = + { + 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's', + ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't', + 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k', + 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ', + '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e', + 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o', + 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ', + 'S', 'p', 'a', 'n', 'i', 's', 'h' + }; + OUString aTest(TEST, SAL_N_ELEMENTS(TEST)); + + aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7); + + aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14); + + aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44); + + aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52); + + aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62); + + aBounds = 
m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71); + + aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92); + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=49629 + sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF }; + for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode) + { + //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary + for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i) + { +#if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2) + //Note the breakiterator test is known to fail on older icu + //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase. + if (aBreakTests[i] == 0x200B) + continue; +#endif + OUString aTest = "Word" + OUStringChar(aBreakTests[i]) + "Word"; + aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true); + switch (mode) + { + case i18n::WordType::ANY_WORD: + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4); + break; + case i18n::WordType::ANYWORD_IGNOREWHITESPACES: + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4); + break; + case i18n::WordType::DICTIONARY_WORD: + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4); + break; + case i18n::WordType::WORD_COUNT: + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4); + break; + } + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode)); + } + } + + sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB }; + for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode) + { + //make sure that in all cases isBeginWord and isEndWord 
matches getWordBoundary + for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i) + { + OUString aTest = "Word" + OUStringChar(aJoinTests[i]) + "Word"; + aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true); + switch (mode) + { + case i18n::WordType::ANY_WORD: + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9); + break; + case i18n::WordType::ANYWORD_IGNOREWHITESPACES: + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9); + break; + case i18n::WordType::DICTIONARY_WORD: + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9); + break; + case i18n::WordType::WORD_COUNT: + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9); + break; + } + + CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode)); + CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode)); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=13494 + { + const OUString aBase("xxAAxxBBxxCCxx"); + const sal_Unicode aTests[] = + { + '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*', + '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/', + '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|' + }; + + const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14}; + for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j) + { + OUString aTest = aBase.replace('x', aTests[j]); + sal_Int32 nPos = -1; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions)); + nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos); + ++i; + } + while (nPos < aTest.getLength()); + nPos = aTest.getLength(); + i = SAL_N_ELEMENTS(aDoublePositions)-1; + do + { + nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + --i; + CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos); + } + while (nPos > 0); + } + + const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10}; + for 
(size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j) + { + OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j])); + sal_Int32 nPos = -1; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions)); + nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos); + ++i; + } + while (nPos < aTest.getLength()); + nPos = aTest.getLength(); + i = SAL_N_ELEMENTS(aSinglePositions)-1; + do + { + nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + --i; + CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos); + } + while (nPos > 0); + } + + const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10}; + CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]); + { + OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0])); + sal_Int32 nPos = -1; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions)); + nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos); + ++i; + } + while (nPos < aTest.getLength()); + nPos = aTest.getLength(); + i = SAL_N_ELEMENTS(aSingleQuotePositions)-1; + do + { + nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + --i; + CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos); + } + while (nPos > 0); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=13451 + { + aLocale.Language = "ca"; + aLocale.Country = "ES"; + + OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!"); + + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true).endPos; + CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + ++i; + } + while (nPos++ < 
aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=85411 + for (int j = 0; j < 3; ++j) + { + switch (j) + { + case 0: + aLocale.Language = "en"; + aLocale.Country = "US"; + break; + case 1: + aLocale.Language = "ca"; + aLocale.Country = "ES"; + break; + case 2: + aLocale.Language = "fi"; + aLocale.Country = "FI"; + break; + default: + CPPUNIT_ASSERT(false); + break; + } + + const sal_Unicode TEST[] = + { + 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o' + }; + OUString aTest(TEST, SAL_N_ELEMENTS(TEST)); + + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {1, 6, 9, 12}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true).endPos; + CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + ++i; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + + //https://bz.apache.org/ooo/show_bug.cgi?id=21290 + for (int j = 0; j < 2; ++j) + { + switch (j) + { + case 0: + aLocale.Language = "en"; + aLocale.Country = "US"; + break; + case 1: + aLocale.Language = "grc"; + aLocale.Country.clear(); + break; + default: + CPPUNIT_ASSERT(false); + break; + } + + const sal_Unicode TEST[] = + { + 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00, + 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF, + 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24, + 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD + }; + OUString aTest(TEST, SAL_N_ELEMENTS(TEST)); + + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {5, 15, 19, 26}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true).endPos; + CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + ++i; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); 
+ } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=58513 + //See https://bugs.libreoffice.org/show_bug.cgi?id=55707 + { + aLocale.Language = "fi"; + aLocale.Country = "FI"; + + OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n"); + + { + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::WORD_COUNT, true).endPos; + CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + ++i; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + + { + sal_Int32 nPos = 0; + sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37, + 40, 41, 42, 43, 45, 46, 47, 50, 51}; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected)); + aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos); + ++i; + CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos); + ++i; + nPos = aBounds.endPos; + } + while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i); + } + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=107843 + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + const sal_Unicode TEST[] = + { + 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h' + }; + OUString aTest(TEST, SAL_N_ELEMENTS(TEST)); + + aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5); + + aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=113785 + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + const sal_Unicode TEST[] = + { + 'a', 0x2013, 'b', 0x2014, 'c' + }; + 
OUString aTest(TEST, SAL_N_ELEMENTS(TEST)); + + aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1); + + aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD); + CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3); + + aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD); + CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5); + } +} + +//See https://bugs.libreoffice.org/show_bug.cgi?id=40292 +//See https://bz.apache.org/ooo/show_bug.cgi?id=80412 +//See https://bz.apache.org/ooo/show_bug.cgi?id=111152 +//See https://bz.apache.org/ooo/show_bug.cgi?id=50172 +void TestBreakIterator::testGraphemeIteration() +{ + lang::Locale aLocale; + aLocale.Language = "bn"; + aLocale.Country = "IN"; + + { + const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF }; + OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA)); + + sal_Int32 nDone=0; + sal_Int32 nPos; + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(BA_HALANT_LA)), nPos); + nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + { + const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF }; + OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)); + + sal_Int32 nDone=0; + sal_Int32 nPos; + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)), nPos); + nPos = 
m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + { + const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF }; + OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)); + + sal_Int32 nDone=0; + sal_Int32 nPos; + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)), nPos); + nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + aLocale.Language = "ta"; + aLocale.Country = "IN"; + + { + const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 }; + OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA)); + + sal_Int32 nDone=0; + sal_Int32 nPos = 0; + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VIRAMA_SSA)), nPos); + nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + { + const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 }; + OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU)); + + sal_Int32 nDone=0; + sal_Int32 nPos = 0; + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", 
static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VOWELSIGNU)), nPos); + nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } + + { + const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] = + { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 }; + OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI, + SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI)); + + sal_Int32 nDone=0; + sal_Int32 nPos=0; + + for (sal_Int32 i = 0; i < 4; ++i) + { + sal_Int32 nOldPos = nPos; + nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos); + } + + for (sal_Int32 i = 0; i < 4; ++i) + { + sal_Int32 nOldPos = nPos; + nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos); + } + } + + { + const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 }; + OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS)); + + sal_Int32 nGraphemeCount = 0; + + sal_Int32 nCurPos = 0; + while (nCurPos < aText.getLength()) + { + sal_Int32 nCount2 = 1; + nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(), + i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2); + ++nGraphemeCount; + } + + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount); + } + + aLocale.Language = "hi"; + aLocale.Country = "IN"; + + { + const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 }; + OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII)); + + sal_Int32 nDone=0; + sal_Int32 nPos = 0; + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + 
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(SHA_VOWELSIGNII)), nPos); + nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos); + } +} + +//A test to ensure that certain ranges and codepoints that are categorized as +//weak remain as weak, so that existing docs that depend on this don't silently +//change font for those weak chars +void TestBreakIterator::testWeak() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + const sal_Unicode WEAKS[] = + { + 0x0001, 0x0002, + 0x0020, 0x00A0, + 0x0300, 0x036F, //Combining Diacritical Marks + 0x1AB0, 0x1AFF, //Combining Diacritical Marks Extended + 0x1DC0, 0x1DFF, //Combining Diacritical Marks Supplement + 0x20D0, 0x20FF, //Combining Diacritical Marks for Symbols + 0x2150, 0x215F, //Number Forms, fractions + 0x2160, 0x2180, //Number Forms, roman numerals + 0x2200, 0x22FF, //Mathematical Operators + 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A + 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B + 0x2A00, 0x2AFF, //Supplemental Mathematical Operators + 0x2100, 0x214F, //Letterlike Symbols + 0x2308, 0x230B, //Miscellaneous technical + 0x25A0, 0x25FF, //Geometric Shapes + 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows + }; + OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS)); + + for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i) + { + sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i); + OString aMsg = + "Char 0x" + + OString::number(static_cast<sal_Int32>(aWeaks[i]), 16) + + " should have been weak"; + CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(), + i18n::ScriptType::WEAK, nScript); + } + } +} + +//A test to ensure that certain ranges and codepoints that are categorized as +//asian remain as asian, so 
that existing docs that depend on this don't silently +//change font for those asian chars. +//See https://bugs.libreoffice.org/show_bug.cgi?id=38095 +void TestBreakIterator::testAsian() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + const sal_Unicode ASIANS[] = + { + //some typical CJK chars + 0x4E00, 0x62FF, + //The full HalfWidth and FullWidth block has historically been + //designated as taking the CJK font :-( + //HalfWidth and FullWidth forms of ASCII 0-9, categorized under + //UAX24 as "Common" i.e. by that logic WEAK + 0xFF10, 0xFF19, + //HalfWidth and FullWidth forms of ASCII A-z, categorized under + //UAX24 as "Latin", i.e. by that logic LATIN + 0xFF21, 0xFF5A + }; + OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS)); + + for (sal_Int32 i = 0; i < aAsians.getLength(); ++i) + { + sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i); + OString aMsg = + "Char 0x" + + OString::number(static_cast<sal_Int32>(aAsians[i]), 16) + + " should have been asian"; + CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(), + i18n::ScriptType::ASIAN, nScript); + } + } +} + +#if (U_ICU_VERSION_MAJOR_NUM > 51) +//A test to ensure that our Lao word boundary detection is useful +void TestBreakIterator::testLao() +{ + lang::Locale aLocale; + aLocale.Language = "lo"; + aLocale.Country = "LA"; + + const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a }; + OUString aTest(LAO, SAL_N_ELEMENTS(LAO)); + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); +} +#endif + +//A test to ensure that our 
thai word boundary detection is useful +void TestBreakIterator::testThai() +{ + lang::Locale aLocale; + aLocale.Language = "th"; + aLocale.Country = "TH"; + + //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html + { + const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A }; + OUString aTest(THAI, SAL_N_ELEMENTS(THAI)); + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_MESSAGE("Should skip full word", + aBounds.startPos == 0 && aBounds.endPos == aTest.getLength()); + } + + //See https://bz.apache.org/ooo/show_bug.cgi?id=29548 + //make sure forwards and back are consistent + { + const sal_Unicode THAI[] = + { + 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41, + 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34, + 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27, + 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41, + 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34, + 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27 + }; + OUString aTest(THAI, SAL_N_ELEMENTS(THAI)); + + std::stack<sal_Int32> aPositions; + sal_Int32 nPos = -1; + do + { + nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + aPositions.push(nPos); + } + while (nPos < aTest.getLength()); + nPos = aTest.getLength(); + CPPUNIT_ASSERT(!aPositions.empty()); + aPositions.pop(); + do + { + CPPUNIT_ASSERT(!aPositions.empty()); + nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos; + CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos); + aPositions.pop(); + } + while (nPos > 0); + } + + // tdf#113694 + { + const sal_Unicode NON_BMP[] = { 0xD800, 0xDC00 }; + OUString aTest(NON_BMP, SAL_N_ELEMENTS(NON_BMP)); + + sal_Int32 nDone=0; + sal_Int32 nPos; + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + 
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos); + nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale, + i18n::CharacterIteratorMode::SKIPCELL, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos); + + nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, + i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos); + nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale, + i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos); + } +} + +#ifdef TODO +void TestBreakIterator::testNorthernThai() +{ + lang::Locale aLocale; + aLocale.Language = "nod"; + aLocale.Country = "TH"; + + const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A }; + OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1)); + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_MESSAGE("Should skip full word", + aBounds.startPos == 0 && aBounds.endPos == aTest.getLength()); +} + +// Not sure if any version earlier than 49 did have Khmer word boundary +// dictionaries, 4.6 does not. 
+ +// As of icu 54, word boundary detection for Khmer is still considered +// insufficient, so icu khmer stuff is disabled + +//A test to ensure that our khmer word boundary detection is useful +//https://bugs.libreoffice.org/show_bug.cgi?id=52020 +void TestBreakIterator::testKhmer() +{ + lang::Locale aLocale; + aLocale.Language = "km"; + aLocale.Country = "KH"; + + const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 }; + + OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER)); + i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3); + + aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5); +} +#endif + +void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak) +{ + lang::Locale aLocale; + aLocale.Language = "ja"; + aLocale.Country = "JP"; + i18n::Boundary aBounds; + + { + const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 }; + + OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE)); + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7); + } + + { + const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 }; + + OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE)); + aBounds = xBreak->getWordBoundary(aTest, 1, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3); + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + + CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6); + } +} + +void TestBreakIterator::testJapanese() +{ + doTestJapanese(m_xBreak); + + // fdo#78479 - test second / cached instantiation of 
xdictionary + uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance( + "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW); + + doTestJapanese(xTmpBreak); +} + +void TestBreakIterator::testChinese() +{ + lang::Locale aLocale; + aLocale.Language = "zh"; + aLocale.Country = "CN"; + i18n::Boundary aBounds; + + { + const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB }; + + OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE)); + aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6); + } +} +void TestBreakIterator::setUp() +{ + BootstrapFixtureBase::setUp(); + m_xBreak.set(m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW); +} + +void TestBreakIterator::tearDown() +{ + m_xBreak.clear(); + BootstrapFixtureBase::tearDown(); +} + +CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator); + +CPPUNIT_PLUGIN_IMPLEMENT(); + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/qa/cppunit/test_characterclassification.cxx b/i18npool/qa/cppunit/test_characterclassification.cxx new file mode 100644 index 000000000..4af398440 --- /dev/null +++ b/i18npool/qa/cppunit/test_characterclassification.cxx @@ -0,0 +1,105 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include <com/sun/star/i18n/XCharacterClassification.hpp> +#include <unotest/bootstrapfixturebase.hxx> + +using namespace ::com::sun::star; + +class TestCharacterClassification : public test::BootstrapFixtureBase +{ +public: + virtual void setUp() override; + virtual void tearDown() override; + + void testTitleCase(); + void testStringType(); + + CPPUNIT_TEST_SUITE(TestCharacterClassification); + CPPUNIT_TEST(testTitleCase); + CPPUNIT_TEST(testStringType); + CPPUNIT_TEST_SUITE_END(); +private: + uno::Reference<i18n::XCharacterClassification> m_xCC; +}; + +//A test to ensure that our Title Case functionality is working +//http://lists.freedesktop.org/archives/libreoffice/2012-June/032767.html +//https://bz.apache.org/ooo/show_bug.cgi?id=30863 +void TestCharacterClassification::testTitleCase() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //basic example + OUString sTest("Some text"); + OUString sTitleCase = m_xCC->toTitle(sTest, 0, sTest.getLength(), aLocale); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be title", OUString("Some Text"), sTitleCase); + OUString sUpperCase = m_xCC->toUpper(sTest, 0, sTest.getLength(), aLocale); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be upper", OUString("SOME TEXT"), sUpperCase); + OUString sLowerCase = m_xCC->toLower(sTest, 0, sTest.getLength(), aLocale); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be lower ", OUString("some text"), sLowerCase); + } + + { + //tricky one + const sal_Unicode LATINSMALLLETTERDZ[] = { 0x01F3 }; + OUString aTest(LATINSMALLLETTERDZ, SAL_N_ELEMENTS(LATINSMALLLETTERDZ)); + OUString sTitleCase = m_xCC->toTitle(aTest, 0, aTest.getLength(), aLocale); + CPPUNIT_ASSERT_MESSAGE("Should be title", sTitleCase.getLength() == 1 && sTitleCase[0] == 0x01F2); + OUString sUpperCase = m_xCC->toUpper(aTest, 0, aTest.getLength(), aLocale); + CPPUNIT_ASSERT_MESSAGE("Should be upper", sUpperCase.getLength() == 1 && sUpperCase[0] == 0x01F1); + OUString sLowerCase = m_xCC->toLower(aTest, 0, 
aTest.getLength(), aLocale); + CPPUNIT_ASSERT_MESSAGE("Should be lower ", sLowerCase.getLength() == 1 && sLowerCase[0] == 0x01F3); + } +} + +//https://bugs.libreoffice.org/show_bug.cgi?id=69641 +void TestCharacterClassification::testStringType() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //simple case + OUString sTest("Some text"); + sal_Int32 nResult = m_xCC->getStringType(sTest, 0, sTest.getLength(), aLocale); + CPPUNIT_ASSERT_EQUAL(sal_Int32(230), nResult); + } + + { + //tricky case + const sal_Unicode MATHEMATICAL_ITALIC_SMALL_THETA[] = { 0xD835, 0xDF03 }; + OUString sTest(MATHEMATICAL_ITALIC_SMALL_THETA, SAL_N_ELEMENTS(MATHEMATICAL_ITALIC_SMALL_THETA)); + sal_Int32 nResult = m_xCC->getStringType(sTest, 0, sTest.getLength(), aLocale); + CPPUNIT_ASSERT_EQUAL(sal_Int32(228), nResult); + } + +} + +void TestCharacterClassification::setUp() +{ + BootstrapFixtureBase::setUp(); + m_xCC.set(m_xSFactory->createInstance("com.sun.star.i18n.CharacterClassification"), uno::UNO_QUERY_THROW); +} + +void TestCharacterClassification::tearDown() +{ + BootstrapFixtureBase::tearDown(); + m_xCC.clear(); +} + +CPPUNIT_TEST_SUITE_REGISTRATION(TestCharacterClassification); + +CPPUNIT_PLUGIN_IMPLEMENT(); + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/qa/cppunit/test_defaultnumberingprovider.cxx b/i18npool/qa/cppunit/test_defaultnumberingprovider.cxx new file mode 100644 index 000000000..dbe55050a --- /dev/null +++ b/i18npool/qa/cppunit/test_defaultnumberingprovider.cxx @@ -0,0 +1,131 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include <test/bootstrapfixture.hxx> + +#include <com/sun/star/style/NumberingType.hpp> +#include <com/sun/star/text/DefaultNumberingProvider.hpp> +#include <com/sun/star/text/XNumberingFormatter.hpp> + +#include <comphelper/propertyvalue.hxx> + +using namespace ::com::sun::star; + +/// i18npool defaultnumberingprovider tests. +class I18npoolDefaultnumberingproviderTest : public test::BootstrapFixture +{ +}; + +CPPUNIT_TEST_FIXTURE(I18npoolDefaultnumberingproviderTest, testArabicZero) +{ + // 1 -> "01" + uno::Reference<text::XNumberingFormatter> xFormatter( + text::DefaultNumberingProvider::create(mxComponentContext), uno::UNO_QUERY); + uno::Sequence<beans::PropertyValue> aProperties = { + comphelper::makePropertyValue("NumberingType", + static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO)), + comphelper::makePropertyValue("Value", static_cast<sal_Int32>(1)), + }; + lang::Locale aLocale; + OUString aActual = xFormatter->makeNumberingString(aProperties, aLocale); + // Without the accompanying fix in place, this test would have failed with a + // lang.IllegalArgumentException, support for ARABIC_ZERO was missing. 
+ CPPUNIT_ASSERT_EQUAL(OUString("01"), aActual); + + // 10 -> "10" + aProperties = { + comphelper::makePropertyValue("NumberingType", + static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO)), + comphelper::makePropertyValue("Value", static_cast<sal_Int32>(10)), + }; + aActual = xFormatter->makeNumberingString(aProperties, aLocale); + CPPUNIT_ASSERT_EQUAL(OUString("10"), aActual); +} + +CPPUNIT_TEST_FIXTURE(I18npoolDefaultnumberingproviderTest, testArabicZero3) +{ + // 10 -> "010" + uno::Reference<text::XNumberingFormatter> xFormatter( + text::DefaultNumberingProvider::create(mxComponentContext), uno::UNO_QUERY); + uno::Sequence<beans::PropertyValue> aProperties = { + comphelper::makePropertyValue("NumberingType", + static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO3)), + comphelper::makePropertyValue("Value", static_cast<sal_Int32>(10)), + }; + lang::Locale aLocale; + OUString aActual = xFormatter->makeNumberingString(aProperties, aLocale); + // Without the accompanying fix in place, this test would have failed with a + // lang.IllegalArgumentException, support for ARABIC_ZERO3 was missing. 
+ CPPUNIT_ASSERT_EQUAL(OUString("010"), aActual); + + // 100 -> "100" + aProperties = { + comphelper::makePropertyValue("NumberingType", + static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO3)), + comphelper::makePropertyValue("Value", static_cast<sal_Int32>(100)), + }; + aActual = xFormatter->makeNumberingString(aProperties, aLocale); + CPPUNIT_ASSERT_EQUAL(OUString("100"), aActual); +} + +CPPUNIT_TEST_FIXTURE(I18npoolDefaultnumberingproviderTest, testArabicZero4) +{ + // 100 -> "0100" + uno::Reference<text::XNumberingFormatter> xFormatter( + text::DefaultNumberingProvider::create(mxComponentContext), uno::UNO_QUERY); + uno::Sequence<beans::PropertyValue> aProperties = { + comphelper::makePropertyValue("NumberingType", + static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO4)), + comphelper::makePropertyValue("Value", static_cast<sal_Int32>(100)), + }; + lang::Locale aLocale; + OUString aActual = xFormatter->makeNumberingString(aProperties, aLocale); + // Without the accompanying fix in place, this test would have failed with a + // lang.IllegalArgumentException, support for ARABIC_ZERO4 was missing. 
+ CPPUNIT_ASSERT_EQUAL(OUString("0100"), aActual); + + // 1000 -> "1000" + aProperties = { + comphelper::makePropertyValue("NumberingType", + static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO4)), + comphelper::makePropertyValue("Value", static_cast<sal_Int32>(1000)), + }; + aActual = xFormatter->makeNumberingString(aProperties, aLocale); + CPPUNIT_ASSERT_EQUAL(OUString("1000"), aActual); +} + +CPPUNIT_TEST_FIXTURE(I18npoolDefaultnumberingproviderTest, testArabicZero5) +{ + // 1000 -> "01000" + uno::Reference<text::XNumberingFormatter> xFormatter( + text::DefaultNumberingProvider::create(mxComponentContext), uno::UNO_QUERY); + uno::Sequence<beans::PropertyValue> aProperties = { + comphelper::makePropertyValue("NumberingType", + static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO5)), + comphelper::makePropertyValue("Value", static_cast<sal_Int32>(1000)), + }; + lang::Locale aLocale; + OUString aActual = xFormatter->makeNumberingString(aProperties, aLocale); + // Without the accompanying fix in place, this test would have failed with a + // lang.IllegalArgumentException, support for ARABIC_ZERO5 was missing. + CPPUNIT_ASSERT_EQUAL(OUString("01000"), aActual); + + // 10000 -> "10000" + aProperties = { + comphelper::makePropertyValue("NumberingType", + static_cast<sal_uInt16>(style::NumberingType::ARABIC_ZERO5)), + comphelper::makePropertyValue("Value", static_cast<sal_Int32>(10000)), + }; + aActual = xFormatter->makeNumberingString(aProperties, aLocale); + CPPUNIT_ASSERT_EQUAL(OUString("10000"), aActual); +} + +CPPUNIT_PLUGIN_IMPLEMENT(); + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/qa/cppunit/test_ordinalsuffix.cxx b/i18npool/qa/cppunit/test_ordinalsuffix.cxx new file mode 100644 index 000000000..fb06a41fa --- /dev/null +++ b/i18npool/qa/cppunit/test_ordinalsuffix.cxx @@ -0,0 +1,100 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. 
+ * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include <algorithm> +#include <com/sun/star/i18n/XOrdinalSuffix.hpp> +#include <com/sun/star/lang/Locale.hpp> +#include <unotest/bootstrapfixturebase.hxx> + +using namespace com::sun::star; + +class TestOrdinalSuffix : public test::BootstrapFixtureBase +{ +private: + uno::Reference<i18n::XOrdinalSuffix> m_xOrdinal; + +public: + virtual void setUp() override; + virtual void tearDown() override; + + void testFrench(); + void testEnglish(); + + CPPUNIT_TEST_SUITE(TestOrdinalSuffix); + CPPUNIT_TEST(testFrench); + CPPUNIT_TEST(testEnglish); + CPPUNIT_TEST_SUITE_END(); +}; + +void TestOrdinalSuffix::setUp() +{ + BootstrapFixtureBase::setUp(); + m_xOrdinal.set(m_xSFactory->createInstance("com.sun.star.i18n.OrdinalSuffix"), uno::UNO_QUERY_THROW); +} + +void TestOrdinalSuffix::tearDown() +{ + m_xOrdinal.clear(); + BootstrapFixtureBase::tearDown(); +} + +void TestOrdinalSuffix::testFrench() +{ + lang::Locale aLocale("fr", "LU", ""); + uno::Sequence< OUString > aSuffixes; + OUString *pStart, *pEnd, *pFind; + + //1er + aSuffixes = m_xOrdinal->getOrdinalSuffix(1, aLocale); + pStart = aSuffixes.begin(); + pEnd = aSuffixes.end(); + pFind = std::find(pStart, pEnd, OUString("er")); + CPPUNIT_ASSERT(pFind != pEnd); + + //2e, 3e, etc. 
+ aSuffixes = m_xOrdinal->getOrdinalSuffix(2, aLocale); + pStart = aSuffixes.begin(); + pEnd = aSuffixes.end(); + pFind = std::find(pStart, pEnd, OUString("e")); + CPPUNIT_ASSERT(pFind != pEnd); +} + +void TestOrdinalSuffix::testEnglish() +{ + lang::Locale aLocale("en", "US", ""); + uno::Sequence< OUString > aSuffixes; + OUString *pStart, *pEnd, *pFind; + + //1st + aSuffixes = m_xOrdinal->getOrdinalSuffix(1, aLocale); + pStart = aSuffixes.begin(); + pEnd = aSuffixes.end(); + pFind = std::find(pStart, pEnd, OUString("st")); + CPPUNIT_ASSERT(pFind != pEnd); + + //2nd + aSuffixes = m_xOrdinal->getOrdinalSuffix(2, aLocale); + pStart = aSuffixes.begin(); + pEnd = aSuffixes.end(); + pFind = std::find(pStart, pEnd, OUString("nd")); + CPPUNIT_ASSERT(pFind != pEnd); + + //3rd + aSuffixes = m_xOrdinal->getOrdinalSuffix(3, aLocale); + pStart = aSuffixes.begin(); + pEnd = aSuffixes.end(); + pFind = std::find(pStart, pEnd, OUString("rd")); + CPPUNIT_ASSERT(pFind != pEnd); +} + + +CPPUNIT_TEST_SUITE_REGISTRATION( TestOrdinalSuffix ); + +CPPUNIT_PLUGIN_IMPLEMENT(); + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/qa/cppunit/test_textsearch.cxx b/i18npool/qa/cppunit/test_textsearch.cxx new file mode 100644 index 000000000..22ded53cd --- /dev/null +++ b/i18npool/qa/cppunit/test_textsearch.cxx @@ -0,0 +1,402 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. 
The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <com/sun/star/util/SearchFlags.hpp> +#include <com/sun/star/util/SearchOptions.hpp> +#include <com/sun/star/util/SearchAlgorithms2.hpp> +#include <com/sun/star/util/XTextSearch2.hpp> +#include <unotest/bootstrapfixturebase.hxx> +#include <i18nutil/transliteration.hxx> + +#include <unicode/regex.h> + +using namespace ::com::sun::star; +typedef U_ICU_NAMESPACE::UnicodeString IcuUniString; + +class TestTextSearch : public test::BootstrapFixtureBase +{ +public: + virtual void setUp() override; + virtual void tearDown() override; + + void testICU(); + void testSearches(); + void testWildcardSearch(); + void testApostropheSearch(); + + CPPUNIT_TEST_SUITE(TestTextSearch); + CPPUNIT_TEST(testICU); + CPPUNIT_TEST(testSearches); + CPPUNIT_TEST(testWildcardSearch); + CPPUNIT_TEST(testApostropheSearch); + CPPUNIT_TEST_SUITE_END(); +private: + uno::Reference<util::XTextSearch> m_xSearch; + uno::Reference<util::XTextSearch2> m_xSearch2; +}; + +// Sanity check our ICU first ... 
+void TestTextSearch::testICU() +{ + UErrorCode nErr = U_ZERO_ERROR; + sal_uInt32 nSearchFlags = UREGEX_UWORD | UREGEX_CASE_INSENSITIVE; + + OUString aString( "abcdefgh" ); + OUString aPattern( "e" ); + IcuUniString aSearchPat( reinterpret_cast<const UChar*>(aPattern.getStr()), aPattern.getLength() ); + + std::unique_ptr<icu::RegexMatcher> pRegexMatcher(new icu::RegexMatcher( aSearchPat, nSearchFlags, nErr )); + + IcuUniString aSource( reinterpret_cast<const UChar*>(aString.getStr()), aString.getLength() ); + pRegexMatcher->reset( aSource ); + + CPPUNIT_ASSERT( pRegexMatcher->find( 0, nErr ) ); + CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr ); + CPPUNIT_ASSERT_EQUAL( static_cast<int32_t>(4), pRegexMatcher->start( nErr ) ); + CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr ); + CPPUNIT_ASSERT_EQUAL( static_cast<int32_t>(5), pRegexMatcher->end( nErr ) ); + CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr ); + + OUString aString2( "acababaabcababadcdaa" ); + OUString aPattern2( "a" ); + + IcuUniString aSearchPat2( reinterpret_cast<const UChar*>(aPattern2.getStr()), aPattern2.getLength() ); + pRegexMatcher.reset(new icu::RegexMatcher( aSearchPat2, nSearchFlags, nErr )); + + IcuUniString aSource2( reinterpret_cast<const UChar*>(aString2.getStr()), aString2.getLength() ); + pRegexMatcher->reset( aSource2 ); + + CPPUNIT_ASSERT( pRegexMatcher->find( 0, nErr ) ); + CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr ); + CPPUNIT_ASSERT_EQUAL( static_cast<int32_t>(0), pRegexMatcher->start( nErr ) ); + CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr ); + CPPUNIT_ASSERT_EQUAL( static_cast<int32_t>(1), pRegexMatcher->end( nErr ) ); + CPPUNIT_ASSERT_EQUAL( U_ZERO_ERROR, nErr ); +} + +void TestTextSearch::testSearches() +{ + OUString str( "acababaabcababadcdaa" ); + sal_Int32 startPos = 2, endPos = 20 ; + OUString const searchStr( "(ab)*a(c|d)+" ); + sal_Int32 const fStartRes = 10, fEndRes = 18 ; + sal_Int32 const bStartRes = 18, bEndRes = 10 ; + + // set options + util::SearchOptions aOptions; + 
aOptions.algorithmType = util::SearchAlgorithms_REGEXP ; + aOptions.searchFlag = util::SearchFlags::ALL_IGNORE_CASE; + aOptions.searchString = searchStr; + m_xSearch->setOptions( aOptions ); + + util::SearchResult aRes; + + // search forward + aRes = m_xSearch->searchForward( str, startPos, endPos ); + CPPUNIT_ASSERT( aRes.subRegExpressions > 0 ); + CPPUNIT_ASSERT_EQUAL( fStartRes, aRes.startOffset[0] ); + CPPUNIT_ASSERT_EQUAL( fEndRes, aRes.endOffset[0] ); + + // search backwards + aRes = m_xSearch->searchBackward( str, endPos, startPos ); + CPPUNIT_ASSERT( aRes.subRegExpressions > 0 ); + CPPUNIT_ASSERT_EQUAL( bStartRes, aRes.startOffset[0] ); + CPPUNIT_ASSERT_EQUAL( bEndRes, aRes.endOffset[0] ); + + aOptions.transliterateFlags = static_cast<int>(TransliterationFlags::IGNORE_CASE + | TransliterationFlags::IGNORE_WIDTH); + aOptions.searchString = "([^ ]*)[ ]*([^ ]*)"; + m_xSearch->setOptions(aOptions); + aRes = m_xSearch->searchForward("11 22 33", 2, 7); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(3), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 2) && (aRes.endOffset[0] == 5)); + CPPUNIT_ASSERT((aRes.startOffset[1] == 2) && (aRes.endOffset[1] == 2)); + CPPUNIT_ASSERT((aRes.startOffset[2] == 3) && (aRes.endOffset[2] == 5)); +} + +void TestTextSearch::testWildcardSearch() +{ + util::SearchOptions2 aOptions; + OUString aText; + util::SearchResult aRes; + + aOptions.AlgorithmType2 = util::SearchAlgorithms2::WILDCARD ; + aOptions.WildcardEscapeCharacter = '~'; + // aOptions.searchFlag = ::css::util::SearchFlags::WILD_MATCH_SELECTION; + // is not set, so substring match is allowed. 
+ aOptions.transliterateFlags = sal_Int32(::css::i18n::TransliterationModules::TransliterationModules_IGNORE_CASE); + aText = "abAca"; + + aOptions.searchString = "a"; + m_xSearch2->setOptions2( aOptions ); + // match first "a", [0,1) + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 1)); + // match last "a", (5,4] + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 5) && (aRes.endOffset[0] == 4)); + + aOptions.searchString = "a?"; + m_xSearch2->setOptions2( aOptions ); + // match "ab", [0,2) + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 2)); + // match "ac", (4,2] + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 4) && (aRes.endOffset[0] == 2)); + + aOptions.searchString = "a*c"; + m_xSearch2->setOptions2( aOptions ); + // match "abac", [0,4) XXX NOTE: first match forward + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 4)); + // match "ac", (4,2] XXX NOTE: first match backward, not greedy + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 4) && (aRes.endOffset[0] == 2)); + + aOptions.searchString = "b*a"; + m_xSearch2->setOptions2( aOptions ); + // match "ba", [1,3) XXX NOTE: first match forward, 
not greedy + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 1) && (aRes.endOffset[0] == 3)); + // match "baca", (5,1] XXX NOTE: first match backward + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 5) && (aRes.endOffset[0] == 1)); + + aText = "ab?ca"; + + aOptions.searchString = "?~??"; + m_xSearch2->setOptions2( aOptions ); + // match "b?c", [1,4) + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 1) && (aRes.endOffset[0] == 4)); + // match "b?c", (4,1] + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 4) && (aRes.endOffset[0] == 1)); + + aText = "ab*ca"; + + aOptions.searchString = "?~*?"; + m_xSearch2->setOptions2( aOptions ); + // match "b?c", [1,4) + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 1) && (aRes.endOffset[0] == 4)); + // match "b?c", (4,1] + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 4) && (aRes.endOffset[0] == 1)); + + aOptions.searchString = "ca?"; + m_xSearch2->setOptions2( aOptions ); + // no match + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions); + // no match + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + 
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions); + + aOptions.searchString = "ca*"; + m_xSearch2->setOptions2( aOptions ); + // match "ca", [3,5) + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 3) && (aRes.endOffset[0] == 5)); + // match "ca", (5,3] + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 5) && (aRes.endOffset[0] == 3)); + + aOptions.searchString = "*ca*"; + m_xSearch2->setOptions2( aOptions ); + // match "abaca", [0,5) + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 5)); + // match "abaca", (5,0] + aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 5) && (aRes.endOffset[0] == 0)); + + aText = "123123"; + aOptions.searchString = "*2?"; + m_xSearch2->setOptions2( aOptions ); + // match first "123", [0,3) + aRes = m_xSearch2->searchForward( aText, 0, aText.getLength()); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions); + CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 3)); + // match "123123", (6,0] Yes this looks odd, but it is as searching "?2*" forward. 
+    aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+    CPPUNIT_ASSERT((aRes.startOffset[0] == 6) && (aRes.endOffset[0] == 0));
+
+    aOptions.searchFlag |= util::SearchFlags::WILD_MATCH_SELECTION;
+    m_xSearch2->setOptions2( aOptions );
+    // match "123123", [0,6) with greedy '*'
+    aRes = m_xSearch2->searchForward( aText, 0, aText.getLength());
+    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+    CPPUNIT_ASSERT((aRes.startOffset[0] == 0) && (aRes.endOffset[0] == 6));
+    // match "123123", (6,0]
+    aRes = m_xSearch2->searchBackward( aText, aText.getLength(), 0);
+    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(1), aRes.subRegExpressions);
+    CPPUNIT_ASSERT((aRes.startOffset[0] == 6) && (aRes.endOffset[0] == 0));
+}
+
+// Exercises m_xSearch with ASCII (') and typographic (U+2019) apostrophes in
+// both the search string and the searched text, in six scenarios (A-F).
+//
+// Every scenario searches forward and backward. For backward searches the
+// range is passed as (endPos, startPos) and the expected offsets are swapped
+// accordingly: a hit covering [2,3) is asserted as startOffset 2 / endOffset 3
+// going forward, and as startOffset 3 / endOffset 2 going backward.
+void TestTextSearch::testApostropheSearch()
+{
+    // A) find typographic apostrophes also by using ASCII apostrophe in searchString
+    OUString str( u"It\u2019s an apostrophe." );
+    // endPos is computed once here; it stays valid for scenarios A-D because
+    // those only swap single characters in str and never change its length.
+    sal_Int32 startPos = 0, endPos = str.getLength();
+
+    // set options
+    util::SearchOptions aOptions;
+    aOptions.algorithmType = util::SearchAlgorithms_ABSOLUTE;
+    aOptions.searchFlag = util::SearchFlags::ALL_IGNORE_CASE;
+    aOptions.searchString = "'";
+    m_xSearch->setOptions( aOptions );
+
+    util::SearchResult aRes;
+
+    // search forward
+    aRes = m_xSearch->searchForward( str, startPos, endPos );
+    // This was 0.
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.endOffset[0] );
+
+    // search backwards
+    aRes = m_xSearch->searchBackward( str, endPos, startPos );
+    // This was 0.
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.endOffset[0] );
+
+    // check with transliteration
+    // (same search string and text as above; only transliterateFlags is added,
+    // and the same offsets are expected)
+    aOptions.transliterateFlags = static_cast<int>(TransliterationFlags::IGNORE_CASE
+                                                   | TransliterationFlags::IGNORE_WIDTH);
+    m_xSearch->setOptions(aOptions);
+
+    // search forward
+    aRes = m_xSearch->searchForward( str, startPos, endPos );
+    // This was 0.
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.endOffset[0] );
+
+    // search backwards
+    aRes = m_xSearch->searchBackward( str, endPos, startPos );
+    // This was 0.
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.endOffset[0] );
+
+    // B) search ASCII apostrophe in a text with ASCII apostrophes
+    // (the options set above, including the transliteration flags, are still
+    // in effect; only the text changes)
+    str = str.replace(u'\u2019', '\'');
+
+    // search forward
+    aRes = m_xSearch->searchForward( str, startPos, endPos );
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.endOffset[0] );
+
+    // search backwards
+    aRes = m_xSearch->searchBackward( str, endPos, startPos );
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.endOffset[0] );
+
+    // C) search typographic apostrophe in a text with ASCII apostrophes (no result)
+    aOptions.searchString = OUString(u"\u2019");
+    m_xSearch->setOptions( aOptions );
+
+    // "no result" is asserted as subRegExpressions == 0, in both directions
+    aRes = m_xSearch->searchForward( str, startPos, endPos );
+    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions);
+
+    aRes = m_xSearch->searchBackward( str, endPos, startPos );
+    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aRes.subRegExpressions);
+
+    // D) search typographic apostrophe in a text with typographic apostrophes
+    // (search string is still U+2019 from scenario C; the text is switched back)
+    str = str.replace('\'', u'\u2019');
+
+    // search forward
+    aRes = m_xSearch->searchForward( str, startPos, endPos );
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.endOffset[0] );
+
+    // search backwards
+    aRes = m_xSearch->searchBackward( str, endPos, startPos );
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(3), aRes.startOffset[0] );
+    CPPUNIT_ASSERT_EQUAL( static_cast<sal_Int32>(2), aRes.endOffset[0] );
+
+    // E) search mixed apostrophes in a text with mixed apostrophes:
+    aOptions.searchString = OUString(u"'\u2019");
+    m_xSearch->setOptions( aOptions );
+    // str is replaced by a text of a different length, so from here on the
+    // range end is str.getLength() instead of the endPos computed at the top.
+    str = u"test: \u2019'";
+
+    // search forward
+    // (scenarios E and F only assert that a match is reported, not its offsets)
+    aRes = m_xSearch->searchForward( str, startPos, str.getLength());
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+
+    // search backwards
+    aRes = m_xSearch->searchBackward( str, str.getLength(), startPos );
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+
+    // F) search mixed apostrophes in a text with ASCII apostrophes:
+    str = u"test: ''";
+
+    // search forward
+    aRes = m_xSearch->searchForward( str, startPos, str.getLength());
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+
+    // search backwards
+    aRes = m_xSearch->searchBackward( str, str.getLength(), startPos );
+    CPPUNIT_ASSERT( aRes.subRegExpressions > 0 );
+}
+
+// Runs the base fixture's setUp first, then creates the two search services
+// the tests use: m_xSearch from "com.sun.star.util.TextSearch" and m_xSearch2
+// from "com.sun.star.util.TextSearch2". Both are queried with
+// uno::UNO_QUERY_THROW (presumably throwing if the interface is unavailable,
+// so a missing service fails here rather than in a test body -- confirm).
+void TestTextSearch::setUp()
+{
+    BootstrapFixtureBase::setUp();
+    m_xSearch.set(m_xSFactory->createInstance("com.sun.star.util.TextSearch"), uno::UNO_QUERY_THROW);
+    m_xSearch2.set(m_xSFactory->createInstance("com.sun.star.util.TextSearch2"), uno::UNO_QUERY_THROW);
+}
+
+// Clears both search service references, then runs the base fixture's
+// tearDown (the reverse order of setUp).
+void TestTextSearch::tearDown()
+{
+    m_xSearch.clear();
+    m_xSearch2.clear();
+    BootstrapFixtureBase::tearDown();
+}
+
+// Register the TestTextSearch suite so the CppUnit runner picks it up.
+CPPUNIT_TEST_SUITE_REGISTRATION(TestTextSearch);
+
+// Emit the CppUnit test plug-in boilerplate for this test library.
+CPPUNIT_PLUGIN_IMPLEMENT();
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |