From fb94cc0d1348140d03c2826771c57255ff74a94a Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Thu, 11 Apr 2024 16:42:39 -0600 Subject: [PATCH] tdf#49885 Reviewed BreakIterator customizations This change completes the review of BreakIterator rule customizations, and adds unit tests for relevant customizations. Change-Id: I06678fcccfc48d020aac64dd9f58ff36a763af30 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166017 Tested-by: Jenkins Reviewed-by: Eike Rathke --- i18npool/qa/cppunit/test_breakiterator.cxx | 559 +++++++++++++++++++ i18npool/source/breakiterator/data/README | 612 ++++----------------- 2 files changed, 668 insertions(+), 503 deletions(-) diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 0f2629fe05ec..b33466bee46d 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -31,6 +31,7 @@ public: void testLineBreaking(); void testWordBoundaries(); + void testSentenceBoundaries(); void testGraphemeIteration(); void testWeak(); void testAsian(); @@ -43,9 +44,18 @@ public: void testJapanese(); void testChinese(); + void testLegacyDictWordPrepostDash_de_DE(); + void testLegacyDictWordPrepostDash_nds_DE(); + void testLegacyDictWordPrepostDash_nl_NL(); + void testLegacyDictWordPrepostDash_sv_SE(); + void testLegacyHebrewQuoteInsideWord(); + void testLegacySurrogatePairs(); + void testLegacyWordCountCompat(); + CPPUNIT_TEST_SUITE(TestBreakIterator); CPPUNIT_TEST(testLineBreaking); CPPUNIT_TEST(testWordBoundaries); + CPPUNIT_TEST(testSentenceBoundaries); CPPUNIT_TEST(testGraphemeIteration); CPPUNIT_TEST(testWeak); CPPUNIT_TEST(testAsian); @@ -57,6 +67,13 @@ public: #endif CPPUNIT_TEST(testJapanese); CPPUNIT_TEST(testChinese); + CPPUNIT_TEST(testLegacyDictWordPrepostDash_de_DE); + CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE); + CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL); + CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE); + 
CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord); + CPPUNIT_TEST(testLegacySurrogatePairs); + CPPUNIT_TEST(testLegacyWordCountCompat); CPPUNIT_TEST_SUITE_END(); private: @@ -118,6 +135,173 @@ void TestBreakIterator::testLineBreaking() } } + // i#22602: writer breaks word after dot immediately followed by a letter + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + //Here we want the line break to leave ./bar/baz clumped together on the next line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "foo ./bar/baz", strlen("foo ./bar/ba"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period", + static_cast<sal_Int32>(4), aResult.breakIndex); + } + } + + // i#81448: slash and backslash make non-breaking spaces of preceding spaces + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + // Per the bug, the line break should leave ...BE clumped together on the next line. + // However, the current behavior does not wrap the string at all. This test asserts the + // current behavior as a point of reference. + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "THIS... ...BE", strlen("THIS... ...B"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aResult.breakIndex); + } + } + + // i#81448: slash and backslash make non-breaking spaces of preceding spaces + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + // The line break should leave /BE clumped together on the next line. + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "THIS... /BE", strlen("THIS... /B"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(8), aResult.breakIndex); + } + } + + // i#80548: Bad word wrap between dash and word + { + aLocale.Language = "fi"; + aLocale.Country = "FI"; + + { + // Per the bug, the line break should leave -bar clumped together on the next line. + // However, this change was reverted at some point. 
This test asserts the new behavior. + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "foo -bar", strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash", + static_cast<sal_Int32>(5), aResult.breakIndex); + } + } + + // i#80645: Line erroneously breaks at backslash + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + // Here we want the line break to leave C:\Program Files\ on the first line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "C:\\Program Files\\LibreOffice", strlen("C:\\Program Files\\Libre"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); + } + } + + // i#80841: Words separated by hyphens will always break to next line + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + // Here we want the line break to leave toll- on the first line + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "toll-free", strlen("toll-fr"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); + } + } + + // i#83464: Line break between letter and $ + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + // Here we want the line break to leave US$ clumped on the next line. + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "word US$ 123", strlen("word U"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); + } + } + + // Unknown bug number: "fix line break problem of dot after letter and before number" + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + // Here we want the line break to leave L.5 clumped on the next line. 
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "word L.5 word", strlen("word L"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); + } + } + + // i#83229: Wrong line break when word contains a hyphen + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + { + // Here we want the line break to leave 100- clumped on the first line. + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "word 100-199 word", strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex); + } + } + + // i#83649: Line break should be between typographical quote and left bracket + { + aLocale.Language = "de"; + aLocale.Country = "DE"; + + { + // Here we want the line break to leave »angetan werden« on the first line + const OUString str = u"»angetan werden« [Passiv]"_ustr; + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); + } + } + + // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters + { + aLocale.Language = "zh"; + aLocale.Country = "HK"; + + { + // Per the bug, this should break at the ideographic comma. However, this change has + // been reverted at some point. This test only verifies current behavior. + const OUString str = u"word word、word word"_ustr; + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult.breakIndex); + } + } + + // i#80891: Character in the forbidden list sometimes appears at the start of line + { + aLocale.Language = "zh"; + aLocale.Country = "HK"; + + { + // Per the bug, the ideographic two-dot leader should be a forbidden character. However, + // this change seems to have been reverted or broken at some point. 
+ const OUString str = u"電話︰電話"_ustr; + i18n::LineBreakResults aResult + = m_xBreak->getLineBreak(str, 2, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aResult.breakIndex); + } + } + //See https://bz.apache.org/ooo/show_bug.cgi?id=19716 { aLocale.Language = "en"; @@ -160,6 +344,20 @@ void TestBreakIterator::testLineBreaking() CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex); } } + + // i#65267: Comma is badly broken at end of line + // - The word should be wrapped along with the comma + { + aLocale.Language = "de"; + aLocale.Country = "DE"; + + { + auto res = m_xBreak->getLineBreak("Wort -prinzessinnen, wort", + strlen("Wort -prinzessinnen,"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex); + } + } } //See https://bugs.libreoffice.org/show_bug.cgi?id=49629 @@ -601,6 +799,174 @@ void TestBreakIterator::testWordBoundaries() CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos); CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); } + + // i#55778: Words containing numbers get broken up + { + aLocale.Language = "en"; + aLocale.Country = "US"; + + static constexpr OUString aTest = u"first i18n third"_ustr; + + aBounds + = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.endPos); + } + + // i#56347: "BreakIterator patch for Hungarian" + // Rules for Hungarian affixes after numbers and certain symbols + { + auto mode = i18n::WordType::DICTIONARY_WORD; + aLocale.Language = "hu"; + aLocale.Country = "HU"; + + OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; + + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = 
m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); + } + + // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian) + // Rules for Hungarian affixes after numbers and certain symbols in edit mode. + // The patch was merged, but the original bug was never closed and the current behavior seems + // identical to the ICU default behavior. Added this test to ensure that doesn't change. 
+ { + auto mode = i18n::WordType::ANY_WORD; + aLocale.Language = "hu"; + aLocale.Country = "HU"; + + OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; + + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(21), 
aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); + } +} + +void TestBreakIterator::testSentenceBoundaries() +{ + lang::Locale aLocale; + aLocale.Language = "en"; + aLocale.Country = "US"; + + // Trivial characteristic test for sentence boundary detection + { + OUString aTest("This is a sentence. 
This is a different sentence."); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 5, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 5, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 31, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 31, aLocale)); + } + + // i#24098: i18n API beginOfSentence/endOfSentence + // fix beginOfSentence, ... when cursor is on the beginning of the sentence + { + OUString aTest("This is a sentence. This is a different sentence."); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 20, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 20, aLocale)); + } + + // i#24098: i18n API beginOfSentence/endOfSentence + // "skip preceding space for beginOfSentence" + { + OUString aTest("This is a sentence.     This is a different sentence."); + + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 20, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 20, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale)); + } } //See https://bugs.libreoffice.org/show_bug.cgi?id=40292 @@ -1043,6 +1409,199 @@ void TestBreakIterator::testChinese() CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } } + +void TestBreakIterator::testLegacyDictWordPrepostDash_de_DE() +{ + lang::Locale aLocale; + aLocale.Language = "de"; + aLocale.Country = "DE"; + + { + auto aTest = u"Arbeits- -nehmer"_ustr; + + i18n::Boundary aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); + } +} + +void TestBreakIterator::testLegacyDictWordPrepostDash_nds_DE() +{ + lang::Locale aLocale; + aLocale.Language = "nds"; + aLocale.Country = "DE"; + + { + auto aTest = u"Arbeits- -nehmer"_ustr; + + i18n::Boundary aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); + } +} + +void TestBreakIterator::testLegacyDictWordPrepostDash_nl_NL() +{ + lang::Locale aLocale; + aLocale.Language = "nl"; + aLocale.Country = "NL"; + + { + auto aTest = u"Arbeits- -nehmer"_ustr; + + i18n::Boundary aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); + } +} + +void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE() +{ + lang::Locale aLocale; + aLocale.Language = "sv"; + aLocale.Country = "SE"; + + { + auto aTest = u"Arbeits- -nehmer"_ustr; + + i18n::Boundary aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); + } +} + +void TestBreakIterator::testLegacyHebrewQuoteInsideWord() +{ + lang::Locale aLocale; + + aLocale.Language = "he"; + aLocale.Country = "IL"; + + { + auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; + + i18n::Boundary aBounds + = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + } +} + +void TestBreakIterator::testLegacySurrogatePairs() +{ + lang::Locale aLocale; + + aLocale.Language = "ja"; + aLocale.Country = "JP"; + + // i#75632: [surrogate pair] Japanese word break does not work properly for surrogate pairs. + // and many others to address bugs: i#75631 i#75633 i#75412 etc. + // + // BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test. 
+ { + const sal_Unicode buf[] = { u"X 𠮟 X" }; + OUString aTest(buf, SAL_N_ELEMENTS(buf)); + + auto aBounds + = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } +} + +void TestBreakIterator::testLegacyWordCountCompat() +{ + lang::Locale aLocale; + + aLocale.Language = "en"; + aLocale.Country = "US"; + + // i#80815: "Word count differs from MS Word" + // This is a characteristic test for word count using test data from the linked bug. 
+ { + const OUString str = u"" + "test data for word count issue #80815\n" + "fo\\\'sforos\n" + "archipi\\\'elago\n" + "do\\^me\n" + "f**k\n" + "\n" + "battery-driven\n" + "and/or\n" + "apple(s)\n" + "money+opportunity\n" + "Micro$oft\n" + "\n" + "300$\n" + "I(not you)\n" + "a****n\n" + "1+3=4\n" + "\n" + "aaaaaaa.aaaaaaa\n" + "aaaaaaa,aaaaaaa\n" + "aaaaaaa;aaaaaaa\n"_ustr; + + int num_words = 0; + sal_Int32 next_pos = 0; + int iter_guard = 0; + while (true) + { + CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100); + + auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT); + + if (aBounds.endPos < next_pos) + { + break; + } + + next_pos = aBounds.endPos; + ++num_words; + } + + CPPUNIT_ASSERT_EQUAL(23, num_words); + } +} + void TestBreakIterator::setUp() { BootstrapFixtureBase::setUp(); diff --git a/i18npool/source/breakiterator/data/README b/i18npool/source/breakiterator/data/README index 6246b80ae77f..76e3e37c3faf 100644 --- a/i18npool/source/breakiterator/data/README +++ b/i18npool/source/breakiterator/data/README @@ -9,411 +9,108 @@ At various stages these copies have been customized and are now horribly out of sync. It unclear which diffs from the base versions are deliberate and which are now accidental :-( -We need to review the various issues referenced in the commits that caused -customizations and see if they're still relevant or not, write regression tests -for them, if any are still relevant then apply the changes back on top of the -latest versions. +The various issues and customizations have been reviewed, with tests written for +customizations that are still relevant. However, these files are still extremely +out-of-date and need to be refreshed. Relevant customizations should be reapplied +on top of a current version. 
-to-review, later are ok: - -commit e1ad946ef5db3f7c0a540207d0f0fd85799e3b66 -Author: Release Engineers -Date: Thu Aug 6 18:13:57 2009 +0000 - - CWS-TOOLING: integrate CWS tl73 - 2009-07-31 15:29:33 +0200 tl r274535 : #i64400# dash/hyphen should not break words - -commit 9964a76ef58786bba47d409970512d7ded6c8889 -Author: Rüdiger Timm -Date: Wed Jul 2 07:53:05 2008 +0000 - - INTEGRATION: CWS i18n41 (1.1.2); FILE ADDED - 2008/04/25 17:06:26 khong 1.1.2.3: i55063, make period a sentence delimiter - 2008/04/25 06:40:50 khong 1.1.2.2: i55063, make space as Thai sentence delimiter - 2008/04/24 03:19:10 khong 1.1.2.1: i55063, set Thai letters as sentence delimiter for Thai and English mixed text - -commit e4a6e4284dae1ca6fbfa7d1e43690dbf87d796cd -Author: Rüdiger Timm -Date: Wed Jul 2 07:52:44 2008 +0000 - - INTEGRATION: CWS i18n41 (1.9.12); FILE MERGED - 2008/06/17 20:22:30 khong 1.9.12.2: i83229 fix the problem of leading hyphen for numbers - 2008/04/23 06:20:16 khong 1.9.12.1: i72868, i80891, i83229, fix Chinese punctuations and hyphen for line breakiterator - -commit 55dff22611659a1567c968fbf9e512a2765ab62e -Author: Rüdiger Timm -Date: Wed Jul 2 07:52:07 2008 +0000 - - INTEGRATION: CWS i18n41 (1.33.36); FILE MERGED - 2008/06/05 22:18:29 khong 1.33.36.2: RESYNC: (1.33-1.35); FILE MERGED - 2008/04/23 06:11:55 khong 1.33.36.1: i55063, enable language specific sentence breakiterator - -commit 1c2b8095631a3c2d2f396bf50a8f0c62f49be65c -Author: Rüdiger Timm -Date: Wed Jul 2 07:51:12 2008 +0000 - - INTEGRATION: CWS i18n41 (1.12.140); FILE MERGED - 2008/06/05 22:18:26 khong 1.12.140.2: RESYNC: (1.12-1.13); FILE MERGED - 2008/04/23 06:04:53 khong 1.12.140.1: i87530 avoid breaking line before un-completed cell - -commit 9bbdb52df370c69c0f7eba387a2068ee80bd7994 -Author: Rüdiger Timm -Date: Wed Jul 2 07:50:43 2008 +0000 - - INTEGRATION: CWS i18n41 (1.25.2); FILE MERGED - 2008/06/05 22:18:23 khong 1.25.2.2: RESYNC: (1.25-1.26); FILE MERGED - 2008/04/23 06:09:02 khong 1.25.2.1: 
i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts - -commit 8dcdd3ca268f78295731b86797c2b8cd447ba667 -Author: Kurt Zenker -Date: Tue May 20 13:36:01 2008 +0000 - - INTEGRATION: CWS i18n43_DEV300 (1.33.38); FILE MERGED - 2008/04/29 21:51:51 khong 1.33.38.1: #i88411# apply the patch from Coleman Kane to fix icu setBreakType issue - -commit bedef98c24ef9ada6aaffe9bc5284d9759a31a9a -Author: Kurt Zenker -Date: Wed Apr 2 08:49:09 2008 +0000 - - INTEGRATION: CWS i18n40 (1.2.314); FILE MERGED - 2008/03/19 06:30:23 khong 1.2.314.2: #i80815# count dash like MS Word - 2008/03/15 07:32:44 khong 1.2.314.1: #i80815# count punctuation as word - -commit 59144104b3f91a2e6ed816f0bde0fdb91ea218d7 -Author: Kurt Zenker -Date: Wed Apr 2 08:48:53 2008 +0000 - - INTEGRATION: CWS i18n40 (1.24.44); FILE MERGED - 2008/03/19 18:56:42 khong 1.24.44.2: i80815 make word count feature like MS Word - 2008/03/15 07:31:38 khong 1.24.44.1: #i80815# count punctuation as word - -commit 3f0b51776602c45e8aca991450fcbb30f2484ae5 -Author: Vladimir Glazounov -Date: Mon Jan 28 14:33:46 2008 +0000 - - INTEGRATION: CWS i18n39 (1.8.4); FILE MERGED - 2007/12/12 17:45:45 khong 1.8.4.3: b6634800# fix line break problem of dot after letter and before number - 2007/12/08 01:05:52 khong 1.8.4.2: #i83649# fixed the problem of line break between quotation mark and open bracket - 2007/12/07 23:44:30 khong 1.8.4.1: #i83464# fix the problem of line break between letter and 1326 - -commit 5d8ef209b1f63d1c8ea5014bdbef96660b355423 -Author: Vladimir Glazounov -Date: Tue Oct 23 08:09:00 2007 +0000 - - INTEGRATION: CWS i18n38 (1.7.4); FILE MERGED - 2007/09/19 00:08:04 khong 1.7.4.3: i81448 fixed dot line break issue - 2007/09/10 23:57:12 khong 1.7.4.2: i81440 fix the problem of line break on punctuations - 2007/09/10 22:55:46 khong 1.7.4.1: i81448 fix problem of line break on symbols - -commit a2f3b48cacfcef338ca5e37acde34c83876e082e -Author: Vladimir Glazounov -Date: Tue Oct 23 08:08:47 
2007 +0000 - - INTEGRATION: CWS i18n38 (1.32.10); FILE MERGED - 2007/09/18 20:32:39 khong 1.32.10.1: i81519 set break type icu breakiterator - -commit 1967d8fb182b3101dee4f715e78be384400bc1e8 -Author: Kurt Zenker -Date: Wed Sep 5 16:37:28 2007 +0000 - - INTEGRATION: CWS i18n37 (1.22.6); FILE MERGED - 2007/09/03 18:27:39 khong 1.22.6.2: i8132 fixed a problem in skipping space for word breakiterator - 2007/08/31 21:30:30 khong 1.22.6.1: i81158 fix skipping space problem - -commit d2c2baf1a31d281d20e8b4d4c806dda027b2d5a3 -Author: Vladimir Glazounov -Date: Tue Aug 28 11:46:45 2007 +0000 - - INTEGRATION: CWS i18n36_SRC680 (1.5.20.1.2); FILE MERGED - 2007/08/22 17:12:36 khong 1.5.20.1.2.1: i80841 fix hyphen line break problem - -commit d56bedfb425cf77f176f143455e4a9fb6ce65540 -Author: Vladimir Glazounov -Date: Tue Aug 28 11:46:34 2007 +0000 - - INTEGRATION: CWS i18n36_SRC680 (1.21.2.1.2); FILE MERGED - 2007/08/22 20:02:28 khong 1.21.2.1.2.2: i80923 fix infinite loop problem - 2007/08/22 17:11:44 khong 1.21.2.1.2.1: i80923 fix a infinite loop - -commit 8a36b196925a5561eabde0a0ef293c73fcb5add3 -Author: Ivo Hinkelmann -Date: Fri Aug 17 13:58:48 2007 +0000 - - INTEGRATION: CWS i18n34 (1.5.22); FILE MERGED - 2007/08/13 22:26:12 khong 1.5.22.1: i80548 i80645 fix dash and backslash issues in line breakiterator - -commit c00b2b49bad765144f90552139e63d87d520d1cf -Author: Ivo Hinkelmann -Date: Fri Aug 17 13:58:36 2007 +0000 - - INTEGRATION: CWS i18n34 (1.15.4); FILE MERGED - 2007/08/13 22:33:38 khong 1.15.4.1: i86439 fix surrogate characters handling issues - -commit 3fc5fbc71d4c244d7c8002aa530481741e585bd4 -Author: Ivo Hinkelmann -Date: Fri Aug 17 13:58:23 2007 +0000 - - INTEGRATION: CWS i18n34 (1.31.4); FILE MERGED - 2007/08/13 22:33:37 khong 1.31.4.1: i86439 fix surrogate characters handling issues - -commit ee44b43881e7c82c379931f111c452a477b73341 -Author: Ivo Hinkelmann -Date: Fri Aug 17 13:58:11 2007 +0000 - - INTEGRATION: CWS i18n34 (1.21.4); FILE MERGED - 2007/08/14 
08:38:53 khong 1.21.4.2: i86439 fix surrogate characters handling issues - 2007/08/13 22:33:37 khong 1.21.4.1: i86439 fix surrogate characters handling issues - -commit f47369dbbc385f8968ad43e43cba293a29a4c2df -Author: Jens-Heiner Rechtien -Date: Tue Jul 31 16:09:13 2007 +0000 - - INTEGRATION: CWS i18n32 (1.29.14); FILE MERGED - 2007/07/24 20:39:44 khong 1.29.14.1: #i79148# fix a local word breakiterator rules loading issue - -commit 2791553b4e3fc5e04b96d0b2fd119d9fba1946bc -Author: Rüdiger Timm -Date: Thu Jul 26 08:08:51 2007 +0000 - - INTEGRATION: CWS i18n31 (1.14.60); FILE MERGED - 2007/07/16 22:18:44 khong 1.14.60.4: i75631 i75632 i75633 i75412 handle surrogate pair characters - 2007/07/13 20:37:32 khong 1.14.60.3: #i75632# use ICU characters properties - 2007/07/04 01:17:22 khong 1.14.60.2: i75631 i75632 i75633 i75412 handle surrogate pair characters - 2007/06/27 04:33:11 khong 1.14.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters - -commit 1c79a2bf1e89ac4eb409922ab7eb8ad3cacc688a -Author: Rüdiger Timm -Date: Thu Jul 26 08:08:39 2007 +0000 - - INTEGRATION: CWS i18n31 (1.8.60); FILE MERGED - 2007/06/27 04:33:11 khong 1.8.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters - -commit 517bbaddbaf81a5a6bb00979944cad13a1575d50 -Author: Rüdiger Timm -Date: Thu Jul 26 08:08:27 2007 +0000 - - INTEGRATION: CWS i18n31 (1.28.14); FILE MERGED - 2007/07/13 20:37:32 khong 1.28.14.5: #i75632# use ICU characters properties - 2007/07/04 01:17:22 khong 1.28.14.4: i75631 i75632 i75633 i75412 handle surrogate pair characters - 2007/06/27 23:25:58 khong 1.28.14.3: i75412 handle surrogate pair characters - 2007/06/27 05:33:20 khong 1.28.14.2: RESYNC: (1.28-1.29); FILE MERGED - 2007/06/27 04:33:11 khong 1.28.14.1: i75631 i75632 i75633 i75412 handle surrogate pair characters - -commit 0154e3492f2527535c0d648274e7ff674674318b -Author: Rüdiger Timm -Date: Thu Jul 26 08:08:14 2007 +0000 - - INTEGRATION: CWS i18n31 (1.14.42); FILE MERGED - 2007/06/27 
05:33:03 khong 1.14.42.2: RESYNC: (1.14-1.15); FILE MERGED - 2007/06/27 04:33:11 khong 1.14.42.1: i75631 i75632 i75633 i75412 handle surrogate pair characters - -commit e2a5a2532ee187669980adb7bfa747c7803c330a -Author: Rüdiger Timm -Date: Thu Jul 26 08:08:02 2007 +0000 - - INTEGRATION: CWS i18n31 (1.19.60); FILE MERGED - 2007/07/13 20:37:32 khong 1.19.60.4: #i75632# use ICU characters properties - 2007/07/04 01:17:22 khong 1.19.60.3: i75631 i75632 i75633 i75412 handle surrogate pair characters - 2007/06/27 05:00:48 khong 1.19.60.2: i75231 handle surrogate pair characters - 2007/06/27 04:33:11 khong 1.19.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters - -commit 80a26a7d4720b5b8cfa0acc624b28014c96d9948 -Author: Jens-Heiner Rechtien -Date: Tue Jun 26 16:41:02 2007 +0000 - - INTEGRATION: CWS ause081 (1.2.332); FILE MERGED - 2007/06/21 10:53:19 hjs 1.2.332.1: #i78393# remove component_getDescriptionFunc from exports - -commit c2801db6b04bf6f0dbb07727c91b2c66e7e027b8 -Author: Ivo Hinkelmann -Date: Wed Jun 6 11:17:38 2007 +0000 - - INTEGRATION: CWS i18n30 (1.4.24); FILE MERGED - 2007/05/08 21:32:18 khong 1.4.24.1: #i73903# update line breakiterator rule to icu3.6 style - -commit ea290668f78475c3b277c9e44bf5622ccb4dcec8 -Author: Ivo Hinkelmann -Date: Wed Jun 6 11:17:25 2007 +0000 - - INTEGRATION: CWS i18n30 (1.28.4); FILE MERGED - 2007/05/08 21:47:00 khong 1.28.4.3: #i75412# remove fix from cws i18n30, move it to other cws to fix with other Japanese surrogate issues - 2007/03/20 18:39:58 khong 1.28.4.2: #i72589# fixed BS problem for surrogate characters - 2007/03/13 19:11:44 khong 1.28.4.1: #i75319# fixed ANY_WORD rule loading problem - -commit b6308a6e322fd4eaa7845793beb70900624f351c -Author: Ivo Hinkelmann -Date: Wed Jun 6 11:17:12 2007 +0000 - - INTEGRATION: CWS i18n30 (1.14.32); FILE MERGED - 2007/05/08 21:44:15 khong 1.14.32.1: #i76706# fix infinite loop for CJK word breakiterator for text mixed with Latin and CJK characters - -commit 
e068e0e9aa9405ea4016ad19e9a963129adfed79 -Author: Rüdiger Timm -Date: Thu Jan 25 08:35:42 2007 +0000 - - INTEGRATION: CWS i18n28 (1.1.2); FILE ADDED - 2006/12/06 05:52:39 khong 1.1.2.1: #i64400# add an optional breakiterator entry in localedata - -commit 8d6f35a46085bb420e8896505504b376d17b842a -Author: Rüdiger Timm -Date: Thu Jan 25 08:35:31 2007 +0000 - - INTEGRATION: CWS i18n28 (1.24.36); FILE MERGED - 2006/12/19 17:27:58 khong 1.24.36.2: RESYNC: (1.24-1.25); FILE MERGED - 2006/12/06 05:52:38 khong 1.24.36.1: #i64400# add an optional breakiterator entry in localedata - -commit 633d34fa33330339ab6795ce3703477216e0062e -Author: Kurt Zenker -Date: Tue Dec 12 15:14:36 2006 +0000 - - INTEGRATION: CWS icuupgrade (1.9.24); FILE MERGED - 2006/10/11 06:11:11 khong 1.9.24.4: RESYNC: (1.10-1.11); FILE MERGED - 2006/07/07 10:57:40 hdu 1.9.24.3: RESYNC: (1.9-1.10); FILE MERGED - 2006/06/30 01:31:40 khong 1.9.24.2: #i53388# upgrade icu to 3.4.1 - 2006/06/15 19:16:55 khong 1.9.24.1: #i60645# upgrade icu to 3.4.1 - -commit 5d46dabe95271c846601a2575d3304fd5b4b24f1 -Author: Kurt Zenker -Date: Tue Dec 12 15:14:05 2006 +0000 - - INTEGRATION: CWS icuupgrade (1.22.20); FILE MERGED - 2006/11/11 07:12:47 khong 1.22.20.6: #142664# fix breakiterator crash problem - 2006/10/11 06:10:51 khong 1.22.20.5: RESYNC: (1.23-1.24); FILE MERGED - 2006/09/06 01:00:31 khong 1.22.20.4: #i60645# upgrade to icu 3.6 - 2006/07/07 10:57:32 hdu 1.22.20.3: RESYNC: (1.22-1.23); FILE MERGED - 2006/06/30 01:31:40 khong 1.22.20.2: #i53388# upgrade icu to 3.4.1 - 2006/06/20 14:27:26 hdu 1.22.20.1: #i60645# fix crash when udata_open failed - -commit 7431d816cdfc47b08978c0afd1f6503644bb11b8 -Author: Kurt Zenker -Date: Mon Nov 6 13:40:05 2006 +0000 - - INTEGRATION: CWS i18n27 (1.3.142); FILE MERGED - 2006/10/10 21:10:57 khong 1.3.142.1: #i65267# fix line break rule - -commit d7471e1462ffd9baeb3449eb86ccbb649e32b233 -Author: Kurt Zenker -Date: Mon Nov 6 13:39:52 2006 +0000 - - INTEGRATION: CWS i18n27 (1.1.2); FILE 
ADDED - 2006/10/10 21:08:55 khong 1.1.2.1: #i56348# add Hungarian word break rule for edit mode - -commit 1b65b0b886e2cb16382bc11770230fb6a140f33b -Author: Jens-Heiner Rechtien -Date: Tue Oct 24 12:53:13 2006 +0000 - - INTEGRATION: CWS tl29 (1.12.24); FILE MERGED - 2006/09/20 01:24:53 khong 1.12.24.1: #i69482# fixed mismatch of nextWord and getWordBoundary - -commit 97d89862a2285071202cc8010d888ffcbf96279a -Author: Jens-Heiner Rechtien -Date: Thu Nov 17 19:30:35 2005 +0000 - - INTEGRATION: CWS i18n23 (1.20.22); FILE MERGED - 2005/11/17 20:00:37 khong 1.20.22.3: RESYNC: (1.20-1.21); FILE MERGED - 2005/11/17 19:45:05 khong 1.20.22.2: #i57866# merge cws i18n23 and thaiissues - 2005/11/15 21:10:24 khong 1.20.22.1: #i57866# fix line breakiterator problem - -commit 05fadde6f025bcaafca4f3093e88be3cc1bb6836 -Author: Oliver Bolte -Date: Wed Nov 16 09:18:37 2005 +0000 - - INTEGRATION: CWS thaiissues (1.20.6); FILE MERGED - 2005/10/26 20:42:40 khong 1.20.6.2: use icu thai linke break algorithm for thai breakiterator - 2005/10/26 13:36:24 fme 1.20.6.1: #i55716# Handling of WORDJOINER - -commit a10b0e70c641d7438c557ef718c6942b3abffaec -Author: Oliver Bolte -Date: Wed Nov 16 09:18:25 2005 +0000 - - INTEGRATION: CWS thaiissues (1.8.6); FILE MERGED - 2005/10/26 20:42:39 khong 1.8.6.1: use icu thai linke break algorithm for thai breakiterator - -commit 4a1f1586173839d532f90507c72306bc9e2aec56 -Author: Oliver Bolte -Date: Wed Nov 16 09:18:11 2005 +0000 - - INTEGRATION: CWS thaiissues (1.9.4); FILE MERGED - 2005/10/28 17:54:39 khong 1.9.4.1: Fix a bug in ctl line break when there is word joiner character - -commit beb2a536738ba761a92f8266570f1859c85f94ae -Author: Rüdiger Timm -Date: Tue Nov 8 15:59:16 2005 +0000 - - INTEGRATION: CWS siloch (1.3.50); FILE MERGED - 2005/10/26 10:55:05 er 1.3.50.1: #i56347# apply patch to recognize suffixes of numbers in Hungarian spellchecking; contributed by Nemeth Laszlo - -commit 939e7c2bc93c13b6740051beeb08c5883b65ffce -Author: Kurt Zenker -Date: 
Fri Nov 4 14:33:30 2005 +0000 - - INTEGRATION: CWS i18n21 (1.3.46); FILE MERGED - 2005/10/21 00:35:09 khong 1.3.46.1: #i55778 reverse back last change, treat letter and number combination as one word. - -commit 51594ef552a872b9868e5c7a025a68665488a016 -Author: Kurt Zenker -Date: Fri Nov 4 14:33:16 2005 +0000 - - INTEGRATION: CWS i18n21 (1.2.2); FILE MERGED - 2005/10/21 00:35:08 khong 1.2.2.1: #i55778 reverse back last change, treat letter and number combination as one word. - -commit f4fe39909c7ed645a8b387cf66de249572226ad6 -Author: Kurt Zenker -Date: Fri Nov 4 14:33:03 2005 +0000 - - INTEGRATION: CWS i18n21 (1.3.46); FILE MERGED - 2005/10/21 00:35:08 khong 1.3.46.1: #i55778 reverse back last change, treat letter and number combination as one word. - -commit 7f8af14611e66655ea7354083eafd71afc9703e3 -Author: Kurt Zenker -Date: Fri Nov 4 14:32:41 2005 +0000 - - INTEGRATION: CWS i18n21 (1.4.46); FILE MERGED - 2005/10/21 00:35:07 khong 1.4.46.1: #i55778 reverse back last change, treat letter and number combination as one word. - -commit 924e158b9d871fbf7500e9215540e26aa95b3b20 -Author: Rüdiger Timm -Date: Mon Oct 17 14:43:17 2005 +0000 - - INTEGRATION: CWS i18n20 (1.1.2); FILE ADDED - 2005/09/22 23:47:49 khong 1.1.2.1: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. - -commit a428a8927006a10ccfe7182e6fe5a8b677281eca -Author: Rüdiger Timm -Date: Mon Oct 17 14:42:30 2005 +0000 - - INTEGRATION: CWS i18n20 (1.18.32); FILE MERGED - 2005/09/23 15:59:13 khong 1.18.32.6: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. - 2005/09/23 08:09:54 khong 1.18.32.5: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. - 2005/09/23 07:38:03 khong 1.18.32.4: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule - 2005/09/22 23:47:48 khong 1.18.32.3: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. 
- 2005/08/26 23:34:37 khong 1.18.32.2: #i50172# add cell breakiterator rule for Tamil - 2005/08/26 23:31:59 khong 1.18.32.1: #i50172# add cell breakiterator rule for Tamil - -commit f518f78557931b81e06fd7b31bb22c6639e5e553 -Author: Rüdiger Timm -Date: Mon Oct 17 14:42:14 2005 +0000 - - INTEGRATION: CWS i18n20 (1.6.32); FILE MERGED - 2005/09/23 15:59:13 khong 1.6.32.3: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. - 2005/09/23 07:38:02 khong 1.6.32.2: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule - 2005/09/22 23:47:48 khong 1.6.32.1: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. - -commit 9b870055ecd043d1d4fadeacd351f8739e1979a0 -Author: Vladimir Glazounov -Date: Fri Feb 25 09:08:13 2005 +0000 - - INTEGRATION: CWS i18n16 (1.16.22); FILE MERGED - 2005/02/04 19:05:45 khong 1.16.22.3: #i41671# use ICU rules for Thai breakiterator - 2005/01/24 21:56:34 khong 1.16.22.2: #i35285# merge cws i18n16 with top version 1.17 - 2005/01/12 01:12:41 khong 1.16.22.1: #i35285# remove uprv_malloc, use udata_open for loading icu rule breakiterator - -commit 29b9e86f5dac388d7aaced24d3826ac9331b03e3 -Author: Vladimir Glazounov -Date: Fri Feb 25 09:07:59 2005 +0000 +done, regression tests added: - INTEGRATION: CWS i18n16 (1.5.22); FILE MERGED - 2005/02/04 19:05:45 khong 1.5.22.1: #i41671# use ICU rules for Thai breakiterator +#112623# update Japanese word breakiterator dictionary +#i50172# add cell breakiterator rule for Tamil +#i80412# indic cursoring +#i107843# em-dash/en-dash breakiterator fix for spell checking +#i103552# Japanese word for 'shutdown' added to ja.dic +#i113785# ligatures for spell checking will no longer break words +An opening quote should not be counted as a word by word count tool (regression test in writer) +fdo#31271 wrong line break with ( +#i89042# word count fix (regression test is in writer) +#i58513# add break iterator rules for Finnish +#i19716#
fix wrong line break on bracket characters +#i21290# extend Greek script type +#i21907# fix isBeginWord and isEndWord problem +#i85411# Apply patch for ZWSP +#i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break +#i13451# add '-' as midLetter for Catalan dictionary word breakiterator +#i13494# fix word breakiterator rule to handle punctuations and signs correctly +#i29548# Fix Thai word breakiterator problem +#i11993# #i14904# fix word breakiterator issues +#i64400# dash/hyphen should not break words (de/nds/nl/sv) +#i22602# make dot stick on beginning of a word when doing line break +#i24098# skip preceding space for beginOfSentence +#i24098# fix beginOfSentence, which did not work correctly when cursor is on the beginning of the sentence +#i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +#i50172# add cell breakiterator rule for Tamil +#i55778# reverse back last change, treat letter and number combination as one word. +#i56347# apply patch to recognize suffixes of numbers in Hungarian spellchecking +#i56348# add Hungarian word break rule for edit mode +#i65267# fix line break rule +#i86439# many changes to implement, tweak, debug UTF-16 surrogate pair handling +#i75631# " +#i75632# " +#i75633# " +#i75412# " +#i80645# fix backslash issues in line breakiterator +#i80841# fix hyphen line break problem +#i81448# fixed dot line break issue +#i81448# fix the problem of line break on punctuations (commit message says i81440) +#i81448# fix problem of line break on symbols +#i83649# fixed the problem of line break between quotation mark and open bracket +#i83464# fix the problem of line break between letter and 1326 +b6634800# fix line break problem of dot after letter and before number +#i83229# fix the problem of leading hyphen for numbers +#i80815# count words like MS Word + +likely superseded: + +#i21392# Obscure line break behavior mismatch in string of symbols between MSO and LO. 
+#i80548# "fix dash issues in line breakiterator" - fix no longer works +#i72868# "fix Chinese punctuation for line breakiterator" - fix no longer works +#i80891# "fix Chinese punctuation for line breakiterator" - fix no longer works + +#i27711# Adding/tweaking/removing languages later added to ICU. +#i33756# " +#i41671# " +#i41671# " +#i55063# " +#i24850# ICU upgrades, internal bug fixes, or other work-arounds. +#i24098# " +#112772# " +#i35285# " +4a1f1586173839d532f90507c72306bc9e2aec56 " +a10b0e70c641d7438c557ef718c6942b3abffaec " +05fadde6f025bcaafca4f3093e88be3cc1bb6836 " +#i57866# " +#i57866# " +#i69482# " +#142664# " +#i60645# " +#i53388# " +#i60645# " +#i78393# " +#i73903# " +#i75412# " +#i72589# " +#i75319# " +#i76706# " +#i64400# " +#i64400# " +#i79148# " +#i55063# " +#i87530# " +#i88041# " +#i88411# " +#i80923# " +#i80923# " +#i81519# " + + +suspect: + + +- The intentions behind the following commits are unclear, as the referenced bugs were in the +StarOffice internal bug tracker. These changes are contemporaneous with TR14 Revision 17, and seem +to be part of an effort to backport upstream rule changes across multiple language customizations. commit 746ea3d8c29b27b23af3433446f66db0ad3096d6 Author: Oliver Bolte @@ -436,108 +133,17 @@ Date: Tue Jan 11 10:18:51 2005 +0000 INTEGRATION: CWS i18n15 (1.3.36); FILE MERGED 2004/09/04 02:03:53 khong 1.3.36.1: #117685# make dictionary word contain only letter or only number, dot can be in middle or end of a word, but only one. 
-commit e5a62ce85bebcc9fb2bf0e5b9aced5fc7748055b -Author: Oliver Bolte -Date: Tue Jan 11 10:18:37 2005 +0000 - - INTEGRATION: CWS i18n15 (1.16.4); FILE MERGED - 2004/10/07 18:19:11 khong 1.16.4.1: #i33756# update Hungarian breakiterator - -commit d2a6a31e6981800c2a920f8c6ff901c341a0466e -Author: Kurt Zenker -Date: Fri Jul 30 13:38:57 2004 +0000 - - INTEGRATION: CWS i18n13 (1.8.92); FILE MERGED - 2004/06/14 23:24:16 khong 1.8.92.2: #112772# Japanese word breakiterator is not correct - 2004/06/11 19:23:04 khong 1.8.92.1: #112772# Japanese word breakiterator is not correct -commit d6b8dabc3dc4811e1152d411a8428ccb334d16ab -Author: Kurt Zenker -Date: Fri Jul 30 13:38:17 2004 +0000 - - INTEGRATION: CWS i18n13 (1.7.162); FILE MERGED - 2004/06/11 19:23:04 khong 1.7.162.1: #112772# Japanese word breakiterator is not correct - -commit 9ea4c16a699ac7cf5e255a19653651ac993f022b -Author: Kurt Zenker -Date: Fri Jul 30 13:38:05 2004 +0000 - - INTEGRATION: CWS i18n13 (1.9.92); FILE MERGED - 2004/06/11 19:23:04 khong 1.9.92.1: #112772# Japanese word breakiterator is not correct +- The intention behind the following commit is unclear, as the bug references are incorrect and no +good candidates were immediately apparent. Based on the text of the commit, however, it appears to +be a simple bug fix for skipSpace(). This function has also had a great deal of churn since this +commit, further suggesting it is no longer pertinent. -commit 2887ecb5554eee699e1dce4ffbc2dfcf71a54a41 +commit 1967d8fb182b3101dee4f715e78be384400bc1e8 Author: Kurt Zenker -Date: Fri Jul 30 13:37:54 2004 +0000 - - INTEGRATION: CWS i18n13 (1.15.18); FILE MERGED - 2004/06/17 20:29:38 khong 1.15.18.2: # - 2004/06/02 04:54:24 khong 1.15.18.1: #i11993# fix getWordBoundary problem when position is on the end of the word. 
- -commit 606556eed208d1218f950df2200510a7e19af1d9 -Author: Oliver Bolte -Date: Fri May 28 15:33:28 2004 +0000 - - INTEGRATION: CWS i18n12 (1.1.2); FILE ADDED - 2004/04/30 14:37:52 er 1.1.2.1: #i27711# Hungarian breakiterator (provided by Timar Andras) - -commit 9710ca90166c18c0a92f7f0246a7c2f7dae87ebc -Author: Oliver Bolte -Date: Fri May 28 15:33:17 2004 +0000 - - INTEGRATION: CWS i18n12 (1.4.22); FILE MERGED - 2004/04/13 11:55:32 er 1.4.22.1: #i27711# Hungarian breakiterator - -commit b138663ef4f4ade38fb42f8a2f567527cf15949b -Author: Oliver Bolte -Date: Fri May 28 15:33:02 2004 +0000 - - INTEGRATION: CWS i18n12 (1.13.22); FILE MERGED - 2004/04/30 11:25:47 er 1.13.22.2: RESYNC: (1.13-1.14); FILE MERGED - 2004/04/13 11:55:32 er 1.13.22.1: #i27711# Hungarian breakiterator - -commit f5bc5f04e4de8fa502d498a99f4ef6a340d796c0 -Author: Oliver Bolte -Date: Wed Mar 17 08:02:14 2004 +0000 - - INTEGRATION: CWS i18n11 (1.13.14); FILE MERGED - 2004/02/04 02:09:04 khong 1.13.14.2: #i24098# skip preceding space for beginOfSentence - 2004/01/06 19:41:49 khong 1.13.14.1: #i24098# fix beginOfSentence, which did not work correctly when cursor is on the beginning of the sentence - -commit 16401a5b865b5da8a2dd70057e8b048e9b797d5a -Author: Oliver Bolte -Date: Wed Mar 17 08:02:01 2004 +0000 - - INTEGRATION: CWS i18n11 (1.12.14); FILE MERGED - 2004/02/10 14:21:13 er 1.12.14.3: RESYNC: (1.12-1.13); FILE MERGED - 2004/02/05 16:45:30 khong 1.12.14.2: #i24850# fix the problem in previousCharBlock, when target char block is in position 1 - 2004/02/04 02:13:48 khong 1.12.14.1: #i24098# check boundary condition for Sentence, Script, CharBlock breakiterator - -commit 4da98b648497af30de0fcf1a16e649ce18b0564f -Author: Jens-Heiner Rechtien -Date: Mon Mar 8 16:17:05 2004 +0000 - - INTEGRATION: CWS i18n09 (1.2.2); FILE MERGED - 2003/12/04 23:45:37 khong 1.2.2.3: #i22602# make dot stick on beginning of a word when doing line break - 2003/12/04 23:12:37 khong 1.2.2.2: #i21392# change line break rule to 
match with MS office +Date: Wed Sep 5 16:37:28 2007 +0000 -done, regression tests added: + INTEGRATION: CWS i18n37 (1.22.6); FILE MERGED + 2007/09/03 18:27:39 khong 1.22.6.2: i8132 fixed a problem in skipping space for word breakiterator + 2007/08/31 21:30:30 khong 1.22.6.1: i81158 fix skipping space problem -#112623# update Japanese word breakiterator dictionary -#i50172# add cell breakiterator rule for Tamil -#i80412# indic cursoring -#i107843# em-dash/en-dash breakiterator fix for spell checking -#i103552# Japanese word for 'shutdown' added to ja.dic -#i113785# ligatures for spell checking will no longer break words -An opening quote should not be counted as a word by word count tool (regression test in writer) -fdo#31271 wrong line break with ( -#i89042# word count fix (regression test is in writer) -#i58513# add break iterator rules for Finish -#i19716# fix wrong line break on bracket characters -#i21290# extend Greek script type -#i21907# fix isBeginWord and isEndWord problem -#i85411# Apply patch for ZWSP -#i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break -#i13451# add '-' as midLetter for Catalan dictionary word breakiterator -#i13494# fix word breakiterator rule to handle punctuations and signs correctly -#i29548# Fix Thai word breakiterator problem -#i11993# #i14904# fix word breakiterator issues -- 2.39.2