From 53702ea897ec00baa61bd191a3f9948ccfb176d0 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 24 May 2024 07:29:01 +0200 Subject: Adding debian version 4:24.2.3-2. Signed-off-by: Daniel Baumann --- .../reviewed-breakIterator-customizations.diff | 1269 ++++++++++++++++++++ 1 file changed, 1269 insertions(+) create mode 100644 debian/patches/reviewed-breakIterator-customizations.diff (limited to 'debian/patches/reviewed-breakIterator-customizations.diff') diff --git a/debian/patches/reviewed-breakIterator-customizations.diff b/debian/patches/reviewed-breakIterator-customizations.diff new file mode 100644 index 0000000000..80f9bd814e --- /dev/null +++ b/debian/patches/reviewed-breakIterator-customizations.diff @@ -0,0 +1,1269 @@ +From fb94cc0d1348140d03c2826771c57255ff74a94a Mon Sep 17 00:00:00 2001 +From: Jonathan Clark +Date: Thu, 11 Apr 2024 16:42:39 -0600 +Subject: [PATCH] tdf#49885 Reviewed BreakIterator customizations + +This change completes the review of BreakIterator rule customizations, +and adds unit tests for relevant customizations. 
+ +Change-Id: I06678fcccfc48d020aac64dd9f58ff36a763af30 +Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166017 +Tested-by: Jenkins +Reviewed-by: Eike Rathke +--- + i18npool/qa/cppunit/test_breakiterator.cxx | 559 +++++++++++++++++++ + i18npool/source/breakiterator/data/README | 612 ++++----------------- + 2 files changed, 668 insertions(+), 503 deletions(-) + +diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx +index 0f2629fe05ec..b33466bee46d 100644 +--- a/i18npool/qa/cppunit/test_breakiterator.cxx ++++ b/i18npool/qa/cppunit/test_breakiterator.cxx +@@ -31,6 +31,7 @@ public: + + void testLineBreaking(); + void testWordBoundaries(); ++ void testSentenceBoundaries(); + void testGraphemeIteration(); + void testWeak(); + void testAsian(); +@@ -43,9 +44,18 @@ public: + void testJapanese(); + void testChinese(); + ++ void testLegacyDictWordPrepostDash_de_DE(); ++ void testLegacyDictWordPrepostDash_nds_DE(); ++ void testLegacyDictWordPrepostDash_nl_NL(); ++ void testLegacyDictWordPrepostDash_sv_SE(); ++ void testLegacyHebrewQuoteInsideWord(); ++ void testLegacySurrogatePairs(); ++ void testLegacyWordCountCompat(); ++ + CPPUNIT_TEST_SUITE(TestBreakIterator); + CPPUNIT_TEST(testLineBreaking); + CPPUNIT_TEST(testWordBoundaries); ++ CPPUNIT_TEST(testSentenceBoundaries); + CPPUNIT_TEST(testGraphemeIteration); + CPPUNIT_TEST(testWeak); + CPPUNIT_TEST(testAsian); +@@ -57,6 +67,13 @@ public: + #endif + CPPUNIT_TEST(testJapanese); + CPPUNIT_TEST(testChinese); ++ CPPUNIT_TEST(testLegacyDictWordPrepostDash_de_DE); ++ CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE); ++ CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL); ++ CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE); ++ CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord); ++ CPPUNIT_TEST(testLegacySurrogatePairs); ++ CPPUNIT_TEST(testLegacyWordCountCompat); + CPPUNIT_TEST_SUITE_END(); + + private: +@@ -118,6 +135,173 @@ void TestBreakIterator::testLineBreaking() + } + } + ++ 
// i#22602: writer breaks word after dot immediately followed by a letter ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ //Here we want the line break to leave ./bar/baz clumped together on the next line ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "foo ./bar/baz", strlen("foo ./bar/ba"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period", ++ static_cast<sal_Int32>(4), aResult.breakIndex); ++ } ++ } ++ ++ // i#81448: slash and backslash make non-breaking spaces of preceding spaces ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Per the bug, the line break should leave ...BE clumped together on the next line. ++ // However, the current behavior does not wrap the string at all. This test asserts the ++ // current behavior as a point of reference. ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "THIS... ...BE", strlen("THIS... ...B"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aResult.breakIndex); ++ } ++ } ++ ++ // i#81448: slash and backslash make non-breaking spaces of preceding spaces ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // The line break should leave /BE clumped together on the next line. ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "THIS... /BE", strlen("THIS... /B"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(8), aResult.breakIndex); ++ } ++ } ++ ++ // i#80548: Bad word wrap between dash and word ++ { ++ aLocale.Language = "fi"; ++ aLocale.Country = "FI"; ++ ++ { ++ // Per the bug, the line break should leave -bar clumped together on the next line. ++ // However, this change was reverted at some point. This test asserts the new behavior. 
++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "foo -bar", strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash", ++ static_cast<sal_Int32>(5), aResult.breakIndex); ++ } ++ } ++ ++ // i#80645: Line erroneously breaks at backslash ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave C:\Program Files\ on the first line ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "C:\\Program Files\\LibreOffice", strlen("C:\\Program Files\\Libre"), aLocale, 0, ++ aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ } ++ } ++ ++ // i#80841: Words separated by hyphens will always break to next line ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave toll- on the first line ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "toll-free", strlen("toll-fr"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); ++ } ++ } ++ ++ // i#83464: Line break between letter and $ ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave US$ clumped on the next line. ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "word US$ 123", strlen("word U"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); ++ } ++ } ++ ++ // Unknown bug number: "fix line break problem of dot after letter and before number" ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave L.5 clumped on the next line. 
++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "word L.5 word", strlen("word L"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); ++ } ++ } ++ ++ // i#83229: Wrong line break when word contains a hyphen ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave 100- clumped on the first line. ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "word 100-199 word", strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex); ++ } ++ } ++ ++ // i#83649: Line break should be between typographical quote and left bracket ++ { ++ aLocale.Language = "de"; ++ aLocale.Country = "DE"; ++ ++ { ++ // Here we want the line break to leave »angetan werden« on the first line ++ const OUString str = u"»angetan werden« [Passiv]"_ustr; ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ } ++ } ++ ++ // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters ++ { ++ aLocale.Language = "zh"; ++ aLocale.Country = "HK"; ++ ++ { ++ // Per the bug, this should break at the ideographic comma. However, this change has ++ // been reverted at some point. This test only verifies current behavior. ++ const OUString str = u"word word、word word"_ustr; ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult.breakIndex); ++ } ++ } ++ ++ // i#80891: Character in the forbidden list sometimes appears at the start of line ++ { ++ aLocale.Language = "zh"; ++ aLocale.Country = "HK"; ++ ++ { ++ // Per the bug, the ideographic two-dot leader should be a forbidden character. 
However, ++ // this change seems to have been reverted or broken at some point. ++ const OUString str = u"電話︰電話"_ustr; ++ i18n::LineBreakResults aResult ++ = m_xBreak->getLineBreak(str, 2, aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aResult.breakIndex); ++ } ++ } ++ + //See https://bz.apache.org/ooo/show_bug.cgi?id=19716 + { + aLocale.Language = "en"; +@@ -160,6 +344,20 @@ void TestBreakIterator::testLineBreaking() + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex); + } + } ++ ++ // i#65267: Comma is badly broken at end of line ++ // - The word should be wrapped along with the comma ++ { ++ aLocale.Language = "de"; ++ aLocale.Country = "DE"; ++ ++ { ++ auto res = m_xBreak->getLineBreak("Wort -prinzessinnen, wort", ++ strlen("Wort -prinzessinnen,"), aLocale, 0, ++ aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex); ++ } ++ } + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=49629 +@@ -601,6 +799,174 @@ void TestBreakIterator::testWordBoundaries() + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + } ++ ++ // i#55778: Words containing numbers get broken up ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ static constexpr OUString aTest = u"first i18n third"_ustr; ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.endPos); ++ } ++ ++ // i#56347: "BreakIterator patch for Hungarian" ++ // Rules for Hungarian affixes after numbers and certain symbols ++ { ++ auto mode = i18n::WordType::DICTIONARY_WORD; ++ aLocale.Language = "hu"; ++ aLocale.Country = "HU"; ++ ++ OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); 
++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); ++ } ++ ++ // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian) ++ // Rules for Hungarian affixes after numbers and certain symbols in edit mode. ++ // The patch was merged, but the original bug was never closed and the current behavior seems ++ // identical to the ICU default behavior. Added this test to ensure that doesn't change. 
++ { ++ auto mode = i18n::WordType::ANY_WORD; ++ aLocale.Language = "hu"; ++ aLocale.Country = "HU"; ++ ++ OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos); 
++ CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testSentenceBoundaries() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ // Trivial characteristic test for sentence boundary detection ++ { ++ OUString aTest("This is a sentence. 
This is a different sentence."); ++ ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 5, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 5, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 31, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 31, aLocale)); ++ } ++ ++ // i#24098: i18n API beginOfSentence/endOfSentence ++ // fix beginOfSentence, ... when cursor is on the beginning of the sentence ++ { ++ OUString aTest("This is a sentence. This is a different sentence."); ++ ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 20, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 20, aLocale)); ++ } ++ ++ // i#24098: i18n API beginOfSentence/endOfSentence ++ // "skip preceding space for beginOfSentence" ++ { ++ OUString aTest("This is a sentence.     This is a different sentence."); ++ ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 20, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 20, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale)); ++ } + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=40292 +@@ -1043,6 +1409,199 @@ void TestBreakIterator::testChinese() + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } + } ++ ++void TestBreakIterator::testLegacyDictWordPrepostDash_de_DE() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "de"; ++ aLocale.Country = "DE"; ++ ++ { ++ auto aTest = u"Arbeits- -nehmer"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, 
i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyDictWordPrepostDash_nds_DE() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "nds"; ++ aLocale.Country = "DE"; ++ ++ { ++ auto aTest = u"Arbeits- -nehmer"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyDictWordPrepostDash_nl_NL() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "nl"; ++ aLocale.Country = "NL"; ++ ++ { ++ auto aTest = u"Arbeits- -nehmer"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "sv"; ++ aLocale.Country = "SE"; ++ ++ { ++ auto aTest = u"Arbeits- -nehmer"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, 
i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyHebrewQuoteInsideWord() ++{ ++ lang::Locale aLocale; ++ ++ aLocale.Language = "he"; ++ aLocale.Country = "IL"; ++ ++ { ++ auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacySurrogatePairs() ++{ ++ lang::Locale aLocale; ++ ++ aLocale.Language = "ja"; ++ aLocale.Country = "JP"; ++ ++ // i#75632: [surrogate pair] Japanese word break does not work properly for surrogate pairs. ++ // and many others to address bugs: i#75631 i#75633 i#75412 etc. ++ // ++ // BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test. 
++ { ++ const sal_Unicode buf[] = { u"X 𠮟 X" }; ++ OUString aTest(buf, SAL_N_ELEMENTS(buf)); ++ ++ auto aBounds ++ = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyWordCountCompat() ++{ ++ lang::Locale aLocale; ++ ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ // i#80815: "Word count differs from MS Word" ++ // This is a characteristic test for word count using test data from the linked bug. 
++ { ++ const OUString str = u"" ++ "test data for word count issue #80815\n" ++ "fo\\\'sforos\n" ++ "archipi\\\'elago\n" ++ "do\\^me\n" ++ "f**k\n" ++ "\n" ++ "battery-driven\n" ++ "and/or\n" ++ "apple(s)\n" ++ "money+opportunity\n" ++ "Micro$oft\n" ++ "\n" ++ "300$\n" ++ "I(not you)\n" ++ "a****n\n" ++ "1+3=4\n" ++ "\n" ++ "aaaaaaa.aaaaaaa\n" ++ "aaaaaaa,aaaaaaa\n" ++ "aaaaaaa;aaaaaaa\n"_ustr; ++ ++ int num_words = 0; ++ sal_Int32 next_pos = 0; ++ int iter_guard = 0; ++ while (true) ++ { ++ CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100); ++ ++ auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT); ++ ++ if (aBounds.endPos < next_pos) ++ { ++ break; ++ } ++ ++ next_pos = aBounds.endPos; ++ ++num_words; ++ } ++ ++ CPPUNIT_ASSERT_EQUAL(23, num_words); ++ } ++} ++ + void TestBreakIterator::setUp() + { + BootstrapFixtureBase::setUp(); +diff --git a/i18npool/source/breakiterator/data/README b/i18npool/source/breakiterator/data/README +index 6246b80ae77f..76e3e37c3faf 100644 +--- a/i18npool/source/breakiterator/data/README ++++ b/i18npool/source/breakiterator/data/README +@@ -9,411 +9,108 @@ At various stages these copies have been customized and are now horribly out of + sync. It unclear which diffs from the base versions are deliberate and which + are now accidental :-( + +-We need to review the various issues referenced in the commits that caused +-customizations and see if they're still relevant or not, write regression tests +-for them, if any are still relevant then apply the changes back on top of the +-latest versions. ++The various issues and customizations have been reviewed, with tests written for ++customizations that are still relevant. However, these files are still extremely ++out-of-date and need to be refreshed. Relevant customizations should be reapplied ++on top of a current version. 
+ +-to-review, later are ok: +- +-commit e1ad946ef5db3f7c0a540207d0f0fd85799e3b66 +-Author: Release Engineers +-Date: Thu Aug 6 18:13:57 2009 +0000 +- +- CWS-TOOLING: integrate CWS tl73 +- 2009-07-31 15:29:33 +0200 tl r274535 : #i64400# dash/hyphen should not break words +- +-commit 9964a76ef58786bba47d409970512d7ded6c8889 +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:53:05 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.1.2); FILE ADDED +- 2008/04/25 17:06:26 khong 1.1.2.3: i55063, make period a sentence delimiter +- 2008/04/25 06:40:50 khong 1.1.2.2: i55063, make space as Thai sentence delimiter +- 2008/04/24 03:19:10 khong 1.1.2.1: i55063, set Thai letters as sentence delimiter for Thai and English mixed text +- +-commit e4a6e4284dae1ca6fbfa7d1e43690dbf87d796cd +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:52:44 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.9.12); FILE MERGED +- 2008/06/17 20:22:30 khong 1.9.12.2: i83229 fix the problem of leading hyphen for numbers +- 2008/04/23 06:20:16 khong 1.9.12.1: i72868, i80891, i83229, fix Chinese punctuations and hyphen for line breakiterator +- +-commit 55dff22611659a1567c968fbf9e512a2765ab62e +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:52:07 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.33.36); FILE MERGED +- 2008/06/05 22:18:29 khong 1.33.36.2: RESYNC: (1.33-1.35); FILE MERGED +- 2008/04/23 06:11:55 khong 1.33.36.1: i55063, enable language specific sentence breakiterator +- +-commit 1c2b8095631a3c2d2f396bf50a8f0c62f49be65c +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:51:12 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.12.140); FILE MERGED +- 2008/06/05 22:18:26 khong 1.12.140.2: RESYNC: (1.12-1.13); FILE MERGED +- 2008/04/23 06:04:53 khong 1.12.140.1: i87530 avoid breaking line before un-completed cell +- +-commit 9bbdb52df370c69c0f7eba387a2068ee80bd7994 +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:50:43 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.25.2); FILE MERGED +- 2008/06/05 22:18:23 khong 1.25.2.2: RESYNC: (1.25-1.26); FILE 
MERGED +- 2008/04/23 06:09:02 khong 1.25.2.1: i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts +- +-commit 8dcdd3ca268f78295731b86797c2b8cd447ba667 +-Author: Kurt Zenker +-Date: Tue May 20 13:36:01 2008 +0000 +- +- INTEGRATION: CWS i18n43_DEV300 (1.33.38); FILE MERGED +- 2008/04/29 21:51:51 khong 1.33.38.1: #i88411# apply the patch from Coleman Kane to fix icu setBreakType issue +- +-commit bedef98c24ef9ada6aaffe9bc5284d9759a31a9a +-Author: Kurt Zenker +-Date: Wed Apr 2 08:49:09 2008 +0000 +- +- INTEGRATION: CWS i18n40 (1.2.314); FILE MERGED +- 2008/03/19 06:30:23 khong 1.2.314.2: #i80815# count dash like MS Word +- 2008/03/15 07:32:44 khong 1.2.314.1: #i80815# count punctuation as word +- +-commit 59144104b3f91a2e6ed816f0bde0fdb91ea218d7 +-Author: Kurt Zenker +-Date: Wed Apr 2 08:48:53 2008 +0000 +- +- INTEGRATION: CWS i18n40 (1.24.44); FILE MERGED +- 2008/03/19 18:56:42 khong 1.24.44.2: i80815 make word count feature like MS Word +- 2008/03/15 07:31:38 khong 1.24.44.1: #i80815# count punctuation as word +- +-commit 3f0b51776602c45e8aca991450fcbb30f2484ae5 +-Author: Vladimir Glazounov +-Date: Mon Jan 28 14:33:46 2008 +0000 +- +- INTEGRATION: CWS i18n39 (1.8.4); FILE MERGED +- 2007/12/12 17:45:45 khong 1.8.4.3: b6634800# fix line break problem of dot after letter and before number +- 2007/12/08 01:05:52 khong 1.8.4.2: #i83649# fixed the problem of line break between quotation mark and open bracket +- 2007/12/07 23:44:30 khong 1.8.4.1: #i83464# fix the problem of line break between letter and 1326 +- +-commit 5d8ef209b1f63d1c8ea5014bdbef96660b355423 +-Author: Vladimir Glazounov +-Date: Tue Oct 23 08:09:00 2007 +0000 +- +- INTEGRATION: CWS i18n38 (1.7.4); FILE MERGED +- 2007/09/19 00:08:04 khong 1.7.4.3: i81448 fixed dot line break issue +- 2007/09/10 23:57:12 khong 1.7.4.2: i81440 fix the problem of line break on punctuations +- 2007/09/10 22:55:46 khong 1.7.4.1: i81448 fix problem of line break on symbols +- +-commit 
a2f3b48cacfcef338ca5e37acde34c83876e082e +-Author: Vladimir Glazounov +-Date: Tue Oct 23 08:08:47 2007 +0000 +- +- INTEGRATION: CWS i18n38 (1.32.10); FILE MERGED +- 2007/09/18 20:32:39 khong 1.32.10.1: i81519 set break type icu breakiterator +- +-commit 1967d8fb182b3101dee4f715e78be384400bc1e8 +-Author: Kurt Zenker +-Date: Wed Sep 5 16:37:28 2007 +0000 +- +- INTEGRATION: CWS i18n37 (1.22.6); FILE MERGED +- 2007/09/03 18:27:39 khong 1.22.6.2: i8132 fixed a problem in skipping space for word breakiterator +- 2007/08/31 21:30:30 khong 1.22.6.1: i81158 fix skipping space problem +- +-commit d2c2baf1a31d281d20e8b4d4c806dda027b2d5a3 +-Author: Vladimir Glazounov +-Date: Tue Aug 28 11:46:45 2007 +0000 +- +- INTEGRATION: CWS i18n36_SRC680 (1.5.20.1.2); FILE MERGED +- 2007/08/22 17:12:36 khong 1.5.20.1.2.1: i80841 fix hyphen line break problem +- +-commit d56bedfb425cf77f176f143455e4a9fb6ce65540 +-Author: Vladimir Glazounov +-Date: Tue Aug 28 11:46:34 2007 +0000 +- +- INTEGRATION: CWS i18n36_SRC680 (1.21.2.1.2); FILE MERGED +- 2007/08/22 20:02:28 khong 1.21.2.1.2.2: i80923 fix infinite loop problem +- 2007/08/22 17:11:44 khong 1.21.2.1.2.1: i80923 fix a infinite loop +- +-commit 8a36b196925a5561eabde0a0ef293c73fcb5add3 +-Author: Ivo Hinkelmann +-Date: Fri Aug 17 13:58:48 2007 +0000 +- +- INTEGRATION: CWS i18n34 (1.5.22); FILE MERGED +- 2007/08/13 22:26:12 khong 1.5.22.1: i80548 i80645 fix dash and backslash issues in line breakiterator +- +-commit c00b2b49bad765144f90552139e63d87d520d1cf +-Author: Ivo Hinkelmann +-Date: Fri Aug 17 13:58:36 2007 +0000 +- +- INTEGRATION: CWS i18n34 (1.15.4); FILE MERGED +- 2007/08/13 22:33:38 khong 1.15.4.1: i86439 fix surrogate characters handling issues +- +-commit 3fc5fbc71d4c244d7c8002aa530481741e585bd4 +-Author: Ivo Hinkelmann +-Date: Fri Aug 17 13:58:23 2007 +0000 +- +- INTEGRATION: CWS i18n34 (1.31.4); FILE MERGED +- 2007/08/13 22:33:37 khong 1.31.4.1: i86439 fix surrogate characters handling issues +- +-commit 
ee44b43881e7c82c379931f111c452a477b73341 +-Author: Ivo Hinkelmann +-Date: Fri Aug 17 13:58:11 2007 +0000 +- +- INTEGRATION: CWS i18n34 (1.21.4); FILE MERGED +- 2007/08/14 08:38:53 khong 1.21.4.2: i86439 fix surrogate characters handling issues +- 2007/08/13 22:33:37 khong 1.21.4.1: i86439 fix surrogate characters handling issues +- +-commit f47369dbbc385f8968ad43e43cba293a29a4c2df +-Author: Jens-Heiner Rechtien +-Date: Tue Jul 31 16:09:13 2007 +0000 +- +- INTEGRATION: CWS i18n32 (1.29.14); FILE MERGED +- 2007/07/24 20:39:44 khong 1.29.14.1: #i79148# fix a local word breakiterator rules loading issue +- +-commit 2791553b4e3fc5e04b96d0b2fd119d9fba1946bc +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:51 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.14.60); FILE MERGED +- 2007/07/16 22:18:44 khong 1.14.60.4: i75631 i75632 i75633 i75412 handle surrogate pair characters +- 2007/07/13 20:37:32 khong 1.14.60.3: #i75632# use ICU characters properties +- 2007/07/04 01:17:22 khong 1.14.60.2: i75631 i75632 i75633 i75412 handle surrogate pair characters +- 2007/06/27 04:33:11 khong 1.14.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters +- +-commit 1c79a2bf1e89ac4eb409922ab7eb8ad3cacc688a +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:39 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.8.60); FILE MERGED +- 2007/06/27 04:33:11 khong 1.8.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters +- +-commit 517bbaddbaf81a5a6bb00979944cad13a1575d50 +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:27 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.28.14); FILE MERGED +- 2007/07/13 20:37:32 khong 1.28.14.5: #i75632# use ICU characters properties +- 2007/07/04 01:17:22 khong 1.28.14.4: i75631 i75632 i75633 i75412 handle surrogate pair characters +- 2007/06/27 23:25:58 khong 1.28.14.3: i75412 handle surrogate pair characters +- 2007/06/27 05:33:20 khong 1.28.14.2: RESYNC: (1.28-1.29); FILE MERGED +- 2007/06/27 04:33:11 khong 1.28.14.1: i75631 i75632 i75633 i75412 handle 
surrogate pair characters +- +-commit 0154e3492f2527535c0d648274e7ff674674318b +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:14 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.14.42); FILE MERGED +- 2007/06/27 05:33:03 khong 1.14.42.2: RESYNC: (1.14-1.15); FILE MERGED +- 2007/06/27 04:33:11 khong 1.14.42.1: i75631 i75632 i75633 i75412 handle surrogate pair characters +- +-commit e2a5a2532ee187669980adb7bfa747c7803c330a +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:02 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.19.60); FILE MERGED +- 2007/07/13 20:37:32 khong 1.19.60.4: #i75632# use ICU characters properties +- 2007/07/04 01:17:22 khong 1.19.60.3: i75631 i75632 i75633 i75412 handle surrogate pair characters +- 2007/06/27 05:00:48 khong 1.19.60.2: i75231 handle surrogate pair characters +- 2007/06/27 04:33:11 khong 1.19.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters +- +-commit 80a26a7d4720b5b8cfa0acc624b28014c96d9948 +-Author: Jens-Heiner Rechtien +-Date: Tue Jun 26 16:41:02 2007 +0000 +- +- INTEGRATION: CWS ause081 (1.2.332); FILE MERGED +- 2007/06/21 10:53:19 hjs 1.2.332.1: #i78393# remove component_getDescriptionFunc from exports +- +-commit c2801db6b04bf6f0dbb07727c91b2c66e7e027b8 +-Author: Ivo Hinkelmann +-Date: Wed Jun 6 11:17:38 2007 +0000 +- +- INTEGRATION: CWS i18n30 (1.4.24); FILE MERGED +- 2007/05/08 21:32:18 khong 1.4.24.1: #i73903# update line breakiterator rule to icu3.6 style +- +-commit ea290668f78475c3b277c9e44bf5622ccb4dcec8 +-Author: Ivo Hinkelmann +-Date: Wed Jun 6 11:17:25 2007 +0000 +- +- INTEGRATION: CWS i18n30 (1.28.4); FILE MERGED +- 2007/05/08 21:47:00 khong 1.28.4.3: #i75412# remove fix from cws i18n30, move it to other cws to fix with other Japanese surrogate issues +- 2007/03/20 18:39:58 khong 1.28.4.2: #i72589# fixed BS problem for surrogate characters +- 2007/03/13 19:11:44 khong 1.28.4.1: #i75319# fixed ANY_WORD rule loading problem +- +-commit b6308a6e322fd4eaa7845793beb70900624f351c +-Author: Ivo Hinkelmann 
+-Date: Wed Jun 6 11:17:12 2007 +0000 +- +- INTEGRATION: CWS i18n30 (1.14.32); FILE MERGED +- 2007/05/08 21:44:15 khong 1.14.32.1: #i76706# fix infinite loop for CJK word breakiterator for text mixed with Latin and CJK characters +- +-commit e068e0e9aa9405ea4016ad19e9a963129adfed79 +-Author: Rüdiger Timm +-Date: Thu Jan 25 08:35:42 2007 +0000 +- +- INTEGRATION: CWS i18n28 (1.1.2); FILE ADDED +- 2006/12/06 05:52:39 khong 1.1.2.1: #i64400# add an optional breakiterator entry in localedata +- +-commit 8d6f35a46085bb420e8896505504b376d17b842a +-Author: Rüdiger Timm +-Date: Thu Jan 25 08:35:31 2007 +0000 +- +- INTEGRATION: CWS i18n28 (1.24.36); FILE MERGED +- 2006/12/19 17:27:58 khong 1.24.36.2: RESYNC: (1.24-1.25); FILE MERGED +- 2006/12/06 05:52:38 khong 1.24.36.1: #i64400# add an optional breakiterator entry in localedata +- +-commit 633d34fa33330339ab6795ce3703477216e0062e +-Author: Kurt Zenker +-Date: Tue Dec 12 15:14:36 2006 +0000 +- +- INTEGRATION: CWS icuupgrade (1.9.24); FILE MERGED +- 2006/10/11 06:11:11 khong 1.9.24.4: RESYNC: (1.10-1.11); FILE MERGED +- 2006/07/07 10:57:40 hdu 1.9.24.3: RESYNC: (1.9-1.10); FILE MERGED +- 2006/06/30 01:31:40 khong 1.9.24.2: #i53388# upgrade icu to 3.4.1 +- 2006/06/15 19:16:55 khong 1.9.24.1: #i60645# upgrade icu to 3.4.1 +- +-commit 5d46dabe95271c846601a2575d3304fd5b4b24f1 +-Author: Kurt Zenker +-Date: Tue Dec 12 15:14:05 2006 +0000 +- +- INTEGRATION: CWS icuupgrade (1.22.20); FILE MERGED +- 2006/11/11 07:12:47 khong 1.22.20.6: #142664# fix breakiterator crash problem +- 2006/10/11 06:10:51 khong 1.22.20.5: RESYNC: (1.23-1.24); FILE MERGED +- 2006/09/06 01:00:31 khong 1.22.20.4: #i60645# upgrade to icu 3.6 +- 2006/07/07 10:57:32 hdu 1.22.20.3: RESYNC: (1.22-1.23); FILE MERGED +- 2006/06/30 01:31:40 khong 1.22.20.2: #i53388# upgrade icu to 3.4.1 +- 2006/06/20 14:27:26 hdu 1.22.20.1: #i60645# fix crash when udata_open failed +- +-commit 7431d816cdfc47b08978c0afd1f6503644bb11b8 +-Author: Kurt Zenker +-Date: Mon Nov 6 13:40:05 
2006 +0000 +- +- INTEGRATION: CWS i18n27 (1.3.142); FILE MERGED +- 2006/10/10 21:10:57 khong 1.3.142.1: #i65267# fix line break rule +- +-commit d7471e1462ffd9baeb3449eb86ccbb649e32b233 +-Author: Kurt Zenker +-Date: Mon Nov 6 13:39:52 2006 +0000 +- +- INTEGRATION: CWS i18n27 (1.1.2); FILE ADDED +- 2006/10/10 21:08:55 khong 1.1.2.1: #i56348# add Hungarian word break rule for edit mode +- +-commit 1b65b0b886e2cb16382bc11770230fb6a140f33b +-Author: Jens-Heiner Rechtien +-Date: Tue Oct 24 12:53:13 2006 +0000 +- +- INTEGRATION: CWS tl29 (1.12.24); FILE MERGED +- 2006/09/20 01:24:53 khong 1.12.24.1: #i69482# fixed mismatch of nextWord and getWordBoundary +- +-commit 97d89862a2285071202cc8010d888ffcbf96279a +-Author: Jens-Heiner Rechtien +-Date: Thu Nov 17 19:30:35 2005 +0000 +- +- INTEGRATION: CWS i18n23 (1.20.22); FILE MERGED +- 2005/11/17 20:00:37 khong 1.20.22.3: RESYNC: (1.20-1.21); FILE MERGED +- 2005/11/17 19:45:05 khong 1.20.22.2: #i57866# merge cws i18n23 and thaiissues +- 2005/11/15 21:10:24 khong 1.20.22.1: #i57866# fix line breakiterator problem +- +-commit 05fadde6f025bcaafca4f3093e88be3cc1bb6836 +-Author: Oliver Bolte +-Date: Wed Nov 16 09:18:37 2005 +0000 +- +- INTEGRATION: CWS thaiissues (1.20.6); FILE MERGED +- 2005/10/26 20:42:40 khong 1.20.6.2: use icu thai linke break algorithm for thai breakiterator +- 2005/10/26 13:36:24 fme 1.20.6.1: #i55716# Handling of WORDJOINER +- +-commit a10b0e70c641d7438c557ef718c6942b3abffaec +-Author: Oliver Bolte +-Date: Wed Nov 16 09:18:25 2005 +0000 +- +- INTEGRATION: CWS thaiissues (1.8.6); FILE MERGED +- 2005/10/26 20:42:39 khong 1.8.6.1: use icu thai linke break algorithm for thai breakiterator +- +-commit 4a1f1586173839d532f90507c72306bc9e2aec56 +-Author: Oliver Bolte +-Date: Wed Nov 16 09:18:11 2005 +0000 +- +- INTEGRATION: CWS thaiissues (1.9.4); FILE MERGED +- 2005/10/28 17:54:39 khong 1.9.4.1: Fix a bug in ctl line break when there is word joiner character +- +-commit beb2a536738ba761a92f8266570f1859c85f94ae 
+-Author: Rüdiger Timm +-Date: Tue Nov 8 15:59:16 2005 +0000 +- +- INTEGRATION: CWS siloch (1.3.50); FILE MERGED +- 2005/10/26 10:55:05 er 1.3.50.1: #i56347# apply patch to recognize suffixes of numbers in Hungarian spellchecking; contributed by Nemeth Laszlo +- +-commit 939e7c2bc93c13b6740051beeb08c5883b65ffce +-Author: Kurt Zenker +-Date: Fri Nov 4 14:33:30 2005 +0000 +- +- INTEGRATION: CWS i18n21 (1.3.46); FILE MERGED +- 2005/10/21 00:35:09 khong 1.3.46.1: #i55778 reverse back last change, treat letter and number combination as one word. +- +-commit 51594ef552a872b9868e5c7a025a68665488a016 +-Author: Kurt Zenker +-Date: Fri Nov 4 14:33:16 2005 +0000 +- +- INTEGRATION: CWS i18n21 (1.2.2); FILE MERGED +- 2005/10/21 00:35:08 khong 1.2.2.1: #i55778 reverse back last change, treat letter and number combination as one word. +- +-commit f4fe39909c7ed645a8b387cf66de249572226ad6 +-Author: Kurt Zenker +-Date: Fri Nov 4 14:33:03 2005 +0000 +- +- INTEGRATION: CWS i18n21 (1.3.46); FILE MERGED +- 2005/10/21 00:35:08 khong 1.3.46.1: #i55778 reverse back last change, treat letter and number combination as one word. +- +-commit 7f8af14611e66655ea7354083eafd71afc9703e3 +-Author: Kurt Zenker +-Date: Fri Nov 4 14:32:41 2005 +0000 +- +- INTEGRATION: CWS i18n21 (1.4.46); FILE MERGED +- 2005/10/21 00:35:07 khong 1.4.46.1: #i55778 reverse back last change, treat letter and number combination as one word. +- +-commit 924e158b9d871fbf7500e9215540e26aa95b3b20 +-Author: Rüdiger Timm +-Date: Mon Oct 17 14:43:17 2005 +0000 +- +- INTEGRATION: CWS i18n20 (1.1.2); FILE ADDED +- 2005/09/22 23:47:49 khong 1.1.2.1: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +- +-commit a428a8927006a10ccfe7182e6fe5a8b677281eca +-Author: Rüdiger Timm +-Date: Mon Oct 17 14:42:30 2005 +0000 +- +- INTEGRATION: CWS i18n20 (1.18.32); FILE MERGED +- 2005/09/23 15:59:13 khong 1.18.32.6: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. 
+- 2005/09/23 08:09:54 khong 1.18.32.5: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +- 2005/09/23 07:38:03 khong 1.18.32.4: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule +- 2005/09/22 23:47:48 khong 1.18.32.3: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +- 2005/08/26 23:34:37 khong 1.18.32.2: #i50172# add cell breakiterator rule for Tamil +- 2005/08/26 23:31:59 khong 1.18.32.1: #i50172# add cell breakiterator rule for Tamil +- +-commit f518f78557931b81e06fd7b31bb22c6639e5e553 +-Author: Rüdiger Timm +-Date: Mon Oct 17 14:42:14 2005 +0000 +- +- INTEGRATION: CWS i18n20 (1.6.32); FILE MERGED +- 2005/09/23 15:59:13 khong 1.6.32.3: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +- 2005/09/23 07:38:02 khong 1.6.32.2: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule +- 2005/09/22 23:47:48 khong 1.6.32.1: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. 
+- +-commit 9b870055ecd043d1d4fadeacd351f8739e1979a0 +-Author: Vladimir Glazounov +-Date: Fri Feb 25 09:08:13 2005 +0000 +- +- INTEGRATION: CWS i18n16 (1.16.22); FILE MERGED +- 2005/02/04 19:05:45 khong 1.16.22.3: #i41671# use ICU rules for Thai breakiterator +- 2005/01/24 21:56:34 khong 1.16.22.2: #i35285# merge cws i18n16 with top version 1.17 +- 2005/01/12 01:12:41 khong 1.16.22.1: #i35285# remove uprv_malloc, use udata_open for loading icu rule breakiterator +- +-commit 29b9e86f5dac388d7aaced24d3826ac9331b03e3 +-Author: Vladimir Glazounov +-Date: Fri Feb 25 09:07:59 2005 +0000 ++done, regression tests added: + +- INTEGRATION: CWS i18n16 (1.5.22); FILE MERGED +- 2005/02/04 19:05:45 khong 1.5.22.1: #i41671# use ICU rules for Thai breakiterator ++#112623# update Japanese word breakiterator dictionary ++#i50172# add cell breakiterator rule for Tamil ++#i80412# indic cursoring ++#i107843# em-dash/en-dash breakiterator fix for spell checking ++#i103552# Japanese word for 'shutdown' added to ja.dic ++#i113785# ligatures for spell checking will no longer break words ++An opening quote should not be counted as a word by word count tool (regression test in writer) ++fdo#31271 wrong line break with ( ++#i89042# word count fix (regression test is in writer) ++#i58513# add break iterator rules for Finnish ++#i19716# fix wrong line break on bracket characters ++#i21290# extend Greek script type ++#i21907# fix isBeginWord and isEndWord problem ++#i85411# Apply patch for ZWSP ++#i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break ++#i13451# add '-' as midLetter for Catalan dictionary word breakiterator ++#i13494# fix word breakiterator rule to handle punctuations and signs correctly ++#i29548# Fix Thai word breakiterator problem ++#i11993# #i14904# fix word breakiterator issues ++#i64400# dash/hyphen should not break words (de/nds/nl/sv) ++#i22602# make dot stick on beginning of a word when doing line break ++#i24098# skip
preceding space for beginOfSentence ++#i24098# fix beginOfSentence, which did not work correctly when cursor is on the beginning of the sentence ++#i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. ++#i50172# add cell breakiterator rule for Tamil ++#i55778# reverse back last change, treat letter and number combination as one word. ++#i56347# apply patch to recognize suffixes of numbers in Hungarian spellchecking ++#i56348# add Hungarian word break rule for edit mode ++#i65267# fix line break rule ++#i86439# many changes to implement, tweak, debug UTF-16 surrogate pair handling ++#i75631# " ++#i75632# " ++#i75633# " ++#i75412# " ++#i80645# fix backslash issues in line breakiterator ++#i80841# fix hyphen line break problem ++#i81448# fixed dot line break issue ++#i81448# fix the problem of line break on punctuations (commit message says i81440) ++#i81448# fix problem of line break on symbols ++#i83649# fixed the problem of line break between quotation mark and open bracket ++#i83464# fix the problem of line break between letter and 1326 ++b6634800# fix line break problem of dot after letter and before number ++#i83229# fix the problem of leading hyphen for numbers ++#i80815# count words like MS Word ++ ++likely superseded: ++ ++#i21392# Obscure line break behavior mismatch in string of symbols between MSO and LO. ++#i80548# "fix dash issues in line breakiterator" - fix no longer works ++#i72868# "fix Chinese punctuation for line breakiterator" - fix no longer works ++#i80891# "fix Chinese punctuation for line breakiterator" - fix no longer works ++ ++#i27711# Adding/tweaking/removing languages later added to ICU. ++#i33756# " ++#i41671# " ++#i41671# " ++#i55063# " ++#i24850# ICU upgrades, internal bug fixes, or other work-arounds. 
++#i24098# " ++#112772# " ++#i35285# " ++4a1f1586173839d532f90507c72306bc9e2aec56 " ++a10b0e70c641d7438c557ef718c6942b3abffaec " ++05fadde6f025bcaafca4f3093e88be3cc1bb6836 " ++#i57866# " ++#i57866# " ++#i69482# " ++#142664# " ++#i60645# " ++#i53388# " ++#i60645# " ++#i78393# " ++#i73903# " ++#i75412# " ++#i72589# " ++#i75319# " ++#i76706# " ++#i64400# " ++#i64400# " ++#i79148# " ++#i55063# " ++#i87530# " ++#i88041# " ++#i88411# " ++#i80923# " ++#i80923# " ++#i81519# " ++ ++ ++suspect: ++ ++ ++- The intentions behind the following commits are unclear, as the referenced bugs were in the ++StarOffice internal bug tracker. These changes are contemporaneous with TR14 Revision 17, and seem ++to be part of an effort to backport upstream rule changes across multiple language customizations. + + commit 746ea3d8c29b27b23af3433446f66db0ad3096d6 + Author: Oliver Bolte +@@ -436,108 +133,17 @@ Date: Tue Jan 11 10:18:51 2005 +0000 + INTEGRATION: CWS i18n15 (1.3.36); FILE MERGED + 2004/09/04 02:03:53 khong 1.3.36.1: #117685# make dictionary word contain only letter or only number, dot can be in middle or end of a word, but only one. 
+ +-commit e5a62ce85bebcc9fb2bf0e5b9aced5fc7748055b +-Author: Oliver Bolte +-Date: Tue Jan 11 10:18:37 2005 +0000 +- +- INTEGRATION: CWS i18n15 (1.16.4); FILE MERGED +- 2004/10/07 18:19:11 khong 1.16.4.1: #i33756# update Hungarian breakiterator +- +-commit d2a6a31e6981800c2a920f8c6ff901c341a0466e +-Author: Kurt Zenker +-Date: Fri Jul 30 13:38:57 2004 +0000 +- +- INTEGRATION: CWS i18n13 (1.8.92); FILE MERGED +- 2004/06/14 23:24:16 khong 1.8.92.2: #112772# Japanese word breakiterator is not correct +- 2004/06/11 19:23:04 khong 1.8.92.1: #112772# Japanese word breakiterator is not correct + +-commit d6b8dabc3dc4811e1152d411a8428ccb334d16ab +-Author: Kurt Zenker +-Date: Fri Jul 30 13:38:17 2004 +0000 +- +- INTEGRATION: CWS i18n13 (1.7.162); FILE MERGED +- 2004/06/11 19:23:04 khong 1.7.162.1: #112772# Japanese word breakiterator is not correct +- +-commit 9ea4c16a699ac7cf5e255a19653651ac993f022b +-Author: Kurt Zenker +-Date: Fri Jul 30 13:38:05 2004 +0000 +- +- INTEGRATION: CWS i18n13 (1.9.92); FILE MERGED +- 2004/06/11 19:23:04 khong 1.9.92.1: #112772# Japanese word breakiterator is not correct ++- The intention behind the following commit is unclear, as the bug references are incorrect and no ++good candidates were immediately apparent. Based on the text of the commit, however, it appears to ++be a simple bug fix for skipSpace(). This function has also had a great deal of churn since this ++commit, further suggesting it is no longer pertinent. + +-commit 2887ecb5554eee699e1dce4ffbc2dfcf71a54a41 ++commit 1967d8fb182b3101dee4f715e78be384400bc1e8 + Author: Kurt Zenker +-Date: Fri Jul 30 13:37:54 2004 +0000 +- +- INTEGRATION: CWS i18n13 (1.15.18); FILE MERGED +- 2004/06/17 20:29:38 khong 1.15.18.2: # +- 2004/06/02 04:54:24 khong 1.15.18.1: #i11993# fix getWordBoundary problem when position is on the end of the word. 
+- +-commit 606556eed208d1218f950df2200510a7e19af1d9 +-Author: Oliver Bolte +-Date: Fri May 28 15:33:28 2004 +0000 +- +- INTEGRATION: CWS i18n12 (1.1.2); FILE ADDED +- 2004/04/30 14:37:52 er 1.1.2.1: #i27711# Hungarian breakiterator (provided by Timar Andras) +- +-commit 9710ca90166c18c0a92f7f0246a7c2f7dae87ebc +-Author: Oliver Bolte +-Date: Fri May 28 15:33:17 2004 +0000 +- +- INTEGRATION: CWS i18n12 (1.4.22); FILE MERGED +- 2004/04/13 11:55:32 er 1.4.22.1: #i27711# Hungarian breakiterator +- +-commit b138663ef4f4ade38fb42f8a2f567527cf15949b +-Author: Oliver Bolte +-Date: Fri May 28 15:33:02 2004 +0000 +- +- INTEGRATION: CWS i18n12 (1.13.22); FILE MERGED +- 2004/04/30 11:25:47 er 1.13.22.2: RESYNC: (1.13-1.14); FILE MERGED +- 2004/04/13 11:55:32 er 1.13.22.1: #i27711# Hungarian breakiterator +- +-commit f5bc5f04e4de8fa502d498a99f4ef6a340d796c0 +-Author: Oliver Bolte +-Date: Wed Mar 17 08:02:14 2004 +0000 +- +- INTEGRATION: CWS i18n11 (1.13.14); FILE MERGED +- 2004/02/04 02:09:04 khong 1.13.14.2: #i24098# skip preceding space for beginOfSentence +- 2004/01/06 19:41:49 khong 1.13.14.1: #i24098# fix beginOfSentence, which did not work correctly when cursor is on the beginning of the sentence +- +-commit 16401a5b865b5da8a2dd70057e8b048e9b797d5a +-Author: Oliver Bolte +-Date: Wed Mar 17 08:02:01 2004 +0000 +- +- INTEGRATION: CWS i18n11 (1.12.14); FILE MERGED +- 2004/02/10 14:21:13 er 1.12.14.3: RESYNC: (1.12-1.13); FILE MERGED +- 2004/02/05 16:45:30 khong 1.12.14.2: #i24850# fix the problem in previousCharBlock, when target char block is in position 1 +- 2004/02/04 02:13:48 khong 1.12.14.1: #i24098# check boundary condition for Sentence, Script, CharBlock breakiterator +- +-commit 4da98b648497af30de0fcf1a16e649ce18b0564f +-Author: Jens-Heiner Rechtien +-Date: Mon Mar 8 16:17:05 2004 +0000 +- +- INTEGRATION: CWS i18n09 (1.2.2); FILE MERGED +- 2003/12/04 23:45:37 khong 1.2.2.3: #i22602# make dot stick on beginning of a word when doing line break +- 2003/12/04 23:12:37 
khong 1.2.2.2: #i21392# change line break rule to match with MS office ++Date: Wed Sep 5 16:37:28 2007 +0000 + +-done, regression tests added: ++ INTEGRATION: CWS i18n37 (1.22.6); FILE MERGED ++ 2007/09/03 18:27:39 khong 1.22.6.2: i8132 fixed a problem in skipping space for word breakiterator ++ 2007/08/31 21:30:30 khong 1.22.6.1: i81158 fix skipping space problem + +-#112623# update Japanese word breakiterator dictionary +-#i50172# add cell breakiterator rule for Tamil +-#i80412# indic cursoring +-#i107843# em-dash/en-dash breakiterator fix for spell checking +-#i103552# Japanese word for 'shutdown' added to ja.dic +-#i113785# ligatures for spell checking will no longer break words +-An opening quote should not be counted as a word by word count tool (regression test in writer) +-fdo#31271 wrong line break with ( +-#i89042# word count fix (regression test is in writer) +-#i58513# add break iterator rules for Finish +-#i19716# fix wrong line break on bracket characters +-#i21290# extend Greek script type +-#i21907# fix isBeginWord and isEndWord problem +-#i85411# Apply patch for ZWSP +-#i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break +-#i13451# add '-' as midLetter for Catalan dictionary word breakiterator +-#i13494# fix word breakiterator rule to handle punctuations and signs correctly +-#i29548# Fix Thai word breakiterator problem +-#i11993# #i14904# fix word breakiterator issues +-- +2.39.2 + -- cgit v1.2.3