From 53702ea897ec00baa61bd191a3f9948ccfb176d0 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 24 May 2024 07:29:01 +0200 Subject: Adding debian version 4:24.2.3-2. Signed-off-by: Daniel Baumann --- debian/patches/breakiterator-updates.diff | 3620 ++++++++++++++++++++ debian/patches/icu-74.1.diff | 71 + .../reviewed-breakIterator-customizations.diff | 1269 +++++++ debian/patches/series | 4 + debian/patches/use-PyConfig.diff | 80 + 5 files changed, 5044 insertions(+) create mode 100644 debian/patches/breakiterator-updates.diff create mode 100644 debian/patches/icu-74.1.diff create mode 100644 debian/patches/reviewed-breakIterator-customizations.diff create mode 100644 debian/patches/use-PyConfig.diff (limited to 'debian/patches') diff --git a/debian/patches/breakiterator-updates.diff b/debian/patches/breakiterator-updates.diff new file mode 100644 index 0000000000..8ac9cdbb5d --- /dev/null +++ b/debian/patches/breakiterator-updates.diff @@ -0,0 +1,3620 @@ +From 5b688b03a916a0f6127c7aba891bf613cff0de0b Mon Sep 17 00:00:00 2001 +From: Jonathan Clark +Date: Wed, 17 Apr 2024 09:09:50 -0600 +Subject: [PATCH] tdf#49885 BreakIterator rule upgrades + +This change re-bases the BreakIterator rule customizations on top of a +clean copy of the ICU 74.2 rules. 
+ +Change-Id: Iadcf16cab138cc6c869fac61ad64e996e65b5ae4 +--- + i18npool/CustomTarget_breakiterator.mk | 6 +- + i18npool/qa/cppunit/test_breakiterator.cxx | 356 +++++---- + .../source/breakiterator/data/dict_word.txt | 267 ++++--- + .../breakiterator/data/dict_word_he.txt | 139 ---- + .../breakiterator/data/dict_word_hu.txt | 324 +++++---- + .../breakiterator/data/dict_word_nodash.txt | 147 ---- + .../data/dict_word_prepostdash.txt | 288 +++++--- + .../source/breakiterator/data/edit_word.txt | 261 ++++--- + .../breakiterator/data/edit_word_he.txt | 142 ---- + .../breakiterator/data/edit_word_hu.txt | 294 +++++--- + i18npool/source/breakiterator/data/line.txt | 680 ++++++------------ + i18npool/source/breakiterator/data/sent.txt | 128 ---- + 12 files changed, 1307 insertions(+), 1725 deletions(-) + delete mode 100644 i18npool/source/breakiterator/data/dict_word_he.txt + delete mode 100644 i18npool/source/breakiterator/data/dict_word_nodash.txt + delete mode 100644 i18npool/source/breakiterator/data/edit_word_he.txt + delete mode 100644 i18npool/source/breakiterator/data/sent.txt + +diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk +index 8229a5e8f314..ef951142837a 100644 +--- a/i18npool/CustomTarget_breakiterator.mk ++++ b/i18npool/CustomTarget_breakiterator.mk +@@ -45,16 +45,12 @@ endif + + i18npool_BRKTXTS := \ + count_word.brk \ +- $(call gb_Helper_optional_locale,he,dict_word_he.brk) \ + $(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \ +- dict_word_nodash.brk \ + dict_word_prepostdash.brk \ + dict_word.brk \ +- $(call gb_Helper_optional_locale,he,edit_word_he.brk) \ + $(call gb_Helper_optional_locale,hu,edit_word_hu.brk) \ + edit_word.brk \ +- line.brk \ +- sent.brk ++ line.brk + + # 'gencmn', 'genbrk' and 'genccode' are tools generated and delivered by icu project to process icu breakiterator rules. + # The output of gencmn generates warnings under Windows. 
We want to minimize the patches to external tools, +diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx +index b33466bee46d..2a35b2eee58f 100644 +--- a/i18npool/qa/cppunit/test_breakiterator.cxx ++++ b/i18npool/qa/cppunit/test_breakiterator.cxx +@@ -184,11 +184,10 @@ void TestBreakIterator::testLineBreaking() + + { + // Per the bug, the line break should leave -bar clumped together on the next line. +- // However, this change was reverted at some point. This test asserts the new behavior. + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "foo -bar", strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash", +- static_cast<sal_Int32>(5), aResult.breakIndex); ++ static_cast<sal_Int32>(4), aResult.breakIndex); + } + } + +@@ -198,11 +197,29 @@ void TestBreakIterator::testLineBreaking() + aLocale.Country = "US"; + + { +- // Here we want the line break to leave C:\Program Files\ on the first line ++ // Note that the current behavior deviates from the original fix for this bug. ++ // ++ // The original report was filed due to wrapping all of "\Program Files\aaaa" to the ++ // next line, even though only "aaaa" overflowed. The original fix was to simply make ++ // U+005C reverse solidus (backslash) a breaking character. ++ // ++ // However, the root cause for this bug was not the behavior of '\', but rather some ++ // other bug making all of "\Program Files\" behave like a single token, despite it ++ // even containing whitespace. ++ // ++ // Reverting to the ICU line rules fixes this root issue. Now, in the following, ++ // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also ++ // consistent with the behavior of other office programs.
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "C:\\Program Files\\LibreOffice", strlen("C:\\Program Files\\Libre"), aLocale, 0, + aHyphOptions, aUserOptions); +- CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex); ++ ++ // An identical result should be generated for solidus. ++ aResult = m_xBreak->getLineBreak( ++ "C:/Program Files/LibreOffice", strlen("C:/Program Files/Libre"), aLocale, 0, ++ aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex); + } + } + +@@ -251,23 +268,125 @@ void TestBreakIterator::testLineBreaking() + aLocale.Country = "US"; + + { ++ // The root cause for this bug was the Unicode standard introducing special treatment ++ // for '-' in a number range context. This change makes number ranges (e.g. "100-199") ++ // behave as if they are single tokens for the purposes of line breaking. Unfortunately, ++ // this caused a significant appearance change to existing documents. ++ // ++ // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping ++ // number ranges as a single token is consistent with other applications, including web ++ // browsers, and other office suites as mentioned in the bug discussion. Removing this ++ // customization seems like it would be a major change, however. ++ // + // Here we want the line break to leave 100- clumped on the first line.
++ + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "word 100-199 word", strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions); +- CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex); ++ } ++ ++ { ++ // From the same bug: "the leading minus must stay with numbers and strings" ++ ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "range of -100.000 to 100.000", strlen("range of -1"), aLocale, 0, ++ aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex); ++ ++ constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr; ++ aResult = m_xBreak->getLineBreak( ++ str, strlen("range of -"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex); + } +- } + +- // i#83649: Line break should be between typographical quote and left bracket +- { + aLocale.Language = "de"; + aLocale.Country = "DE"; + + { +- // Here we want the line break to leave »angetan werden« on the first line ++ // From the same bug: "the leading minus must stay with numbers and strings" ++ ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "EURO is -10,50", strlen("EURO is -1"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex); ++ ++ // Also the mathematical minus sign: ++ ++ constexpr OUString str = u"EURO is \u221210,50"_ustr; ++ aResult = m_xBreak->getLineBreak( ++ str, strlen("EURO is -"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex); ++ } ++ ++ { ++ // From the same bug: "the leading minus must stay with numbers and strings" ++ ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "und -kosten", strlen("und -ko"), aLocale, 0, ++ aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex); ++ ++ // But not the non-breaking hyphen: ++ ++ constexpr OUString str = u"und 
\u2011"_ustr; ++ aResult = m_xBreak->getLineBreak( ++ str, strlen("und -ko"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex); ++ } ++ } ++ ++ // i#83649: "Line break should be between typographical quote and left bracket" ++ // - Actually: Spaces between quotation mark and opening punctuation not treated as a break. ++ // - Note that per the Unicode standard, prohibiting breaks in this context is intentional ++ // because it may cause issues in certain languages due to the various ways quotation ++ // characters are used. ++ // - We do it anyway by customizing the ICU line breaking rules. ++ { ++ { ++ // This uses the sample text provided in the bug report. Based on usage, it is assumed ++ // they were in the de_DE locale. ++ ++ aLocale.Language = "de"; ++ aLocale.Country = "DE"; ++ ++ // Per the bug report, it is expected that »angetan werden« remains on the first line. + const OUString str = u"»angetan werden« [Passiv]"_ustr; + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( +- str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions); ++ str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ ++ // The same result should be returned for this and the first case. ++ const OUString str2 = u"»angetan werden« Passiv"_ustr; ++ aResult = m_xBreak->getLineBreak( ++ str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ ++ // Under ICU rules, no amount of spaces would cause this to wrap.
++ const OUString str3 = u"»angetan werden«    [Passiv]"_ustr; ++ aResult = m_xBreak->getLineBreak( ++ str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), aResult.breakIndex); ++ ++ // However, tabs will ++ const OUString str4 = u"»angetan werden«\t[Passiv]"_ustr; ++ aResult = m_xBreak->getLineBreak( ++ str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ } ++ ++ { ++ // The same behavior is seen in English ++ ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ const OUString str = u"\"angetan werden\" [Passiv]"_ustr; ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ ++ const OUString str2 = u"\"angetan werden\" Passiv"_ustr; ++ aResult = m_xBreak->getLineBreak( ++ str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); + } + } +@@ -355,7 +474,7 @@ void TestBreakIterator::testLineBreaking() + auto res = m_xBreak->getLineBreak("Wort -prinzessinnen, wort", + strlen("Wort -prinzessinnen,"), aLocale, 0, + aHyphOptions, aUserOptions); +- CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex); + } + } + } +@@ -638,7 +757,8 @@ void TestBreakIterator::testWordBoundaries() + CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i); + } + +- //See https://bz.apache.org/ooo/show_bug.cgi?id=85411 ++ // i#85411: ZWSP should be a word separator for spellchecking ++ // - This fix was applied to both dict and edit customizations + for (int j = 0; j < 3; ++j) + { + switch (j) +@@ -660,21 +780,23 @@ void TestBreakIterator::testWordBoundaries() + break; + } + +- static constexpr OUString aTest = +- u"I\u200Bwant\u200Bto\u200Bgo"_ustr; ++ static constexpr OUString aTest = 
u"I\u200Bwant\u200Bto\u200Bgo"_ustr; + + sal_Int32 nPos = 0; +- sal_Int32 aExpected[] = {1, 6, 9, 12}; ++ sal_Int32 aExpected[] = { 1, 6, 9, 12 }; + size_t i = 0; + do + { + CPPUNIT_ASSERT(i < std::size(aExpected)); +- nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, +- i18n::WordType::DICTIONARY_WORD, true).endPos; +- CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); ++ auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, ++ i18n::WordType::DICTIONARY_WORD, true); ++ CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos); ++ auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, ++ i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); ++ CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos); ++ nPos = dwPos.endPos; + ++i; +- } +- while (nPos++ < aTest.getLength()); ++ } while (nPos++ < aTest.getLength()); + CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i); + } + +@@ -814,121 +936,45 @@ void TestBreakIterator::testWordBoundaries() + } + + // i#56347: "BreakIterator patch for Hungarian" +- // Rules for Hungarian affixes after numbers and certain symbols +- { +- auto mode = i18n::WordType::DICTIONARY_WORD; +- aLocale.Language = "hu"; +- aLocale.Country = "HU"; +- +- OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; +- +- aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); +- +- 
aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); +- } +- + // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian) +- // Rules for Hungarian affixes after numbers and certain symbols in edit mode. +- // The patch was merged, but the original bug was never closed and the current behavior seems +- // identical to the ICU default behavior. Added this test to ensure that doesn't change. ++ // Rules for Hungarian affixes after numbers and certain symbols + { +- auto mode = i18n::WordType::ANY_WORD; + aLocale.Language = "hu"; + aLocale.Country = "HU"; + + OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; + +- aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true); +- 
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos); +- +- aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); ++ for (auto mode : ++ { i18n::WordType::DICTIONARY_WORD, i18n::WordType::ANYWORD_IGNOREWHITESPACES }) ++ { ++ aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + +- aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos); ++ aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + +- aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); +- 
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos); ++ aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); + +- aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos); ++ aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); + +- aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); ++ aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + +- aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos); ++ aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + +- aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); +- CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); ++ aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); ++ } + } + } + +@@ -967,6 +1013,56 @@ void TestBreakIterator::testSentenceBoundaries() + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale)); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale)); + } ++ ++ // i#55063: Sentence selection in Thai should select a space-delimited phrase. ++ // - This customization broke at some point. It works in an English locale in a synthetic test ++ // like this one, but does not work in the Thai locale, nor on Thai text in practice. ++ { ++ static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr; ++ ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale)); ++ ++ aLocale.Language = "th"; ++ aLocale.Country = "TH"; ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale)); ++ } ++ ++ // i#55063: Thai phrases should delimit English sentence selection. ++ // - This customization broke at some point. It works in an English locale in a synthetic test ++ // like this one, but does not work in the Thai locale, nor on Thai text in practice. ++ { ++ static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr; ++ ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale)); ++ ++ aLocale.Language = "th"; ++ aLocale.Country = "TH"; ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale)); ++ } ++ ++ // i#55063: Characteristic test for English text delimiting Thai phrases (sentences) ++ // - English text should not delimit Thai phrases. 
++ { ++ static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr; ++ ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale)); ++ ++ aLocale.Language = "th"; ++ aLocale.Country = "TH"; ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale)); ++ } + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=40292 +@@ -1501,6 +1597,7 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord() + aLocale.Language = "he"; + aLocale.Country = "IL"; + ++ // i#51661: Add quotation mark as middle letter for Hebrew + { + auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; + +@@ -1514,6 +1611,21 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord() + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + } ++ ++ // i#51661: Add quotation mark as middle letter for Hebrew ++ { ++ auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; ++ ++ i18n::Boundary aBounds = m_xBreak->getWordBoundary( ++ aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, ++ i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); ++ } + } + + void TestBreakIterator::testLegacySurrogatePairs() +diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt +index b1666f44daab..f804b0eec214 100644 +--- a/i18npool/source/breakiterator/data/dict_word.txt ++++ b/i18npool/source/breakiterator/data/dict_word.txt +@@ 
-1,148 +1,199 @@ + # +-# Copyright (C) 2002-2003, International Business Machines Corporation and others. +-# All Rights Reserved. ++# Copyright (C) 2016 and later: Unicode, Inc. and others. ++# License & terms of use: http://www.unicode.org/copyright.html ++# Copyright (C) 2002-2016, International Business Machines Corporation ++# and others. All Rights Reserved. + # +-# file: dict_word.txt ++# file: word.txt + # +-# ICU Word Break Rules ++# ICU Word Break Rules + # See Unicode Standard Annex #29. +-# These rules are based on Version 4.0.0, dated 2003-04-17 ++# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 + # ++# Note: Updates to word.txt will usually need to be merged into ++# word_POSIX.txt also. + +- +- +-#################################################################################### ++############################################################################## + # + # Character class definitions from TR 29 + # +-#################################################################################### +-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +- +-$Ideographic = [:Ideographic:]; +-$Hangul = [:Script = HANGUL:]; +- +-$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] +- - $Ideographic +- - $Katakana +- - $Hangul +- - [:Script = Thai:] +- - [:Script = Lao:] +- - [:Script = Hiragana:]]; +- +-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] +- [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] +- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] +- [:name = HYPHEN-MINUS:] ]; +- +-$SufixLetter = 
[:name= FULL STOP:]; +- +- +-$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] +- [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] +- [:name = PRIME:]]; +-$Numeric = [:LineBreak = Numeric:]; +- +- +-$TheZWSP = \u200b; ++############################################################################## ++ ++### BEGIN CUSTOMIZATION ++### This file contains LibreOffice-specific rule customizations. ++### ++### To aid future maintainability: ++### - The change location should be bracketed by comments of this form. ++### - The original rule should be commented out, and the modified rule placed alongside. ++### - By doing this, maintainers can more easily compare to an upstream baseline. ++### ++### END CUSTOMIZATION ++ ++!!chain; ++!!quoted_literals_only; ++ + + # + # Character Class Definitions. +-# The names are those from TR29. + # +-$CR = \u000d; +-$LF = \u000a; +-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +-$Extend = [[:Grapheme_Extend = TRUE:]]; + ++$Han = [:Han:]; + ++$CR = [\p{Word_Break = CR}]; ++$LF = [\p{Word_Break = LF}]; ++$Newline = [\p{Word_Break = Newline}]; ++$Extend = [\p{Word_Break = Extend}-$Han]; ++$ZWJ = [\p{Word_Break = ZWJ}]; ++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; ++$Format = [\p{Word_Break = Format}]; ++$Katakana = [\p{Word_Break = Katakana}]; ++$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; ++$ALetter = [\p{Word_Break = ALetter}]; ++$Single_Quote = [\p{Word_Break = Single_Quote}]; ++$Double_Quote = [\p{Word_Break = Double_Quote}]; ++$MidNumLet = [\p{Word_Break = MidNumLet}]; ++$MidNum = [\p{Word_Break = MidNum}]; ++$Numeric = [\p{Word_Break = Numeric}]; ++$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; ++$WSegSpace = [\p{Word_Break = WSegSpace}]; ++$Extended_Pict = [\p{Extended_Pictographic}]; + ++### BEGIN CUSTOMIZATION ++### Unknown issue number: 
Dictionary words can contain hyphens ++### tdf#49885: Sync custom BreakIterator rules with ICU originals ++### - ICU is now more permissive about punctuation inside words. ++### - For compatibility, exclude certain characters that were previously excluded. + +-#################################################################################### +-# +-# Word Break Rules. Definitions and Rules specific to word break begin Here. +-# +-#################################################################################### ++$IncludedML = [:name = HYPHEN-MINUS:]; ++$ExcludedML = [[:name = COLON:] ++ [:name = GREEK ANO TELEIA:] ++ [:name = PRESENTATION FORM FOR VERTICAL COLON:] ++ [:name = SMALL COLON:] ++ [:name = FULLWIDTH COLON:]]; + +-$Format = [[:Cf:] - $TheZWSP]; ++# $MidLetter = [\p{Word_Break = MidLetter}]; ++$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; + ++### END CUSTOMIZATION + ++$Hiragana = [:Hiragana:]; ++$Ideographic = [\p{Ideographic}]; + +-# Rule 3: Treat a grapheme cluster as if it were a single character. +-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +-# because we don't need to find the boundaries between adjacent syllables - +-# they won't be word boundaries. +-# + ++# Dictionary character set, for triggering language-based break engines. Currently ++# limited to LineBreak=Complex_Context. Note that this set only works in Unicode ++# 5.0 or later as the definition of Complex_Context was corrected to include all ++# characters requiring dictionary break. + +-# +-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. 
+-# +-$ALetterEx = $ALetter $Extend*; +-$NumericEx = $Numeric $Extend*; +-$MidNumEx = $MidNum $Extend*; +-$MidLetterEx = $MidLetter $Extend*; +-$SufixLetterEx= $SufixLetter $Extend*; +-$KatakanaEx = $Katakana $Extend*; +-$IdeographicEx= $Ideographic $Extend*; +-$HangulEx = $Hangul $Extend*; +-$FormatEx = $Format $Extend*; ++$Control = [\p{Grapheme_Cluster_Break = Control}]; ++$HangulSyllable = [\uac00-\ud7a3]; ++$ComplexContext = [:LineBreak = Complex_Context:]; ++$KanaKanji = [$Han $Hiragana $Katakana]; ++$dictionaryCJK = [$KanaKanji $HangulSyllable]; ++$dictionary = [$ComplexContext $dictionaryCJK]; + ++# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +-# +-# Numbers. Rules 8, 11, 12 form the TR. +-# +-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +-$NumberSequence {100}; ++# leave CJK scripts out of ALetterPlus ++$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + +-# +-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +-# - must include at least one letter. +-# - may include both letters and numbers. +-# - may include MideLetter, MidNumber punctuation. +-# +-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; + +-[[:P:][:S:]]*; ++## ------------------------------------------------- + ++# Rule 3 - CR x LF + # +-# Do not break between Katakana. Rule #13. +-# +-$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +-[:Hiragana:] $Extend* {300}; ++$CR $LF; + ++# Rule 3c Do not break within emoji zwj sequences. ++# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. + # +-# Ideographic Characters. Stand by themselves as words. +-# Separated from the "Everything Else" rule, below, only so that they +-# can be tagged with a return value. TODO: is this what we want? 
+-# +-$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +-$HangulEx ($FormatEx* $HangulEx)* {400}; ++$ZWJ $Extended_Pict; + ++# Rule 3d - Keep horizontal whitespace together. + # +-# Everything Else, with no tag. +-# Non-Control chars combine with $Extend (combining) chars. +-# Controls are do not. +-# +-[^$Control [:Ideographic:]] $Extend*; +-$CR $LF; ++$WSegSpace $WSegSpace; ++ ++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning ++# of a region of Text. ++ ++$ExFm = [$Extend $Format $ZWJ]; ++ ++^$ExFm+; # This rule fires only when there are format or extend characters at the ++ # start of text, or immediately following another boundary. It groups them, in ++ # the event there are more than one. ++ ++[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, ++ # with no special rule status value. ++ ++$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but ++$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. ++$HangulSyllable {200}; ++$Hebrew_Letter $ExFm* {200}; ++$Katakana $ExFm* {400}; # note: these status values override those from rule 5 ++$Hiragana $ExFm* {400}; # by virtue of being numerically larger. ++$Ideographic $ExFm* {400}; # + + # +-# Reverse Rules. Back up over any of the chars that can group together. +-# (Reverse rules do not need to be exact; they can back up too far, +-# but must back up at least enough, and must stop on a boundary.) ++# rule 5 ++# Do not break between most letters. 
+ # ++($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); ++ ++# rule 6 and 7 ++($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; ++ ++# rule 7a ++$Hebrew_Letter $ExFm* $Single_Quote {200}; ++ ++# rule 7b and 7c ++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; ++ ++# rule 8 ++ ++$Numeric $ExFm* $Numeric; ++ ++# rule 9 ++ ++($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; ++ ++# rule 10 ++ ++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); ++ ++# rule 11 and 12 ++ ++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; ++ ++# rule 13 ++# to be consistent with $KanaKanji $KanaKanhi, changed ++# from 300 to 400. ++# See also TestRuleStatus in intltest/rbbiapts.cpp ++$Katakana $ExFm* $Katakana {400}; ++ ++# rule 13a/b ++ ++$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) ++$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) ++$Numeric $ExFm* $ExtendNumLet {100}; # (13a) ++$Katakana $ExFm* $ExtendNumLet {400}; # (13a) ++$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) ++ ++$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) ++$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) ++$ExtendNumLet $ExFm* $Numeric {100}; # (13b) ++$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +-# NonStarters are the set of all characters that can appear at the 2nd - nth position of +-# a word. (They may also be the first.) The reverse rule skips over these, until it +-# reaches something that can only be the start (and probably only) char in a "word". +-# A space or punctuation meets the test. ++# rules 15 - 17 ++# Pairs of Regional Indicators stay together. ++# With incoming rule chaining disabled by ^, this rule will match exactly two of them. ++# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. 
+ # +-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; ++^$Regional_Indicator $ExFm* $Regional_Indicator; + +-#!.*; +-! ($NonStarters* | \n \r) .; ++# special handling for CJK characters: chain for later dictionary segmentation ++$HangulSyllable $HangulSyllable {200}; ++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + ++# Rule 999 ++# Match a single code point if no other rule applies. ++.; +diff --git a/i18npool/source/breakiterator/data/dict_word_he.txt b/i18npool/source/breakiterator/data/dict_word_he.txt +deleted file mode 100644 +index 40197d92a431..000000000000 +--- a/i18npool/source/breakiterator/data/dict_word_he.txt ++++ /dev/null +@@ -1,139 +0,0 @@ +-# +-# Copyright (C) 2002-2003, International Business Machines Corporation and others. +-# All Rights Reserved. +-# +-# file: dict_word.txt +-# +-# ICU Word Break Rules +-# See Unicode Standard Annex #29. +-# These rules are based on Version 4.0.0, dated 2003-04-17 +-# +- +- +- +-#################################################################################### +-# +-# Character class definitions from TR 29 +-# +-#################################################################################### +-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +- +- +-$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] +- - $Katakana +- - [:Script = Thai:] +- - [:Script = Lao:] +- - [:Script = Hiragana:]]; +- +-$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] +- [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = 
LEFT SINGLE QUOTATION MARK:] +- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]]; +- +-$SufixLetter = [:name= FULL STOP:]; +- +-$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] +- [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] +- [:name = PRIME:]]; +-$Numeric = [:LineBreak = Numeric:]; +- +- +-$TheZWSP = \u200b; +- +-# +-# Character Class Definitions. +-# The names are those from TR29. +-# +-$CR = \u000d; +-$LF = \u000a; +-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +-$Extend = [[:Grapheme_Extend = TRUE:]]; +- +- +- +- +-#################################################################################### +-# +-# Word Break Rules. Definitions and Rules specific to word break begin Here. +-# +-#################################################################################### +- +-$Format = [[:Cf:] - $TheZWSP]; +- +- +- +-# Rule 3: Treat a grapheme cluster as if it were a single character. +-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +-# because we don't need to find the boundaries between adjacent syllables - +-# they won't be word boundaries. +-# +- +- +-# +-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +-# +-$ALetterEx = $ALetter $Extend*; +-$NumericEx = $Numeric $Extend*; +-$MidNumEx = $MidNum $Extend*; +-$MidLetterEx = $MidLetter $Extend*; +-$SufixLetterEx= $SufixLetter $Extend*; +-$KatakanaEx = $Katakana $Extend*; +-$FormatEx = $Format $Extend*; +- +- +-# +-# Numbers. Rules 8, 11, 12 form the TR. +-# +-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +-$NumberSequence {100}; +- +-# +-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +-# - must include at least one letter. +-# - may include both letters and numbers. 
+-# - may include MideLetter, MidNumber punctuation. +-# +-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; +- +-[[:P:][:S:]]*; +- +-# +-# Do not break between Katakana. Rule #13. +-# +-$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +-[:Hiragana:] $Extend* {300}; +- +-# +-# Ideographic Characters. Stand by themselves as words. +-# Separated from the "Everything Else" rule, below, only so that they +-# can be tagged with a return value. TODO: is this what we want? +-# +-# [:IDEOGRAPHIC:] $Extend* {400}; +- +-# +-# Everything Else, with no tag. +-# Non-Control chars combine with $Extend (combining) chars. +-# Controls are do not. +-# +-[^$Control [:Ideographic:]] $Extend*; +-$CR $LF; +- +-# +-# Reverse Rules. Back up over any of the chars that can group together. +-# (Reverse rules do not need to be exact; they can back up too far, +-# but must back up at least enough, and must stop on a boundary.) +-# +- +-# NonStarters are the set of all characters that can appear at the 2nd - nth position of +-# a word. (They may also be the first.) The reverse rule skips over these, until it +-# reaches something that can only be the start (and probably only) char in a "word". +-# A space or punctuation meets the test. +-# +-$NonStarters = [$Numeric $ALetter $Katakana [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +- +-#!.*; +-! ($NonStarters* | \n \r) .; +- +diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt +index b0a0276b36a8..88648e6e5716 100644 +--- a/i18npool/source/breakiterator/data/dict_word_hu.txt ++++ b/i18npool/source/breakiterator/data/dict_word_hu.txt +@@ -1,176 +1,222 @@ + # +-# Copyright (C) 2002-2003, International Business Machines Corporation and others. +-# All Rights Reserved. 
++# Copyright (C) 2016 and later: Unicode, Inc. and others. ++# License & terms of use: http://www.unicode.org/copyright.html ++# Copyright (C) 2002-2016, International Business Machines Corporation ++# and others. All Rights Reserved. + # +-# file: dict_word.txt ++# file: word.txt + # +-# ICU Word Break Rules ++# ICU Word Break Rules + # See Unicode Standard Annex #29. +-# These rules are based on Version 4.0.0, dated 2003-04-17 ++# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 + # ++# Note: Updates to word.txt will usually need to be merged into ++# word_POSIX.txt also. + +- +- +-#################################################################################### ++############################################################################## + # + # Character class definitions from TR 29 + # +-#################################################################################### +-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +- +-$Ideographic = [:Ideographic:]; +-$Hangul = [:Script = HANGUL:]; +- +- +-# Fix spelling of a)-ban, b)-ben, when the letter is a reference +-# resulting bad word breaking "ban" and "ben" +-# (reference fields are not expanded in spell checking, yet, only +-# for grammar checking). 
+- +-$PrefixLetter = [[:name = RIGHT PARENTHESIS:]]; +- +-$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] +- [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] +- [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] +- [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] +- [:name = DIGIT ZERO:] +- [:name = DIGIT ONE:] +- [:name = DIGIT TWO:] +- [:name = DIGIT THREE:] +- [:name = DIGIT FOUR:] +- [:name = DIGIT FIVE:] +- [:name = DIGIT SIX:] +- [:name = DIGIT SEVEN:] +- [:name = DIGIT EIGHT:] +- [:name = DIGIT NINE:] +- - $Ideographic +- - $Katakana +- - $Hangul +- - [:Script = Thai:] +- - [:Script = Lao:] +- - [:Script = Hiragana:]]; +- +-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] +- [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] +- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:] +- [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] +- [:name = EN DASH:] [:name = EM DASH:] +- [:name = RIGHT DOUBLE QUOTATION MARK:] +- [:name = LEFT PARENTHESIS:] +- [:name = RIGHT PARENTHESIS:] +- [:name = RIGHT SQUARE BRACKET:] +- [:name = EXCLAMATION MARK:] +- [:name = QUESTION MARK:] +- [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; +- +-$SufixLetter = [:name= FULL STOP:]; +- +-$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] +- [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] +- [:name = PRIME:]]; +-$Numeric = [:LineBreak = Numeric:]; +- +- +-$TheZWSP = \u200b; 
++############################################################################## ++ ++### BEGIN CUSTOMIZATION ++### This file contains LibreOffice-specific rule customizations. ++### ++### To aid future maintainability: ++### - The change location should be bracketed by comments of this form. ++### - The original rule should be commented out, and the modified rule placed alongside. ++### - By doing this, maintainers can more easily compare to an upstream baseline. ++### ++### END CUSTOMIZATION ++ ++!!chain; ++!!quoted_literals_only; ++ + + # + # Character Class Definitions. +-# The names are those from TR29. + # +-$CR = \u000d; +-$LF = \u000a; +-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +-$Extend = [[:Grapheme_Extend = TRUE:]]; +- +- + ++$Han = [:Han:]; ++ ++$CR = [\p{Word_Break = CR}]; ++$LF = [\p{Word_Break = LF}]; ++$Newline = [\p{Word_Break = Newline}]; ++$Extend = [\p{Word_Break = Extend}-$Han]; ++$ZWJ = [\p{Word_Break = ZWJ}]; ++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; ++$Format = [\p{Word_Break = Format}]; ++$Katakana = [\p{Word_Break = Katakana}]; ++$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; ++$Single_Quote = [\p{Word_Break = Single_Quote}]; ++$Double_Quote = [\p{Word_Break = Double_Quote}]; ++$MidNumLet = [\p{Word_Break = MidNumLet}]; ++$MidNum = [\p{Word_Break = MidNum}]; ++$Numeric = [\p{Word_Break = Numeric}]; ++$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; ++$WSegSpace = [\p{Word_Break = WSegSpace}]; ++$Extended_Pict = [\p{Extended_Pictographic}]; ++ ++### BEGIN CUSTOMIZATION ++### Unknown issue number: Dictionary words can contain hyphens ++### tdf#49885: Sync custom BreakIterator rules with ICU originals ++### - ICU is now more permissive about punctuation inside words. ++### - For compatibility, exclude certain characters that were previously excluded. 
++### tdf#116072: Extend MidLetter in Hungarian word breaking ++### i#56347: BreakIterator patch for Hungarian ++### i#56348: Special chars in first pos not handled by spell checking for Hungarian ++ ++$Symbols_hu = [[:name = PERCENT SIGN:] ++ [:name = PER MILLE SIGN:] ++ [:name = PER TEN THOUSAND SIGN:] ++ [:name = SECTION SIGN:] ++ [:name = DEGREE SIGN:] ++ [:name = EURO SIGN:] ++ [:name = HYPHEN-MINUS:] ++ [:name = EN DASH:] ++ [:name = EM DASH:]]; ++ ++#$ALetter = [\p{Word_Break = ALetter}]; ++$ALetter = [\p{Word_Break = ALetter} $Symbols_hu]; ++ ++$IncludedML = [:name = HYPHEN-MINUS:]; ++$ExcludedML = [[:name = COLON:] ++ [:name = GREEK ANO TELEIA:] ++ [:name = PRESENTATION FORM FOR VERTICAL COLON:] ++ [:name = SMALL COLON:] ++ [:name = FULLWIDTH COLON:]]; ++ ++$IncludedML_hu = [[:name = RIGHT DOUBLE QUOTATION MARK:] ++ [:name = LEFT PARENTHESIS:] ++ [:name = RIGHT PARENTHESIS:] ++ [:name = RIGHT SQUARE BRACKET:] ++ [:name = EXCLAMATION MARK:] ++ [:name = QUESTION MARK:] ++ $Symbols_hu]; ++ ++# $MidLetter = [\p{Word_Break = MidLetter}]; ++$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu]; ++ ++### END CUSTOMIZATION ++ ++$Hiragana = [:Hiragana:]; ++$Ideographic = [\p{Ideographic}]; ++ ++ ++# Dictionary character set, for triggering language-based break engines. Currently ++# limited to LineBreak=Complex_Context. Note that this set only works in Unicode ++# 5.0 or later as the definition of Complex_Context was corrected to include all ++# characters requiring dictionary break. 
++ ++$Control = [\p{Grapheme_Cluster_Break = Control}]; ++$HangulSyllable = [\uac00-\ud7a3]; ++$ComplexContext = [:LineBreak = Complex_Context:]; ++$KanaKanji = [$Han $Hiragana $Katakana]; ++$dictionaryCJK = [$KanaKanji $HangulSyllable]; ++$dictionary = [$ComplexContext $dictionaryCJK]; ++ ++# TODO: check if handling of katakana in dictionary makes rules incorrect/void ++ ++# leave CJK scripts out of ALetterPlus ++$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; ++ ++ ++## ------------------------------------------------- ++ ++# Rule 3 - CR x LF ++# ++$CR $LF; + +-#################################################################################### ++# Rule 3c Do not break within emoji zwj sequences. ++# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. + # +-# Word Break Rules. Definitions and Rules specific to word break begin Here. ++$ZWJ $Extended_Pict; ++ ++# Rule 3d - Keep horizontal whitespace together. + # +-#################################################################################### ++$WSegSpace $WSegSpace; + +-$Format = [[:Cf:] - $TheZWSP]; ++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning ++# of a region of Text. + ++$ExFm = [$Extend $Format $ZWJ]; + ++^$ExFm+; # This rule fires only when there are format or extend characters at the ++ # start of text, or immediately following another boundary. It groups them, in ++ # the event there are more than one. + +-# Rule 3: Treat a grapheme cluster as if it were a single character. +-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +-# because we don't need to find the boundaries between adjacent syllables - +-# they won't be word boundaries. +-# ++[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, ++ # with no special rule status value. 
+ ++$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but ++$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. ++$HangulSyllable {200}; ++$Hebrew_Letter $ExFm* {200}; ++$Katakana $ExFm* {400}; # note: these status values override those from rule 5 ++$Hiragana $ExFm* {400}; # by virtue of being numerically larger. ++$Ideographic $ExFm* {400}; # + + # +-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. ++# rule 5 ++# Do not break between most letters. + # +-$ALetterEx = $ALetter $Extend*; +-$NumericEx = $Numeric $Extend*; +-$MidNumEx = $MidNum $Extend*; +-$MidLetterEx = $MidLetter $Extend*; +-$SufixLetterEx= $SufixLetter $Extend*; +-$KatakanaEx = $Katakana $Extend*; +-$IdeographicEx= $Ideographic $Extend*; +-$HangulEx = $Hangul $Extend*; +-$FormatEx = $Format $Extend*; ++($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + ++# rule 6 and 7 ++($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +-# +-# Numbers. Rules 8, 11, 12 form the TR. +-# +-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +-$NumberSequence {100}; ++# rule 7a ++$Hebrew_Letter $ExFm* $Single_Quote {200}; + +-# +-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +-# - must include at least one letter. +-# - may include both letters and numbers. +-# - may include MideLetter, MidNumber punctuation. +-# +-$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; ++# rule 7b and 7c ++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +-[[:P:][:S:]]*; ++# rule 8 + +-# +-# Do not break between Katakana. Rule #13. 
+-# +-$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +-[:Hiragana:] $Extend* {300}; ++$Numeric $ExFm* $Numeric; + +-# +-# Ideographic Characters. Stand by themselves as words. +-# Separated from the "Everything Else" rule, below, only so that they +-# can be tagged with a return value. TODO: is this what we want? +-# +-$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +-$HangulEx ($FormatEx* $HangulEx)* {400}; ++# rule 9 + +-# +-# Everything Else, with no tag. +-# Non-Control chars combine with $Extend (combining) chars. +-# Controls are do not. +-# +-[^$Control [:Ideographic:]] $Extend*; +-$CR $LF; ++($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +-# +-# Reverse Rules. Back up over any of the chars that can group together. +-# (Reverse rules do not need to be exact; they can back up too far, +-# but must back up at least enough, and must stop on a boundary.) +-# ++# rule 10 ++ ++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); ++ ++# rule 11 and 12 ++ ++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; ++ ++# rule 13 ++# to be consistent with $KanaKanji $KanaKanhi, changed ++# from 300 to 400. ++# See also TestRuleStatus in intltest/rbbiapts.cpp ++$Katakana $ExFm* $Katakana {400}; ++ ++# rule 13a/b ++ ++$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) ++$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) ++$Numeric $ExFm* $ExtendNumLet {100}; # (13a) ++$Katakana $ExFm* $ExtendNumLet {400}; # (13a) ++$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) ++ ++$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) ++$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) ++$ExtendNumLet $ExFm* $Numeric {100}; # (13b) ++$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +-# NonStarters are the set of all characters that can appear at the 2nd - nth position of +-# a word. (They may also be the first.) The reverse rule skips over these, until it +-# reaches something that can only be the start (and probably only) char in a "word". 
+-# A space or punctuation meets the test. ++# rules 15 - 17 ++# Pairs of Regional Indicators stay together. ++# With incoming rule chaining disabled by ^, this rule will match exactly two of them. ++# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. + # +-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; ++^$Regional_Indicator $ExFm* $Regional_Indicator; + +-#!.*; +-! ($NonStarters* | \n \r) .; ++# special handling for CJK characters: chain for later dictionary segmentation ++$HangulSyllable $HangulSyllable {200}; ++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + ++# Rule 999 ++# Match a single code point if no other rule applies. ++.; +diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt b/i18npool/source/breakiterator/data/dict_word_nodash.txt +deleted file mode 100644 +index 279cc50e5b66..000000000000 +--- a/i18npool/source/breakiterator/data/dict_word_nodash.txt ++++ /dev/null +@@ -1,147 +0,0 @@ +-# +-# Copyright (C) 2002-2003, International Business Machines Corporation and others. +-# All Rights Reserved. +-# +-# file: dict_word.txt +-# +-# ICU Word Break Rules +-# See Unicode Standard Annex #29. 
+-# These rules are based on Version 4.0.0, dated 2003-04-17 +-# +- +- +- +-#################################################################################### +-# +-# Character class definitions from TR 29 +-# +-#################################################################################### +-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +- +-$Ideographic = [:Ideographic:]; +-$Hangul = [:Script = HANGUL:]; +- +-$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] +- - $Ideographic +- - $Katakana +- - $Hangul +- - [:Script = Thai:] +- - [:Script = Lao:] +- - [:Script = Hiragana:]]; +- +-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] +- [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] +- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ]; +- +-$SufixLetter = [:name= FULL STOP:]; +- +- +-$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] +- [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] +- [:name = PRIME:]]; +-$Numeric = [:LineBreak = Numeric:]; +- +- +-$TheZWSP = \u200b; +- +-# +-# Character Class Definitions. +-# The names are those from TR29. +-# +-$CR = \u000d; +-$LF = \u000a; +-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +-$Extend = [[:Grapheme_Extend = TRUE:]]; +- +- +- +- +-#################################################################################### +-# +-# Word Break Rules. Definitions and Rules specific to word break begin Here. 
+-# +-#################################################################################### +- +-$Format = [[:Cf:] - $TheZWSP]; +- +- +- +-# Rule 3: Treat a grapheme cluster as if it were a single character. +-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +-# because we don't need to find the boundaries between adjacent syllables - +-# they won't be word boundaries. +-# +- +- +-# +-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +-# +-$ALetterEx = $ALetter $Extend*; +-$NumericEx = $Numeric $Extend*; +-$MidNumEx = $MidNum $Extend*; +-$MidLetterEx = $MidLetter $Extend*; +-$SufixLetterEx= $SufixLetter $Extend*; +-$KatakanaEx = $Katakana $Extend*; +-$IdeographicEx= $Ideographic $Extend*; +-$HangulEx = $Hangul $Extend*; +-$FormatEx = $Format $Extend*; +- +- +-# +-# Numbers. Rules 8, 11, 12 form the TR. +-# +-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +-$NumberSequence {100}; +- +-# +-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +-# - must include at least one letter. +-# - may include both letters and numbers. +-# - may include MideLetter, MidNumber punctuation. +-# +-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; +- +-[[:P:][:S:]]*; +- +-# +-# Do not break between Katakana. Rule #13. +-# +-$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +-[:Hiragana:] $Extend* {300}; +- +-# +-# Ideographic Characters. Stand by themselves as words. +-# Separated from the "Everything Else" rule, below, only so that they +-# can be tagged with a return value. TODO: is this what we want? +-# +-$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +-$HangulEx ($FormatEx* $HangulEx)* {400}; +- +-# +-# Everything Else, with no tag. +-# Non-Control chars combine with $Extend (combining) chars. +-# Controls are do not. 
+-# +-[^$Control [:Ideographic:]] $Extend*; +-$CR $LF; +- +-# +-# Reverse Rules. Back up over any of the chars that can group together. +-# (Reverse rules do not need to be exact; they can back up too far, +-# but must back up at least enough, and must stop on a boundary.) +-# +- +-# NonStarters are the set of all characters that can appear at the 2nd - nth position of +-# a word. (They may also be the first.) The reverse rule skips over these, until it +-# reaches something that can only be the start (and probably only) char in a "word". +-# A space or punctuation meets the test. +-# +-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +- +-#!.*; +-! ($NonStarters* | \n \r) .; +- +diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt +index fb29b478af21..b39503d1b405 100644 +--- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt ++++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt +@@ -1,157 +1,221 @@ + # +-# Copyright (C) 2002-2003, International Business Machines Corporation and others. +-# All Rights Reserved. ++# Copyright (C) 2016 and later: Unicode, Inc. and others. ++# License & terms of use: http://www.unicode.org/copyright.html ++# Copyright (C) 2002-2016, International Business Machines Corporation ++# and others. All Rights Reserved. + # +-# file: dict_word.txt ++# file: word.txt + # +-# ICU Word Break Rules ++# ICU Word Break Rules + # See Unicode Standard Annex #29. +-# These rules are based on Version 4.0.0, dated 2003-04-17 ++# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 + # ++# Note: Updates to word.txt will usually need to be merged into ++# word_POSIX.txt also. 
+ +- +- +-#################################################################################### ++############################################################################## + # + # Character class definitions from TR 29 + # +-#################################################################################### +-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; ++############################################################################## + +-$Ideographic = [:Ideographic:]; +-$Hangul = [:Script = HANGUL:]; ++### BEGIN CUSTOMIZATION ++### This file contains LibreOffice-specific rule customizations. ++### ++### To aid future maintainability: ++### - The change location should be bracketed by comments of this form. ++### - The original rule should be commented out, and the modified rule placed alongside. ++### - By doing this, maintainers can more easily compare to an upstream baseline. ++### ++### END CUSTOMIZATION + +-# list of dashes or hyphens that should be accepted as part of the word if a single one of these +-# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to +-# be part of the word in order to have it properly spell checked etc. 
+-$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] ]; ++!!chain; ++!!quoted_literals_only; + + +-$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] +- - $Ideographic +- - $Katakana +- - $Hangul +- - [:Script = Thai:] +- - [:Script = Lao:] +- - [:Script = Hiragana:]]; +- +-$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] +- [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] +- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] +- [:name = HYPHEN-MINUS:] ]; ++# ++# Character Class Definitions. ++# + +-$SufixLetter = [:name= FULL STOP:]; +- ++$Han = [:Han:]; + +-$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] +- [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] +- [:name = PRIME:]]; +-$Numeric = [:LineBreak = Numeric:]; ++$CR = [\p{Word_Break = CR}]; ++$LF = [\p{Word_Break = LF}]; ++$Newline = [\p{Word_Break = Newline}]; ++$Extend = [\p{Word_Break = Extend}-$Han]; ++$ZWJ = [\p{Word_Break = ZWJ}]; ++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; ++$Format = [\p{Word_Break = Format}]; ++$Katakana = [\p{Word_Break = Katakana}]; ++$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; ++$ALetter = [\p{Word_Break = ALetter}]; ++$Single_Quote = [\p{Word_Break = Single_Quote}]; ++$Double_Quote = [\p{Word_Break = Double_Quote}]; ++$MidNumLet = [\p{Word_Break = MidNumLet}]; ++$MidNum = [\p{Word_Break = MidNum}]; ++$Numeric = [\p{Word_Break = Numeric}]; ++$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; ++$WSegSpace = [\p{Word_Break = WSegSpace}]; ++$Extended_Pict = [\p{Extended_Pictographic}]; + ++### BEGIN CUSTOMIZATION ++### Unknown issue number: Dictionary words can contain 
hyphens ++### tdf#49885: Sync custom BreakIterator rules with ICU originals ++### - ICU is now more permissive about punctuation inside words. ++### - For compatibility, exclude certain characters that were previously excluded. + +-$TheZWSP = \u200b; ++$IncludedML = [:name = HYPHEN-MINUS:]; ++$ExcludedML = [[:name = COLON:] ++ [:name = GREEK ANO TELEIA:] ++ [:name = PRESENTATION FORM FOR VERTICAL COLON:] ++ [:name = SMALL COLON:] ++ [:name = FULLWIDTH COLON:]]; + +-# +-# Character Class Definitions. +-# The names are those from TR29. +-# +-$CR = \u000d; +-$LF = \u000a; +-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +-$Extend = [[:Grapheme_Extend = TRUE:]]; ++# $MidLetter = [\p{Word_Break = MidLetter}]; ++$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; + ++### END CUSTOMIZATION + ++### BEGIN CUSTOMIZATION ++### Unknown issue number: Allow leading and trailing hyphens in certain languages ++### This part of the customization does not replace any rules. + ++$PrePostHyphen = [:name = HYPHEN-MINUS:]; + +-#################################################################################### +-# +-# Word Break Rules. Definitions and Rules specific to word break begin Here. +-# +-#################################################################################### ++### END CUSTOMIZATION + +-$Format = [[:Cf:] - $TheZWSP]; ++$Hiragana = [:Hiragana:]; ++$Ideographic = [\p{Ideographic}]; + + ++# Dictionary character set, for triggering language-based break engines. Currently ++# limited to LineBreak=Complex_Context. Note that this set only works in Unicode ++# 5.0 or later as the definition of Complex_Context was corrected to include all ++# characters requiring dictionary break. + +-# Rule 3: Treat a grapheme cluster as if it were a single character. +-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +-# because we don't need to find the boundaries between adjacent syllables - +-# they won't be word boundaries. 
+-# ++$Control = [\p{Grapheme_Cluster_Break = Control}]; ++$HangulSyllable = [\uac00-\ud7a3]; ++$ComplexContext = [:LineBreak = Complex_Context:]; ++$KanaKanji = [$Han $Hiragana $Katakana]; ++$dictionaryCJK = [$KanaKanji $HangulSyllable]; ++$dictionary = [$ComplexContext $dictionaryCJK]; + ++# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +-# +-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +-# +-$ALetterEx = $ALetter $Extend*; +-$NumericEx = $Numeric $Extend*; +-$MidNumEx = $MidNum $Extend*; +-$MidLetterEx = $MidLetter $Extend*; +-$SufixLetterEx= $SufixLetter $Extend*; +-$KatakanaEx = $Katakana $Extend*; +-$IdeographicEx= $Ideographic $Extend*; +-$HangulEx = $Hangul $Extend*; +-$FormatEx = $Format $Extend*; ++# leave CJK scripts out of ALetterPlus ++$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + + ++## ------------------------------------------------- ++ ++# Rule 3 - CR x LF + # +-# Numbers. Rules 8, 11, 12 form the TR. +-# +-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +-$NumberSequence {100}; ++$CR $LF; + ++# Rule 3c Do not break within emoji zwj sequences. ++# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. + # +-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +-# - must include at least one letter. +-# - may include both letters and numbers. +-# - may include MideLetter, MidNumber punctuation. ++$ZWJ $Extended_Pict; ++ ++# Rule 3d - Keep horizontal whitespace together. + # +-# At most one leading or trailing dash/hyphen should be accepted as well. +-# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to +-# be part of the word in order to have it properly spell checked etc. +-$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)* $PrePostDashHyphen?; # rules #6, #7 +-($NumberSequence $FormatEx*)? 
$LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; ++$WSegSpace $WSegSpace; + +-[[:P:][:S:]]*; ++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning ++# of a region of Text. + +-# +-# Do not break between Katakana. Rule #13. +-# +-$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +-[:Hiragana:] $Extend* {300}; ++$ExFm = [$Extend $Format $ZWJ]; + +-# +-# Ideographic Characters. Stand by themselves as words. +-# Separated from the "Everything Else" rule, below, only so that they +-# can be tagged with a return value. TODO: is this what we want? +-# +-$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +-$HangulEx ($FormatEx* $HangulEx)* {400}; ++^$ExFm+; # This rule fires only when there are format or extend characters at the ++ # start of text, or immediately following another boundary. It groups them, in ++ # the event there are more than one. + +-# +-# Everything Else, with no tag. +-# Non-Control chars combine with $Extend (combining) chars. +-# Controls are do not. +-# +-[^$Control [:Ideographic:]] $Extend*; +-$CR $LF; ++[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, ++ # with no special rule status value. ++ ++$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but ++$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. ++$HangulSyllable {200}; ++$Hebrew_Letter $ExFm* {200}; ++$Katakana $ExFm* {400}; # note: these status values override those from rule 5 ++$Hiragana $ExFm* {400}; # by virtue of being numerically larger. ++$Ideographic $ExFm* {400}; # + + # +-# Reverse Rules. Back up over any of the chars that can group together. +-# (Reverse rules do not need to be exact; they can back up too far, +-# but must back up at least enough, and must stop on a boundary.) ++# rule 5 ++# Do not break between most letters. 
+ # + +-# NonStarters are the set of all characters that can appear at the 2nd - nth position of +-# a word. (They may also be the first.) The reverse rule skips over these, until it +-# reaches something that can only be the start (and probably only) char in a "word". +-# A space or punctuation meets the test. ++### BEGIN CUSTOMIZATION ++### Unknown issue number: Allow leading and trailing hyphens in certain languages ++ ++# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); ++($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)?; ++ ++### END CUSTOMIZATION ++ ++# rule 6 and 7 ++ ++### BEGIN CUSTOMIZATION ++### Unknown issue number: Allow leading and trailing hyphens in certain languages ++ ++# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; ++($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)? {200}; ++ ++### END CUSTOMIZATION ++ ++# rule 7a ++$Hebrew_Letter $ExFm* $Single_Quote {200}; ++ ++# rule 7b and 7c ++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; ++ ++# rule 8 ++ ++$Numeric $ExFm* $Numeric; ++ ++# rule 9 ++ ++($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; ++ ++# rule 10 ++ ++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); ++ ++# rule 11 and 12 ++ ++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; ++ ++# rule 13 ++# to be consistent with $KanaKanji $KanaKanhi, changed ++# from 300 to 400. 
++# See also TestRuleStatus in intltest/rbbiapts.cpp ++$Katakana $ExFm* $Katakana {400}; ++ ++# rule 13a/b ++ ++$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) ++$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) ++$Numeric $ExFm* $ExtendNumLet {100}; # (13a) ++$Katakana $ExFm* $ExtendNumLet {400}; # (13a) ++$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) ++ ++$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) ++$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) ++$ExtendNumLet $ExFm* $Numeric {100}; # (13b) ++$ExtendNumLet $ExFm* $Katakana {400}; # (13b) ++ ++# rules 15 - 17 ++# Pairs of Regional Indicators stay together. ++# With incoming rule chaining disabled by ^, this rule will match exactly two of them. ++# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. + # +-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; ++^$Regional_Indicator $ExFm* $Regional_Indicator; + +-#!.*; +-! ($NonStarters* | \n \r) .; ++# special handling for CJK characters: chain for later dictionary segmentation ++$HangulSyllable $HangulSyllable {200}; ++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + ++# Rule 999 ++# Match a single code point if no other rule applies. ++.; +diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt +index 92b344c19d41..14fc221aa96e 100644 +--- a/i18npool/source/breakiterator/data/edit_word.txt ++++ b/i18npool/source/breakiterator/data/edit_word.txt +@@ -1,142 +1,199 @@ + # +-# Copyright (C) 2002-2003, International Business Machines Corporation and others. +-# All Rights Reserved. ++# Copyright (C) 2016 and later: Unicode, Inc. and others. ++# License & terms of use: http://www.unicode.org/copyright.html ++# Copyright (C) 2002-2016, International Business Machines Corporation ++# and others. All Rights Reserved. 
+ # +-# file: edit_word.txt ++# file: word.txt + # +-# ICU Word Break Rules ++# ICU Word Break Rules + # See Unicode Standard Annex #29. +-# These rules are based on Version 4.0.0, dated 2003-04-17 ++# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 + # ++# Note: Updates to word.txt will usually need to be merged into ++# word_POSIX.txt also. + +- +- +-#################################################################################### ++############################################################################## + # + # Character class definitions from TR 29 + # +-#################################################################################### +-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +- +-$Ideographic = [:Ideographic:]; +-$Hangul = [:Script = HANGUL:]; +- +-$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] +- - $Ideographic +- - $Katakana +- - $Hangul +- - [:Script = Thai:] +- - [:Script = Lao:] +- - [:Script = Hiragana:]]; +- +-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] +- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; +- +-$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; +-$Numeric = [:LineBreak = Numeric:]; +- +- +-$TheZWSP = \u200b; ++############################################################################## ++ ++### BEGIN CUSTOMIZATION ++### This file contains LibreOffice-specific rule customizations. ++### ++### To aid future maintainability: ++### - The change location should be bracketed by comments of this form. ++### - The original rule should be commented out, and the modified rule placed alongside. 
++### - By doing this, maintainers can more easily compare to an upstream baseline. ++### ++### END CUSTOMIZATION ++ ++!!chain; ++!!quoted_literals_only; ++ + + # + # Character Class Definitions. +-# The names are those from TR29. + # +-$CR = \u000d; +-$LF = \u000a; +-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +-$Extend = [[:Grapheme_Extend = TRUE:]]; + ++$Han = [:Han:]; + ++$CR = [\p{Word_Break = CR}]; ++$LF = [\p{Word_Break = LF}]; ++$Newline = [\p{Word_Break = Newline}]; ++$Extend = [\p{Word_Break = Extend}-$Han]; ++$ZWJ = [\p{Word_Break = ZWJ}]; ++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; ++$Format = [\p{Word_Break = Format}]; ++$Katakana = [\p{Word_Break = Katakana}]; ++$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; ++$ALetter = [\p{Word_Break = ALetter}]; ++$Single_Quote = [\p{Word_Break = Single_Quote}]; ++$Double_Quote = [\p{Word_Break = Double_Quote}]; ++$MidLetter = [\p{Word_Break = MidLetter}]; ++$MidNum = [\p{Word_Break = MidNum}]; ++$Numeric = [\p{Word_Break = Numeric}]; ++$WSegSpace = [\p{Word_Break = WSegSpace}]; ++$Extended_Pict = [\p{Extended_Pictographic}]; + ++### BEGIN CUSTOMIZATION ++### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. ++### This change subtracts undesired characters from the above families + +-#################################################################################### +-# +-# Word Break Rules. Definitions and Rules specific to word break begin Here. 
+-# +-#################################################################################### ++# $MidNumLet = [\p{Word_Break = MidNumLet}]; ++$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; + +-$Format = [[:Cf:] - $TheZWSP]; ++# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; ++$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; + ++### END CUSTOMIZATION + ++$Hiragana = [:Hiragana:]; ++$Ideographic = [\p{Ideographic}]; + +-# Rule 3: Treat a grapheme cluster as if it were a single character. +-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +-# because we don't need to find the boundaries between adjacent syllables - +-# they won't be word boundaries. +-# + ++# Dictionary character set, for triggering language-based break engines. Currently ++# limited to LineBreak=Complex_Context. Note that this set only works in Unicode ++# 5.0 or later as the definition of Complex_Context was corrected to include all ++# characters requiring dictionary break. + +-# +-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +-# +-$ALetterEx = $ALetter $Extend*; +-$NumericEx = $Numeric $Extend*; +-$MidNumEx = $MidNum $Extend*; +-$MidLetterEx = $MidLetter $Extend*; +-$KatakanaEx = $Katakana $Extend*; +-$IdeographicEx= $Ideographic $Extend*; +-$HangulEx = $Hangul $Extend*; +-$FormatEx = $Format $Extend*; ++$Control = [\p{Grapheme_Cluster_Break = Control}]; ++$HangulSyllable = [\uac00-\ud7a3]; ++$ComplexContext = [:LineBreak = Complex_Context:]; ++$KanaKanji = [$Han $Hiragana $Katakana]; ++$dictionaryCJK = [$KanaKanji $HangulSyllable]; ++$dictionary = [$ComplexContext $dictionaryCJK]; + ++# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +-# +-# Numbers. Rules 8, 11, 12 form the TR. +-# +-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? 
$FormatEx* $NumericEx)*; +-$NumberSequence {100}; ++# leave CJK scripts out of ALetterPlus ++$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + +-# +-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +-# - must include at least one letter. +-# - may include both letters and numbers. +-# - may include MideLetter, MidNumber punctuation. +-# +-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; + +-# Punctuations by themselves +-[[:P:][:S:]-[:name = FULL STOP:]]*; +-[[:name = FULL STOP:]]*; ++## ------------------------------------------------- + ++# Rule 3 - CR x LF + # +-# Do not break between Katakana. Rule #13. +-# +-$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +-[:Hiragana:] $Extend* {300}; ++$CR $LF; + ++# Rule 3c Do not break within emoji zwj sequences. ++# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. + # +-# Ideographic Characters. Stand by themselves as words. +-# Separated from the "Everything Else" rule, below, only so that they +-# can be tagged with a return value. TODO: is this what we want? +-# +-$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +-$HangulEx ($FormatEx* $HangulEx)* {400}; ++$ZWJ $Extended_Pict; + ++# Rule 3d - Keep horizontal whitespace together. + # +-# Everything Else, with no tag. +-# Non-Control chars combine with $Extend (combining) chars. +-# Controls are do not. +-# +-[^$Control [:Ideographic:]] $Extend*; +-$CR $LF; ++$WSegSpace $WSegSpace; ++ ++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning ++# of a region of Text. ++ ++$ExFm = [$Extend $Format $ZWJ]; ++ ++^$ExFm+; # This rule fires only when there are format or extend characters at the ++ # start of text, or immediately following another boundary. It groups them, in ++ # the event there are more than one. 
++ ++[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, ++ # with no special rule status value. ++ ++$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but ++$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. ++$HangulSyllable {200}; ++$Hebrew_Letter $ExFm* {200}; ++$Katakana $ExFm* {400}; # note: these status values override those from rule 5 ++$Hiragana $ExFm* {400}; # by virtue of being numerically larger. ++$Ideographic $ExFm* {400}; # + + # +-# Reverse Rules. Back up over any of the chars that can group together. +-# (Reverse rules do not need to be exact; they can back up too far, +-# but must back up at least enough, and must stop on a boundary.) ++# rule 5 ++# Do not break between most letters. + # ++($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); ++ ++# rule 6 and 7 ++($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; ++ ++# rule 7a ++$Hebrew_Letter $ExFm* $Single_Quote {200}; ++ ++# rule 7b and 7c ++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; ++ ++# rule 8 ++ ++$Numeric $ExFm* $Numeric; ++ ++# rule 9 ++ ++($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +-# NonStarters are the set of all characters that can appear at the 2nd - nth position of +-# a word. (They may also be the first.) The reverse rule skips over these, until it +-# reaches something that can only be the start (and probably only) char in a "word". +-# A space or punctuation meets the test. ++# rule 10 ++ ++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); ++ ++# rule 11 and 12 ++ ++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; ++ ++# rule 13 ++# to be consistent with $KanaKanji $KanaKanhi, changed ++# from 300 to 400. 
++# See also TestRuleStatus in intltest/rbbiapts.cpp ++$Katakana $ExFm* $Katakana {400}; ++ ++# rule 13a/b ++ ++$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) ++$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) ++$Numeric $ExFm* $ExtendNumLet {100}; # (13a) ++$Katakana $ExFm* $ExtendNumLet {400}; # (13a) ++$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) ++ ++$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) ++$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) ++$ExtendNumLet $ExFm* $Numeric {100}; # (13b) ++$ExtendNumLet $ExFm* $Katakana {400}; # (13b) ++ ++# rules 15 - 17 ++# Pairs of Regional Indicators stay together. ++# With incoming rule chaining disabled by ^, this rule will match exactly two of them. ++# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. + # +-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; ++^$Regional_Indicator $ExFm* $Regional_Indicator; + +-#!.*; +-! ($NonStarters* | \n \r) .; ++# special handling for CJK characters: chain for later dictionary segmentation ++$HangulSyllable $HangulSyllable {200}; ++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found ++ ++### BEGIN CUSTOMIZATION ++### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. ++### This customization does not replace any rules. ++[[:P:][:S:]-[:name = FULL STOP:]]* ++[[:name = FULL STOP:]]*; ++### END CUSTOMIZATION + ++# Rule 999 ++# Match a single code point if no other rule applies. ++.; +diff --git a/i18npool/source/breakiterator/data/edit_word_he.txt b/i18npool/source/breakiterator/data/edit_word_he.txt +deleted file mode 100644 +index 0b5908814e08..000000000000 +--- a/i18npool/source/breakiterator/data/edit_word_he.txt ++++ /dev/null +@@ -1,142 +0,0 @@ +-# +-# Copyright (C) 2002-2003, International Business Machines Corporation and others. +-# All Rights Reserved. 
+-# +-# file: edit_word.txt +-# +-# ICU Word Break Rules +-# See Unicode Standard Annex #29. +-# These rules are based on Version 4.0.0, dated 2003-04-17 +-# +- +- +- +-#################################################################################### +-# +-# Character class definitions from TR 29 +-# +-#################################################################################### +-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +- +-$Ideographic = [:Ideographic:]; +-$Hangul = [:Script = HANGUL:]; +- +-$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] +- - $Ideographic +- - $Katakana +- - $Hangul +- - [:Script = Thai:] +- - [:Script = Lao:] +- - [:Script = Hiragana:]]; +- +-$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] +- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; +- +-$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; +-$Numeric = [:LineBreak = Numeric:]; +- +- +-$TheZWSP = \u200b; +- +-# +-# Character Class Definitions. +-# The names are those from TR29. +-# +-$CR = \u000d; +-$LF = \u000a; +-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +-$Extend = [[:Grapheme_Extend = TRUE:]]; +- +- +- +- +-#################################################################################### +-# +-# Word Break Rules. Definitions and Rules specific to word break begin Here. +-# +-#################################################################################### +- +-$Format = [[:Cf:] - $TheZWSP]; +- +- +- +-# Rule 3: Treat a grapheme cluster as if it were a single character. 
+-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +-# because we don't need to find the boundaries between adjacent syllables - +-# they won't be word boundaries. +-# +- +- +-# +-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +-# +-$ALetterEx = $ALetter $Extend*; +-$NumericEx = $Numeric $Extend*; +-$MidNumEx = $MidNum $Extend*; +-$MidLetterEx = $MidLetter $Extend*; +-$KatakanaEx = $Katakana $Extend*; +-$IdeographicEx= $Ideographic $Extend*; +-$HangulEx = $Hangul $Extend*; +-$FormatEx = $Format $Extend*; +- +- +-# +-# Numbers. Rules 8, 11, 12 form the TR. +-# +-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +-$NumberSequence {100}; +- +-# +-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +-# - must include at least one letter. +-# - may include both letters and numbers. +-# - may include MideLetter, MidNumber punctuation. +-# +-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; +- +-# Punctuations by themselves +-[[:P:][:S:]-[:name = FULL STOP:]]*; +-[[:name = FULL STOP:]]*; +- +-# +-# Do not break between Katakana. Rule #13. +-# +-$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +-[:Hiragana:] $Extend* {300}; +- +-# +-# Ideographic Characters. Stand by themselves as words. +-# Separated from the "Everything Else" rule, below, only so that they +-# can be tagged with a return value. TODO: is this what we want? +-# +-$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +-$HangulEx ($FormatEx* $HangulEx)* {400}; +- +-# +-# Everything Else, with no tag. +-# Non-Control chars combine with $Extend (combining) chars. +-# Controls are do not. +-# +-[^$Control [:Ideographic:]] $Extend*; +-$CR $LF; +- +-# +-# Reverse Rules. Back up over any of the chars that can group together. 
+-# (Reverse rules do not need to be exact; they can back up too far, +-# but must back up at least enough, and must stop on a boundary.) +-# +- +-# NonStarters are the set of all characters that can appear at the 2nd - nth position of +-# a word. (They may also be the first.) The reverse rule skips over these, until it +-# reaches something that can only be the start (and probably only) char in a "word". +-# A space or punctuation meets the test. +-# +-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; +- +-#!.*; +-! ($NonStarters* | \n \r) .; +- +diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt b/i18npool/source/breakiterator/data/edit_word_hu.txt +index 4a08acab0029..389ad2bacc13 100644 +--- a/i18npool/source/breakiterator/data/edit_word_hu.txt ++++ b/i18npool/source/breakiterator/data/edit_word_hu.txt +@@ -1,159 +1,215 @@ + # +-# Copyright (C) 2002-2003, International Business Machines Corporation and others. +-# All Rights Reserved. ++# Copyright (C) 2016 and later: Unicode, Inc. and others. ++# License & terms of use: http://www.unicode.org/copyright.html ++# Copyright (C) 2002-2016, International Business Machines Corporation ++# and others. All Rights Reserved. + # +-# file: edit_word.txt ++# file: word.txt + # +-# ICU Word Break Rules ++# ICU Word Break Rules + # See Unicode Standard Annex #29. +-# These rules are based on Version 4.0.0, dated 2003-04-17 ++# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 + # ++# Note: Updates to word.txt will usually need to be merged into ++# word_POSIX.txt also. 
+ +- +- +-#################################################################################### ++############################################################################## + # + # Character class definitions from TR 29 + # +-#################################################################################### +-$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] +- [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +- +-$Ideographic = [:Ideographic:]; +-$Hangul = [:Script = HANGUL:]; +- +-$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] +- [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] +- [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] +- [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] +- [:name = DIGIT ZERO:] +- [:name = DIGIT ONE:] +- [:name = DIGIT TWO:] +- [:name = DIGIT THREE:] +- [:name = DIGIT FOUR:] +- [:name = DIGIT FIVE:] +- [:name = DIGIT SIX:] +- [:name = DIGIT SEVEN:] +- [:name = DIGIT EIGHT:] +- [:name = DIGIT NINE:] +- - $Ideographic +- - $Katakana +- - $Hangul +- - [:Script = Thai:] +- - [:Script = Lao:] +- - [:Script = Hiragana:]]; +- +-$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] +- [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] +- [:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT SIGN:] +- [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] +- [:name = EN DASH:] [:name = EM DASH:] +- [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; +- +-$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; +-$Numeric = [:LineBreak = Numeric:]; +- +- +-$TheZWSP = \u200b; ++############################################################################## ++ ++### BEGIN CUSTOMIZATION 
++### This file contains LibreOffice-specific rule customizations. ++### ++### To aid future maintainability: ++### - The change location should be bracketed by comments of this form. ++### - The original rule should be commented out, and the modified rule placed alongside. ++### - By doing this, maintainers can more easily compare to an upstream baseline. ++### ++### END CUSTOMIZATION ++ ++!!chain; ++!!quoted_literals_only; ++ + + # + # Character Class Definitions. +-# The names are those from TR29. + # +-$CR = \u000d; +-$LF = \u000a; +-$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +-$Extend = [[:Grapheme_Extend = TRUE:]]; + ++$Han = [:Han:]; + ++$CR = [\p{Word_Break = CR}]; ++$LF = [\p{Word_Break = LF}]; ++$Newline = [\p{Word_Break = Newline}]; ++$Extend = [\p{Word_Break = Extend}-$Han]; ++$ZWJ = [\p{Word_Break = ZWJ}]; ++$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; ++$Format = [\p{Word_Break = Format}]; ++$Katakana = [\p{Word_Break = Katakana}]; ++$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; ++$Single_Quote = [\p{Word_Break = Single_Quote}]; ++$Double_Quote = [\p{Word_Break = Double_Quote}]; ++$MidNum = [\p{Word_Break = MidNum}]; ++$Numeric = [\p{Word_Break = Numeric}]; ++$WSegSpace = [\p{Word_Break = WSegSpace}]; ++$Extended_Pict = [\p{Extended_Pictographic}]; + ++### BEGIN CUSTOMIZATION ++### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. ++### This change subtracts undesired characters from the above families ++### i#56347: BreakIterator patch for Hungarian ++### i#56348: Special chars in first pos not handled by spell checking for Hungarian + +-#################################################################################### +-# +-# Word Break Rules. Definitions and Rules specific to word break begin Here. 
+-# +-#################################################################################### ++$Symbols_hu = [[:name = PERCENT SIGN:] ++ [:name = PER MILLE SIGN:] ++ [:name = PER TEN THOUSAND SIGN:] ++ [:name = SECTION SIGN:] ++ [:name = DEGREE SIGN:] ++ [:name = EURO SIGN:] ++ [:name = HYPHEN-MINUS:] ++ [:name = EN DASH:] ++ [:name = EM DASH:]]; + +-$Format = [[:Cf:] - $TheZWSP]; ++# $ALetter = [\p{Word_Break = ALetter}]; ++$ALetter = [\p{Word_Break = ALetter} $Symbols_hu]; + ++# $MidLetter = [\p{Word_Break = MidLetter}]; ++$MidLetter = [\p{Word_Break = MidLetter} $Symbols_hu]; + ++# $MidNumLet = [\p{Word_Break = MidNumLet}]; ++$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; + +-# Rule 3: Treat a grapheme cluster as if it were a single character. +-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +-# because we don't need to find the boundaries between adjacent syllables - +-# they won't be word boundaries. +-# ++# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; ++$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; + ++### END CUSTOMIZATION + +-# +-# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +-# +-$ALetterEx = $ALetter $Extend*; +-$NumericEx = $Numeric $Extend*; +-$MidNumEx = $MidNum $Extend*; +-$MidLetterEx = $MidLetter $Extend*; +-$KatakanaEx = $Katakana $Extend*; +-$IdeographicEx= $Ideographic $Extend*; +-$HangulEx = $Hangul $Extend*; +-$FormatEx = $Format $Extend*; ++$Hiragana = [:Hiragana:]; ++$Ideographic = [\p{Ideographic}]; + + +-# +-# Numbers. Rules 8, 11, 12 form the TR. +-# +-$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +-$NumberSequence {100}; ++# Dictionary character set, for triggering language-based break engines. Currently ++# limited to LineBreak=Complex_Context. 
Note that this set only works in Unicode ++# 5.0 or later as the definition of Complex_Context was corrected to include all ++# characters requiring dictionary break. + +-# +-# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +-# - must include at least one letter. +-# - may include both letters and numbers. +-# - may include MideLetter, MidNumber punctuation. +-# +-$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +-($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; ++$Control = [\p{Grapheme_Cluster_Break = Control}]; ++$HangulSyllable = [\uac00-\ud7a3]; ++$ComplexContext = [:LineBreak = Complex_Context:]; ++$KanaKanji = [$Han $Hiragana $Katakana]; ++$dictionaryCJK = [$KanaKanji $HangulSyllable]; ++$dictionary = [$ComplexContext $dictionaryCJK]; + +-# Punctuations by themselves +-[[:P:][:S:]-[:name = FULL STOP:]]*; +-[[:name = FULL STOP:]]*; ++# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +-# +-# Do not break between Katakana. Rule #13. +-# +-$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +-[:Hiragana:] $Extend* {300}; ++# leave CJK scripts out of ALetterPlus ++$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + ++ ++## ------------------------------------------------- ++ ++# Rule 3 - CR x LF + # +-# Ideographic Characters. Stand by themselves as words. +-# Separated from the "Everything Else" rule, below, only so that they +-# can be tagged with a return value. TODO: is this what we want? +-# +-$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +-$HangulEx ($FormatEx* $HangulEx)* {400}; ++$CR $LF; + ++# Rule 3c Do not break within emoji zwj sequences. ++# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. + # +-# Everything Else, with no tag. +-# Non-Control chars combine with $Extend (combining) chars. +-# Controls are do not. 
++$ZWJ $Extended_Pict; ++ ++# Rule 3d - Keep horizontal whitespace together. + # +-[^$Control [:Ideographic:]] $Extend*; +-$CR $LF; ++$WSegSpace $WSegSpace; ++ ++# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning ++# of a region of Text. ++ ++$ExFm = [$Extend $Format $ZWJ]; ++ ++^$ExFm+; # This rule fires only when there are format or extend characters at the ++ # start of text, or immediately following another boundary. It groups them, in ++ # the event there are more than one. ++ ++[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, ++ # with no special rule status value. ++ ++$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but ++$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. ++$HangulSyllable {200}; ++$Hebrew_Letter $ExFm* {200}; ++$Katakana $ExFm* {400}; # note: these status values override those from rule 5 ++$Hiragana $ExFm* {400}; # by virtue of being numerically larger. ++$Ideographic $ExFm* {400}; # + + # +-# Reverse Rules. Back up over any of the chars that can group together. +-# (Reverse rules do not need to be exact; they can back up too far, +-# but must back up at least enough, and must stop on a boundary.) ++# rule 5 ++# Do not break between most letters. + # ++($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); ++ ++# rule 6 and 7 ++($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; ++ ++# rule 7a ++$Hebrew_Letter $ExFm* $Single_Quote {200}; ++ ++# rule 7b and 7c ++$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; ++ ++# rule 8 ++ ++$Numeric $ExFm* $Numeric; ++ ++# rule 9 + +-# NonStarters are the set of all characters that can appear at the 2nd - nth position of +-# a word. (They may also be the first.) 
The reverse rule skips over these, until it +-# reaches something that can only be the start (and probably only) char in a "word". +-# A space or punctuation meets the test. ++($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; ++ ++# rule 10 ++ ++$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); ++ ++# rule 11 and 12 ++ ++$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; ++ ++# rule 13 ++# to be consistent with $KanaKanji $KanaKanji, changed ++# from 300 to 400. ++# See also TestRuleStatus in intltest/rbbiapts.cpp ++$Katakana $ExFm* $Katakana {400}; ++ ++# rule 13a/b ++ ++$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) ++$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) ++$Numeric $ExFm* $ExtendNumLet {100}; # (13a) ++$Katakana $ExFm* $ExtendNumLet {400}; # (13a) ++$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) ++ ++$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) ++$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) ++$ExtendNumLet $ExFm* $Numeric {100}; # (13b) ++$ExtendNumLet $ExFm* $Katakana {400}; # (13b) ++ ++# rules 15 - 17 ++# Pairs of Regional Indicators stay together. ++# With incoming rule chaining disabled by ^, this rule will match exactly two of them. ++# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. + # +-$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; ++^$Regional_Indicator $ExFm* $Regional_Indicator; + +-#!.*; +-! ($NonStarters* | \n \r) .; ++# special handling for CJK characters: chain for later dictionary segmentation ++$HangulSyllable $HangulSyllable {200}; ++$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found ++ ++### BEGIN CUSTOMIZATION ++### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. ++### This customization does not replace any rules. 
++[[:P:][:S:]-[:name = FULL STOP:]]* ++[[:name = FULL STOP:]]*; ++### END CUSTOMIZATION + ++# Rule 999 ++# Match a single code point if no other rule applies. ++.; +diff --git a/i18npool/source/breakiterator/data/line.txt b/i18npool/source/breakiterator/data/line.txt +index ff3f3eafc42e..46a618c63cae 100644 +--- a/i18npool/source/breakiterator/data/line.txt ++++ b/i18npool/source/breakiterator/data/line.txt +@@ -1,176 +1,116 @@ +-# Copyright (c) 2002-2006 International Business Machines Corporation and ++# Copyright (C) 2016 and later: Unicode, Inc. and others. ++# License & terms of use: http://www.unicode.org/copyright.html ++# Copyright (c) 2002-2016 International Business Machines Corporation and + # others. All Rights Reserved. + # + # file: line.txt + # + # Line Breaking Rules +-# Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0 +-# http://www.unicode.org/reports/tr14/ +- +- ++# Implement default line breaking as defined by ++# Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/) ++# for Unicode 14.0, with the following modification: ++# ++# Boundaries between hyphens and following letters are suppressed when ++# there is a boundary preceding the hyphen. See rule 20.9 ++# ++# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict). ++# It sets characters of class CJ to behave like NS. + + # + # Character Classes defined by TR 14. + # + +-!!chain; +-!!LBCMNoChain; ++### BEGIN CUSTOMIZATION ++### This file contains LibreOffice-specific rule customizations. ++### ++### To aid future maintainability: ++### - The change location should be bracketed by comments of this form. ++### - The original rule should be commented out, and the modified rule placed alongside. ++### - By doing this, maintainers can more easily compare to an upstream baseline. 
++### ++### END CUSTOMIZATION + +- +-!!lookAheadHardBreak; +-# +-# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere +-# and only used for the line break rules. +-# +-# It is used in the implementation of the incredibly annoying rule LB 10 +-# which says to treat any combining mark that is not attached to a base +-# character as if it were of class AL (alphabetic). +-# +-# The problem occurs in the reverse rules. +-# +-# Consider a sequence like, with correct breaks as shown +-# LF ID CM AL AL +-# ^ ^ ^ +-# Then consider the sequence without the initial ID (ideographic) +-# LF CM AL AL +-# ^ ^ +-# Our CM, which in the first example was attached to the ideograph, +-# is now unattached, becomes an alpha, and joins in with the other +-# alphas. +-# +-# When iterating forwards, these sequences do not present any problems +-# When iterating backwards, we need to look ahead when encountering +-# a CM to see whether it attaches to something further on or not. +-# (Look-ahead in a reverse rule is looking towards the start) +-# +-# If the CM is unattached, we need to force a break. +-# +-# !!lookAheadHardBreak forces the run time state machine to +-# stop immediately when a look ahead rule ( '/' operator) matches, +-# and set the match position to that of the look-ahead operator, +-# no matter what other rules may be in play at the time. +-# +-# See rule LB 19 for an example. +-# ++!!chain; ++!!quoted_literals_only; + + $AI = [:LineBreak = Ambiguous:]; +-$DG = \u00B0; +-$AL = [[:LineBreak = Alphabetic:] $DG]; ++$AL = [:LineBreak = Alphabetic:]; + $BA = [:LineBreak = Break_After:]; ++$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. 
+ $BB = [:LineBreak = Break_Before:]; + $BK = [:LineBreak = Mandatory_Break:]; + $B2 = [:LineBreak = Break_Both:]; + $CB = [:LineBreak = Contingent_Break:]; + $CJ = [:LineBreak = Conditional_Japanese_Starter:]; +-$CL = [[:LineBreak = Close_Punctuation:] [:LineBreak = Close_Parenthesis:]]; # tdf#31271 +-$CM = [:LineBreak = Combining_Mark:]; ++$CL = [:LineBreak = Close_Punctuation:]; ++# $CM = [:LineBreak = Combining_Mark:]; ++$CP = [:LineBreak = Close_Parenthesis:]; + $CR = [:LineBreak = Carriage_Return:]; ++$EB = [:LineBreak = EB:]; ++$EM = [:LineBreak = EM:]; + $EX = [:LineBreak = Exclamation:]; + $GL = [:LineBreak = Glue:]; + $HL = [:LineBreak = Hebrew_Letter:]; + $HY = [:LineBreak = Hyphen:]; + $H2 = [:LineBreak = H2:]; + $H3 = [:LineBreak = H3:]; +-$ID = [[:LineBreak = Ideographic:] - [\ufe30]]; +-$IN = [:LineBreak = Inseparable:]; +-$IS = [[:LineBreak = Infix_Numeric:] [\ufe30]]; ++$ID = [:LineBreak = Ideographic:]; ++$IN = [:LineBreak = Inseperable:]; ++$IS = [:LineBreak = Infix_Numeric:]; + $JL = [:LineBreak = JL:]; + $JV = [:LineBreak = JV:]; + $JT = [:LineBreak = JT:]; + $LF = [:LineBreak = Line_Feed:]; + $NL = [:LineBreak = Next_Line:]; ++# NS includes CJ for CSS strict line breaking. 
+ $NS = [[:LineBreak = Nonstarter:] $CJ]; + $NU = [:LineBreak = Numeric:]; +-$OP = [[:LineBreak = Open_Punctuation:] - $DG]; ++$OP = [:LineBreak = Open_Punctuation:]; + $PO = [:LineBreak = Postfix_Numeric:]; +-$BS = \u005C; +-$PR = [[:LineBreak = Prefix_Numeric:] - $BS]; ++$PR = [:LineBreak = Prefix_Numeric:]; + $QU = [:LineBreak = Quotation:]; ++$RI = [:LineBreak = Regional_Indicator:]; + $SA = [:LineBreak = Complex_Context:]; + $SG = [:LineBreak = Surrogate:]; + $SP = [:LineBreak = Space:]; +-$SY = [[:LineBreak = Break_Symbols:] $BS]; ++$SY = [:LineBreak = Break_Symbols:]; + $WJ = [:LineBreak = Word_Joiner:]; + $XX = [:LineBreak = Unknown:]; + $ZW = [:LineBreak = ZWSpace:]; ++$ZWJ = [:LineBreak = ZWJ:]; ++ ++# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14, ++# without a formal name. Because ICU rules require multiple uses of the expressions, ++# give them a single definition with a name ++ ++$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; ++$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; ++ ++$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}]; ++ ++# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly ++# list it in the numerous rules that use CM. ++# By LB1, SA characters with general category of Mn or Mc also resolve to CM. ++ ++$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]]; ++$CMX = [[$CM] - [$ZWJ]]; + + # Dictionary character set, for triggering language-based break engines. Currently +-# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +-# 5.0 or later as the definition of Complex_Context was corrected to include all +-# characters requiring dictionary break. ++# limited to LineBreak=Complex_Context (SA). + +-$dictionary = [:LineBreak = Complex_Context:]; ++$dictionary = [$SA]; + + # + # Rule LB1. 
By default, treat AI (characters with ambiguous east Asian width), +-# SA (South East Asian: Thai, Lao, Khmer) ++# SA (Dictionary chars, excluding Mn and Mc) + # SG (Unpaired Surrogates) + # XX (Unknown, unassigned) + # as $AL (Alphabetic) + # +-$ALPlus = [$AL $AI $SA $SG $XX]; +- +-# +-# Combining Marks. X $CM* behaves as if it were X. Rule LB6. +-# +-$ALcm = $ALPlus $CM*; +-$BAcm = $BA $CM*; +-$BBcm = $BB $CM*; +-$B2cm = $B2 $CM*; +-$CLcm = $CL $CM*; +-$EXcm = $EX $CM*; +-$GLcm = $GL $CM*; +-$HLcm = $HL $CM*; +-$HYcm = $HY $CM*; +-$H2cm = $H2 $CM*; +-$H3cm = $H3 $CM*; +-$IDcm = $ID $CM*; +-$INcm = $IN $CM*; +-$IScm = $IS $CM*; +-$JLcm = $JL $CM*; +-$JVcm = $JV $CM*; +-$JTcm = $JT $CM*; +-$NScm = $NS $CM*; +-$NUcm = $NU $CM*; +-$OPcm = $OP $CM*; +-$POcm = $PO $CM*; +-$PRcm = $PR $CM*; +-$QUcm = $QU $CM*; +-$SYcm = $SY $CM*; +-$WJcm = $WJ $CM*; ++$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]]; + +-## ------------------------------------------------- + +-!!forward; +- +-# +-# Each class of character can stand by itself as an unbroken token, with trailing combining stuff +-# +-$ALPlus $CM+; +-$BA $CM+; +-$BB $CM+; +-$B2 $CM+; +-$CL $CM+; +-$EX $CM+; +-$GL $CM+; +-$HL $CM+; +-$HY $CM+; +-$H2 $CM+; +-$H3 $CM+; +-$ID $CM+; +-$IN $CM+; +-$IS $CM+; +-$JL $CM+; +-$JV $CM+; +-$JT $CM+; +-$NS $CM+; +-$NU $CM+; +-$OP $CM+; +-$PO $CM+; +-$PR $CM+; +-$QU $CM+; +-$SY $CM+; +-$WJ $CM+; ++## ------------------------------------------------- + + # + # CAN_CM is the set of characters that may combine with CM combining chars. +@@ -186,19 +126,15 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs + # + # AL_FOLLOW set of chars that can unconditionally follow an AL + # Needed in rules where stand-alone $CM s are treated as AL. +-# Chaining is disabled with CM because it causes other failures, +-# so for this one case we need to manually list out longer sequences. 
+ # +-$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP]; +-$AL_FOLLOW_CM = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP]; +-$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; ++$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; + + + # + # Rule LB 4, 5 Mandatory (Hard) breaks. + # + $LB4Breaks = [$BK $CR $LF $NL]; +-$LB4NonBreaks = [^$BK $CR $LF $NL]; ++$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; + $CR $LF {100}; + + # +@@ -206,91 +142,124 @@ $CR $LF {100}; + # + $LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks. + $CAN_CM $CM* $LB4Breaks {100}; +-$CM+ $LB4Breaks {100}; ++^$CM+ $LB4Breaks {100}; + + # LB 7 x SP + # x ZW + $LB4NonBreaks [$SP $ZW]; + $CAN_CM $CM* [$SP $ZW]; +-$CM+ [$SP $ZW]; ++^$CM+ [$SP $ZW]; + + # + # LB 8 Break after zero width space ++# ZW SP* ÷ + # + $LB8Breaks = [$LB4Breaks $ZW]; + $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; ++$ZW $SP* / [^$SP $ZW $LB4Breaks]; + ++# LB 8a ZWJ x Do not break Emoji ZWJ sequences. ++# ++$ZWJ [^$CM]; + +-# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +-# $CM not covered by the above needs to behave like $AL ++# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL ++# $CM not covered by the above needs to behave like $AL + # See definition of $CAN_CM. + + $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. +-$CM+; ++^$CM+; + + # + # LB 11 Do not break before or after WORD JOINER & related characters. + # +-$CAN_CM $CM* $WJcm; +-$LB8NonBreaks $WJcm; +-$CM+ $WJcm; ++$CAN_CM $CM* $WJ; ++$LB8NonBreaks $WJ; ++^$CM+ $WJ; + +-$WJcm [^$CAN_CM]; +-$WJcm $CAN_CM $CM*; ++$WJ $CM* .; + + # +-# LB 12 Do not break before or after NBSP and related characters. ++# LB 12 Do not break after NBSP and related characters. 
++# GL x + # +-# (!SP) x GL +-[$LB8NonBreaks-$SP] $CM* $GLcm; +-$CM+ $GLcm; ++$GL $CM* .; + +-# GL x +-$GLcm ($LB8Breaks | $SP); +-$GLcm [$LB8NonBreaks-$SP] $CM*; # Don't let a combining mark go onto $CR, $BK, etc. +- # TODO: I don't think we need this rule. +- # All but $CM will chain off of preceding rule. +- # $GLcm will pick up the CM case by itself. ++# ++# LB 12a Do not break before NBSP and related characters ... ++# [^SP BA HY] x GL ++# ++[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL; ++^$CM+ $GL; + + + + +-# +-# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. ++# LB 13 Don't break before ']' or '!' or '/', even after spaces. + # + $LB8NonBreaks $CL; + $CAN_CM $CM* $CL; +-$CM+ $CL; # by rule 10, stand-alone CM behaves as AL ++^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL ++ ++$LB8NonBreaks $CP; ++$CAN_CM $CM* $CP; ++^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL + + $LB8NonBreaks $EX; + $CAN_CM $CM* $EX; +-$CM+ $EX; # by rule 10, stand-alone CM behaves as AL +- +-$LB8NonBreaks $IS; +-$CAN_CM $CM* $IS; +-$CM+ $IS; # by rule 10, stand-alone CM behaves as AL ++^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL + + $LB8NonBreaks $SY; + $CAN_CM $CM* $SY; +-$CM+ $SY; # by rule 10, stand-alone CM behaves as AL ++^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL + + + # +-# LB 14 Do not break after OP, even after spaced ++# LB 14 Do not break after OP, even after spaces ++# Note subtle interaction with "SP IS /" rules in LB14a. ++# This rule consumes the SP, chaining happens on the IS, effectively overriding the SP IS rules, ++# which is the desired behavior. ++# ++$OP $CM* $SP* .; ++ ++$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL ++ # by rule 8, CM following a SP is stand-alone. ++ ++ ++# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" ++# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. 
++# See issue ICU-20303 ++ ++ ++$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; ++$SP $IS / [^ $CanFollowIS $NU $CM]; ++$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; ++ + # +-$OPcm $SP* $CAN_CM $CM*; +-$OPcm $SP* $CANT_CM; ++# LB 14b Do not break before numeric separators (IS), even after spaces. ++ ++[$LB8NonBreaks - $SP] $IS; ++$SP $IS $CM* [$CanFollowIS {eof}]; ++$SP $IS $CM* $ZWJ [^$CM $NU]; ++ ++$CAN_CM $CM* $IS; ++^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL + +-$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL + + # LB 15 +-# $QUcm $SP* $OPcm; ++ ++### BEGIN CUSTOMIZATION ++### i#83649: Allow line break between quote and opening punctuation. ++### This customization simply disables rule LB 15. ++### ++# $QU $CM* $SP* $OP; ++### ++### END CUSTOMIZATION + + # LB 16 +-$CLcm $SP* $NScm; ++($CL | $CP) $CM* $SP* $NS; + + # LB 17 +-$B2cm $SP* $B2cm; ++$B2 $CM* $SP* $B2; + + # + # LB 18 Break after spaces. +@@ -301,347 +270,134 @@ $LB18Breaks = [$LB8Breaks $SP]; + + # LB 19 + # x QU +-$LB18NonBreaks $CM* $QUcm; +-$CM+ $QUcm; ++$LB18NonBreaks $CM* $QU; ++^$CM+ $QU; + + # QU x +-$QUcm .?; +-$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. +- # TODO: I don't think this rule is needed. +- ++$QU $CM* .; + + # LB 20 + # $CB + # $CB +- ++# + $LB20NonBreaks = [$LB18NonBreaks - $CB]; + ++# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. ++# Originally added as a Finnish tailoring, now promoted to default ICU behavior. ++# Note: this is not default UAX-14 behaviour. See issue ICU-8151. 
++# ++^($HY | $HH) $CM* $ALPlus; ++ + # LB 21 x (BA | HY | NS) + # BB x + # +-$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); ++$LB20NonBreaks $CM* ($BA | $HY | $NS); + +-$BBcm [^$CB]; # $BB x +-$BBcm $LB20NonBreaks $CM*; + +-# LB 21a Don't break after Hebrew + Hyphen +-# HL (HY | BA) x +-# +-$HLcm ($HYcm | $BAcm) [^$CB]?; ++^$CM+ ($BA | $HY | $NS); + +-# LB 22 +-($ALcm | $HLcm) $INcm; +-$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL +-$IDcm $INcm; +-$INcm $INcm; +-$NUcm $INcm; ++$BB $CM* [^$CB]; # $BB x ++$BB $CM* $LB20NonBreaks; + +- +-# $LB 23 +-$IDcm $POcm; +-$ALcm $NUcm; # includes $LB19 +-$HLcm $NUcm; +-$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL +-$NUcm $ALcm; +-$NUcm $HLcm; +- +-# +-# LB 24 +-# +-$PRcm $IDcm; +-$ALcm $PRcm; +-$PRcm ($ALcm | $HLcm); +-$POcm ($ALcm | $HLcm); +- +-# +-# LB 25 Numbers. +-# +-($PRcm | $POcm)? ($OPcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? ($PRcm | $POcm)?; +- +-# LB 26 Do not break a Korean syllable ++# LB 21a Don't break after Hebrew + Hyphen ++# HL (HY | BA) x + # +-$JLcm ($JLcm | $JVcm | $H2cm | $H3cm); +-($JVcm | $H2cm) ($JVcm | $JTcm); +-($JTcm | $H3cm) $JTcm; +- +-# LB 27 Treat korean Syllable Block the same as ID (don't break it) +-($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm; +-($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm; +-$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); ++$HL $CM* ($HY | $BA) $CM* [^$CB]?; + ++# LB 21b (forward) Don't break between SY and HL ++# (break between HL and SY already disallowed by LB 13 above) ++$SY $CM* $HL; + +-# LB 28 Do not break between alphabetics ++# LB 22 Do not break before ellipses + # +-($ALcm | $HLcm) ($ALcm | $HLcm); +-$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL ++$LB20NonBreaks $CM* $IN; ++^$CM+ $IN; + +-# LB 29 +-$IScm ($ALcm | $NUcm); + ++# LB 23 + # +-# Rule 30 Do not break between letters, numbers or ordinary symbols +-# and opening or closing punctuation +-# +-($ALcm | $HLcm | $NUcm) $OPcm; 
+-$CM+ $OPcm; +-$CLcm ($ALcm | $HLcm | $NUcm); ++($ALPlus | $HL) $CM* $NU; ++^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL ++$NU $CM* ($ALPlus | $HL); + ++# LB 23a + # +-# Reverse Rules. +-# +-## ------------------------------------------------- ++$PR $CM* ($ID | $EB | $EM); ++($ID | $EB | $EM) $CM* $PO; + +-!!reverse; +- +-$CM+ $ALPlus; +-$CM+ $BA; +-$CM+ $BB; +-$CM+ $B2; +-$CM+ $CL; +-$CM+ $EX; +-$CM+ $GL; +-$CM+ $HL; +-$CM+ $HY; +-$CM+ $H2; +-$CM+ $H3; +-$CM+ $ID; +-$CM+ $IN; +-$CM+ $IS; +-$CM+ $JL; +-$CM+ $JV; +-$CM+ $JT; +-$CM+ $NS; +-$CM+ $NU; +-$CM+ $OP; +-$CM+ $PO; +-$CM+ $PR; +-$CM+ $QU; +-$CM+ $SY; +-$CM+ $WJ; +-$CM+; +- +- +-# +-# Sequences of the form (shown forwards) +-# [CANT_CM] [CM] [whatever] +-# The CM needs to behave as an AL +-# +-$AL_FOLLOW $CM+ / ( +- [$BK $CR $LF $NL $ZW {eof}] | +- $SP+ $CM+ $SP | +- $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break. +- # LB14 says OP SP* x . +- # becomes OP SP* x AL +- # becomes OP SP* x CM+ AL_FOLLOW +- # +- # Further note: the $AL in [$AL {eof}] is only to work around +- # a rule compiler bug which complains about +- # empty sets otherwise. +- +-# +-# Sequences of the form (shown forwards) +-# [CANT_CM] [CM] [PR] +-# The CM needs to behave as an AL +-# This rule is concerned about getting the second of the two in place. +-# +- +-[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}]; +- +- +- +-# LB 4, 5, 5 +- +-$LB4Breaks [$LB4NonBreaks-$CM]; +-$LB4Breaks $CM+ $CAN_CM; +-$LF $CR; +- +- +-# LB 7 x SP +-# x ZW +-[$SP $ZW] [$LB4NonBreaks-$CM]; +-[$SP $ZW] $CM+ $CAN_CM; + +-# LB 8 Break after zero width space +- +- +-# LB 9,10 Combining marks. +-# X $CM needs to behave like X, where X is not $SP or controls. +-# $CM not covered by the above needs to behave like $AL +-# Stick together any combining sequences that don't match other rules. 
+-$CM+ $CAN_CM; +- +- +-# LB 11 +-$CM* $WJ $CM* $CAN_CM; +-$CM* $WJ [$LB8NonBreaks-$CM]; +- +- $CANT_CM $CM* $WJ; +-$CM* $CAN_CM $CM* $WJ; +- +-# LB 12 +-# x GL + # +-$CM* $GL $CM* [$LB8NonBreaks-$CM-$SP]; ++# LB 24 ++# ++($PR | $PO) $CM* ($ALPlus | $HL); ++($ALPlus | $HL) $CM* ($PR | $PO); ++^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL + + # +-# GL x ++# LB 25 Numbers. + # +-$CANT_CM $CM* $GL; +-$CM* $CAN_CM $CM* $GL; ++(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))* ++ ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?; + ++### BEGIN CUSTOMIZATION ++### i#83229: Allow line break after hyphen in number range context. ++### The default ICU rules treat number ranges (e.g. 100-199) as a single token. This change forces ++### a break opportunity after the embedded '-', but only if followed by another numeral. ++### ++### This customization does not replace any existing rule. ++### Maintainers: note that this rule should consist of two instances of the LB 25 numbers rule, ++### separated by a hyphen and an explicit break. + +-# LB 13 +-$CL $CM+ $CAN_CM; +-$EX $CM+ $CAN_CM; +-$IS $CM+ $CAN_CM; +-$SY $CM+ $CAN_CM; ++((($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))* ++ ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?) ++ ($HY $CM*) / ++((($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))* ++ ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?); + +-$CL [$LB8NonBreaks-$CM]; +-$EX [$LB8NonBreaks-$CM]; +-$IS [$LB8NonBreaks-$CM]; +-$SY [$LB8NonBreaks-$CM]; ++### END CUSTOMIZATION + +-# Rule 13 & 14 taken together for an edge case. +-# Match this, shown forward +-# OP SP+ ($CM+ behaving as $AL) (CL | EX | IS | IY) +-# This really wants to chain at the $CM+ (which is acting as an $AL) +-# except for $CM chaining being disabled. +-[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP; ++### TODO ++### ((PrefixNumeric | PostfixNumeric) CombMark*) ? ((OpenPunc | Hyphen) CombMark*)? 
++### (InfixNumeric CombMark*)? Numeric (CombMark* (Numeric | BreakSym | InfixNumeric))* ++### (CombMark* (ClosePunc | CloseParen))? (CombMark* (PrefixNumeric | PostfixNumeric))? + +-# LB 14 OP SP* x ++# LB 26 Do not break a Korean syllable + # +-$CM* $CAN_CM $SP* $CM* $OP; +- $CANT_CM $SP* $CM* $OP; +-$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP +- +- $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; +-$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; +-$SY $CM $SP+ $OP; # TODO: Experiment. Remove. +- +- +- +-# LB 15 +-# $CM* $OP $SP* $CM* $QU; +- +-# LB 16 +-$CM* $NS $SP* $CM* $CL; ++$JL $CM* ($JL | $JV | $H2 | $H3); ++($JV | $H2) $CM* ($JV | $JT); ++($JT | $H3) $CM* $JT; + +-# LB 17 +-$CM* $B2 $SP* $CM* $B2; +- +-# LB 18 break after spaces +-# Nothing explicit needed here. +- +- +-# +-# LB 19 +-# +-$CM* $QU $CM* $CAN_CM; # . x QU +-$CM* $QU $LB18NonBreaks; ++# LB 27 Treat korean Syllable Block the same as ID (don't break it) ++($JL | $JV | $JT | $H2 | $H3) $CM* $PO; ++$PR $CM* ($JL | $JV | $JT | $H2 | $H3); + + +-$CM* $CAN_CM $CM* $QU; # QU x . +- $CANT_CM $CM* $QU; +- +-# +-# LB 20 Break before and after CB. +-# nothing needed here. ++# LB 28 Do not break between alphabetics + # +- +-# LB 21 +-$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) +- +-$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . +-[^$CB] $CM* $BB; # +- +-# LB21a +-[^$CB] $CM* ($HY | $BA) $CM* $HL; +- +-# LB 22 +-$CM* $IN $CM* ($ALPlus | $HL); +-$CM* $IN $CM* $ID; +-$CM* $IN $CM* $IN; +-$CM* $IN $CM* $NU; +- +-# LB 23 +-$CM* $PO $CM* $ID; +-$CM* $NU $CM* ($ALPlus | $HL); +-$CM* ($ALPlus | $HL) $CM* $NU; +- +-# LB 24 +-$CM* $ID $CM* $PR; +-$CM* $PR $CM* $ALPlus; +-$CM* ($ALPlus | $HL) $CM* $PR; +-$CM* ($ALPlus | $HL) $CM* $PO; +- +-$CM* $ALPlus $CM* ($IS | $SY | $HY)+ / $SP; +-$CM* $NU+ $CM* $HY+ / $SP; +- +-# LB 25 +-($CM* ($PR | $PO))? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP))? 
($CM* ($PR | $PO))?; +- +-# LB 26 +-$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; +-$CM* ($JT | $JV) $CM* ($H2 | $JV); +-$CM* $JT $CM* ($H3 | $JT); +- +-# LB 27 +-$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); +-$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); +-$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; +- +-# LB 28 +-$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); ++($ALPlus | $HL) $CM* ($ALPlus | $HL); ++^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL + + # LB 29 +-$CM* ($NU | $ALPlus) $CM* $IS+ [^$SP]; ++$IS $CM* ($ALPlus | $HL); + + # LB 30 +-$CM* $OP $CM* ($ALPlus | $HL | $NU); +-$CM* ($ALPlus | $HL | $NU) $CM* ($CL | $SY)+ [^$SP]; +- +- +-## ------------------------------------------------- +- +-!!safe_reverse; +- +-# LB 7 +-$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; +-$CM+ $SP / .; +- +-# LB 9 +-$SP+ $CM* $OP; +- +-# LB 10 +-$SP+ $CM* $QU; +- +-# LB 11 +-$SP+ $CM* $CL; +-$SP+ $CM* $B2; +- +-# LB 21 +-$CM* ($HY | $BA) $CM* $HL; +- +-# LB 18 +-($CM* ($IS | $SY))+ $CM* $NU; +-$CL $CM* ($NU | $IS | $SY); +- +-# For dictionary-based break +-$dictionary $dictionary; +- +-## ------------------------------------------------- +- +-!!safe_forward; +- +-# Skip forward over all character classes that are involved in +-# rules containing patterns with possibly more than one char +-# of context. +-# +-# It might be slightly more efficient to have specific rules +-# instead of one generic one, but only if we could +-# turn off rule chaining. We don't want to move more +-# than necessary. +-# +-[$CM $OP $QU $CL $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $BA $dictionary]; +-$dictionary $dictionary; +- ++($ALPlus | $HL | $NU) $CM* $OP30; ++^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL. ++$CP30 $CM* ($ALPlus | $HL | $NU); ++ ++# LB 30a Do not break between regional indicators. Break after pairs of them. ++# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. 
++$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; ++$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; ++$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; ++# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' ++# because of the chain-out behavior difference. The rule must chain out only from the [set characters], ++# not from the preceding $RI or $CM, which it would be able to do if the set were optional. ++ ++# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier. ++$EB $CM* $EM; ++$ExtPictUnassigned $CM* $EM; ++ ++# LB 31 Break everywhere else. ++# Match a single code point if no other rule applies. ++.; +diff --git a/i18npool/source/breakiterator/data/sent.txt b/i18npool/source/breakiterator/data/sent.txt +deleted file mode 100644 +index 7fada89e6278..000000000000 +--- a/i18npool/source/breakiterator/data/sent.txt ++++ /dev/null +@@ -1,128 +0,0 @@ +-# +-# Copyright (C) 2002-2006, International Business Machines Corporation and others. +-# All Rights Reserved. +-# +-# file: sent.txt +-# +-# ICU Sentence Break Rules +-# See Unicode Standard Annex #29. +-# These rules are based on SA 29 version 5.0.0 +-# Includes post 5.0 changes to treat Japanese half width voicing marks +-# as Grapheme Extend. 
+-# +- +- +-$VoiceMarks = [\uff9e\uff9f]; +-$Thai = [:Script = Thai:]; +- +-# +-# Character categories as defined in TR 29 +-# +-$Sep = [\p{Sentence_Break = Sep}]; +-$Format = [\p{Sentence_Break = Format}]; +-$Sp = [\p{Sentence_Break = Sp}]; +-$Lower = [\p{Sentence_Break = Lower}]; +-$Upper = [\p{Sentence_Break = Upper}]; +-$OLetter = [\p{Sentence_Break = OLetter}-$VoiceMarks]; +-$Numeric = [\p{Sentence_Break = Numeric}]; +-$ATerm = [\p{Sentence_Break = ATerm}]; +-$STerm = [\p{Sentence_Break = STerm}]; +-$Close = [\p{Sentence_Break = Close}]; +- +-# +-# Define extended forms of the character classes, +-# incorporate grapheme cluster + format chars. +-# Rules 4 and 5. +- +- +-$CR = \u000d; +-$LF = \u000a; +-$Extend = [[:Grapheme_Extend = TRUE:]$VoiceMarks]; +- +-$SpEx = $Sp ($Extend | $Format)*; +-$LowerEx = $Lower ($Extend | $Format)*; +-$UpperEx = $Upper ($Extend | $Format)*; +-$OLetterEx = $OLetter ($Extend | $Format)*; +-$NumericEx = $Numeric ($Extend | $Format)*; +-$ATermEx = $ATerm ($Extend | $Format)*; +-$STermEx = $STerm ($Extend | $Format)*; +-$CloseEx = $Close ($Extend | $Format)*; +- +- +-## ------------------------------------------------- +- +-!!chain; +-!!forward; +- +-# Rule 3 - break after separators. Keep CR/LF together. +-# +-$CR $LF; +- +-$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*; +-$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*; +- +-# Rule 4 - Break after $Sep. +-# Rule 5 - Ignore $Format and $Extend +-# +-[^$Sep]? ($Extend | $Format)*; +- +- +-# Rule 6 +-$ATermEx $NumericEx; +- +-# Rule 7 +-$UpperEx $ATermEx $UpperEx; +- +-#Rule 8 +-# Note: follows errata for Unicode 5.0 boundary rules. 
+-$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*; +-$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; +- +-# Rule 8a +-($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx); +- +-#Rule 9, 10, 11 +-($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?; +- +-#Rule 12 +-[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai]; +-[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100}; +- +-## ------------------------------------------------- +- +-!!reverse; +- +-$SpEx_R = ($Extend | $Format)* $Sp; +-$ATermEx_R = ($Extend | $Format)* $ATerm; +-$STermEx_R = ($Extend | $Format)* $STerm; +-$CloseEx_R = ($Extend | $Format)* $Close; +- +-# +-# Reverse rules. +-# For now, use the old style inexact reverse rules, which are easier +-# to write, but less efficient. +-# TODO: exact reverse rules. It appears that exact reverse rules +-# may require improving support for look-ahead breaks in the +-# builder. Needs more investigation. +-# +- +-[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; +-#.*; +- +-# Explanation for this rule: +-# +-# It needs to back over +-# The $Sep at which we probably begin +-# All of the non $Sep chars leading to the preceding $Sep +-# The preceding $Sep, which will be the second one that the rule matches. +-# Any immediately preceding STerm or ATerm sequences. We need to see these +-# to get the correct rule status when moving forwards again. +-# +-# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match +-# the entire string. +-# +-# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be +-# at the beginning of the string at this point, and we don't want to fail. +-# Can only use {eof} once, and it is used later. 
+-# +- +-- +2.39.2 + diff --git a/debian/patches/icu-74.1.diff b/debian/patches/icu-74.1.diff new file mode 100644 index 0000000000..4cbd69c668 --- /dev/null +++ b/debian/patches/icu-74.1.diff @@ -0,0 +1,71 @@ +From ae182240328f20508c7a8936daf74a088627540b Mon Sep 17 00:00:00 2001 +From: Taichi Haradaguchi <20001722@ymail.ne.jp> +Date: Tue, 31 Oct 2023 19:46:23 +0900 +Subject: Update to ICU 74.1 + +https://icu.unicode.org/download/74 + +Unicode 15.1 +https://blog.unicode.org/2023/09/announcing-unicode-standard-version-151.html + +CLDR 44 +https://cldr.unicode.org/index/downloads/cldr-44 + +New Unicode blocks: +UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I + +Change-Id: Ic9196e10138663d07235f5ebd9cc4bf3a9750824 +Reviewed-on: https://gerrit.libreoffice.org/c/core/+/158749 +Tested-by: Eike Rathke +Reviewed-by: Eike Rathke +--- + configure.ac | 4 ++-- + download.lst | 8 ++++---- + external/icu/icu4c-khmerbreakengine.patch.1 | 1 - + include/svx/strings.hrc | 1 + + svx/source/dialog/charmap.cxx | 5 +++++ + 5 files changed, 12 insertions(+), 7 deletions(-) + +diff --git a/external/icu/icu4c-khmerbreakengine.patch.1 b/external/icu/icu4c-khmerbreakengine.patch.1 +index 605914014e96..db8ac50e6f75 100644 +--- a/external/icu/icu4c-khmerbreakengine.patch.1 ++++ b/external/icu/icu4c-khmerbreakengine.patch.1 +@@ -796,7 +796,6 @@ diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionaryda + if (wordCount < limit) { + if (values != nullptr) { + values[wordCount] = bt.getValue(); +- + diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h + --- icu.org/source/common/dictionarydata.h 2023-06-14 06:23:55.000000000 +0900 + +++ icu/source/common/dictionarydata.h 2023-06-26 17:43:53.097724900 +0900 +diff --git a/include/svx/strings.hrc b/include/svx/strings.hrc +index 13f896f04eeb..b8e69dc3dbe1 100644 +--- a/include/svx/strings.hrc ++++ b/include/svx/strings.hrc +@@ -1790,6 +1790,7 @@ + #define RID_SUBSETSTR_KAKTOVIK_NUMERALS 
NC_("RID_SUBSETMAP", "Kaktovik Numerals") + #define RID_SUBSETSTR_KAWI NC_("RID_SUBSETMAP", "Kawi") + #define RID_SUBSETSTR_NAG_MUNDARI NC_("RID_SUBSETMAP", "Nag Mundari") ++#define RID_SUBSETSTR_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I NC_("RID_SUBSETMAP", "CJK Unified Ideographs Extension I") + + #define RID_SVXSTR_FRAMEDIR_LTR NC_("RID_SVXSTR_FRAMEDIR_LTR", "Left-to-right (LTR)") + #define RID_SVXSTR_FRAMEDIR_RTL NC_("RID_SVXSTR_FRAMEDIR_RTL", "Right-to-left (RTL)") +diff --git a/svx/source/dialog/charmap.cxx b/svx/source/dialog/charmap.cxx +index ed0c626b59c6..a73b0e263d60 100644 +--- a/svx/source/dialog/charmap.cxx ++++ b/svx/source/dialog/charmap.cxx +@@ -1923,6 +1923,11 @@ void SubsetMap::InitList() + case UBLOCK_NAG_MUNDARI: + aAllSubsets.emplace_back( 0x1E4D0, 0x1E4FF, SvxResId(RID_SUBSETSTR_NAG_MUNDARI) ); + break; ++#endif ++#if (U_ICU_VERSION_MAJOR_NUM >= 74) ++ case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I: ++ aAllSubsets.emplace_back( 0x2EBF0, 0x2EE5F, SvxResId(RID_SUBSETSTR_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I) ); ++ break; + #endif + } + +-- +cgit v1.2.3 + diff --git a/debian/patches/reviewed-breakIterator-customizations.diff b/debian/patches/reviewed-breakIterator-customizations.diff new file mode 100644 index 0000000000..80f9bd814e --- /dev/null +++ b/debian/patches/reviewed-breakIterator-customizations.diff @@ -0,0 +1,1269 @@ +From fb94cc0d1348140d03c2826771c57255ff74a94a Mon Sep 17 00:00:00 2001 +From: Jonathan Clark +Date: Thu, 11 Apr 2024 16:42:39 -0600 +Subject: [PATCH] tdf#49885 Reviewed BreakIterator customizations + +This change completes the review of BreakIterator rule customizations, +and adds unit tests for relevant customizations. 
+ +Change-Id: I06678fcccfc48d020aac64dd9f58ff36a763af30 +Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166017 +Tested-by: Jenkins +Reviewed-by: Eike Rathke +--- + i18npool/qa/cppunit/test_breakiterator.cxx | 559 +++++++++++++++++++ + i18npool/source/breakiterator/data/README | 612 ++++----------------- + 2 files changed, 668 insertions(+), 503 deletions(-) + +diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx +index 0f2629fe05ec..b33466bee46d 100644 +--- a/i18npool/qa/cppunit/test_breakiterator.cxx ++++ b/i18npool/qa/cppunit/test_breakiterator.cxx +@@ -31,6 +31,7 @@ public: + + void testLineBreaking(); + void testWordBoundaries(); ++ void testSentenceBoundaries(); + void testGraphemeIteration(); + void testWeak(); + void testAsian(); +@@ -43,9 +44,18 @@ public: + void testJapanese(); + void testChinese(); + ++ void testLegacyDictWordPrepostDash_de_DE(); ++ void testLegacyDictWordPrepostDash_nds_DE(); ++ void testLegacyDictWordPrepostDash_nl_NL(); ++ void testLegacyDictWordPrepostDash_sv_SE(); ++ void testLegacyHebrewQuoteInsideWord(); ++ void testLegacySurrogatePairs(); ++ void testLegacyWordCountCompat(); ++ + CPPUNIT_TEST_SUITE(TestBreakIterator); + CPPUNIT_TEST(testLineBreaking); + CPPUNIT_TEST(testWordBoundaries); ++ CPPUNIT_TEST(testSentenceBoundaries); + CPPUNIT_TEST(testGraphemeIteration); + CPPUNIT_TEST(testWeak); + CPPUNIT_TEST(testAsian); +@@ -57,6 +67,13 @@ public: + #endif + CPPUNIT_TEST(testJapanese); + CPPUNIT_TEST(testChinese); ++ CPPUNIT_TEST(testLegacyDictWordPrepostDash_de_DE); ++ CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE); ++ CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL); ++ CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE); ++ CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord); ++ CPPUNIT_TEST(testLegacySurrogatePairs); ++ CPPUNIT_TEST(testLegacyWordCountCompat); + CPPUNIT_TEST_SUITE_END(); + + private: +@@ -118,6 +135,173 @@ void TestBreakIterator::testLineBreaking() + } + } + ++ 
// i#22602: writer breaks word after dot immediately followed by a letter ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ //Here we want the line break to leave ./bar/baz clumped together on the next line ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "foo ./bar/baz", strlen("foo ./bar/ba"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period", ++ static_cast<sal_Int32>(4), aResult.breakIndex); ++ } ++ } ++ ++ // i#81448: slash and backslash make non-breaking spaces of preceding spaces ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Per the bug, the line break should leave ...BE clumped together on the next line. ++ // However, the current behavior does not wrap the string at all. This test asserts the ++ // current behavior as a point of reference. ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "THIS... ...BE", strlen("THIS... ...B"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aResult.breakIndex); ++ } ++ } ++ ++ // i#81448: slash and backslash make non-breaking spaces of preceding spaces ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // The line break should leave /BE clumped together on the next line. ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "THIS... /BE", strlen("THIS... /B"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(8), aResult.breakIndex); ++ } ++ } ++ ++ // i#80548: Bad word wrap between dash and word ++ { ++ aLocale.Language = "fi"; ++ aLocale.Country = "FI"; ++ ++ { ++ // Per the bug, the line break should leave -bar clumped together on the next line. ++ // However, this change was reverted at some point. This test asserts the new behavior.
++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "foo -bar", strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash", ++ static_cast<sal_Int32>(5), aResult.breakIndex); ++ } ++ } ++ ++ // i#80645: Line erroneously breaks at backslash ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave C:\Program Files\ on the first line ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "C:\\Program Files\\LibreOffice", strlen("C:\\Program Files\\Libre"), aLocale, 0, ++ aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ } ++ } ++ ++ // i#80841: Words separated by hyphens will always break to next line ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave toll- on the first line ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "toll-free", strlen("toll-fr"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); ++ } ++ } ++ ++ // i#83464: Line break between letter and $ ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave US$ clumped on the next line. ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "word US$ 123", strlen("word U"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); ++ } ++ } ++ ++ // Unknown bug number: "fix line break problem of dot after letter and before number" ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave L.5 clumped on the next line.
++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "word L.5 word", strlen("word L"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex); ++ } ++ } ++ ++ // i#83229: Wrong line break when word contains a hyphen ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ { ++ // Here we want the line break to leave 100- clumped on the first line. ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ "word 100-199 word", strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex); ++ } ++ } ++ ++ // i#83649: Line break should be between typographical quote and left bracket ++ { ++ aLocale.Language = "de"; ++ aLocale.Country = "DE"; ++ ++ { ++ // Here we want the line break to leave »angetan werden« on the first line ++ const OUString str = u"»angetan werden« [Passiv]"_ustr; ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex); ++ } ++ } ++ ++ // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters ++ { ++ aLocale.Language = "zh"; ++ aLocale.Country = "HK"; ++ ++ { ++ // Per the bug, this should break at the ideographic comma. However, this change has ++ // been reverted at some point. This test only verifies current behavior. ++ const OUString str = u"word word、word word"_ustr; ++ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( ++ str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult.breakIndex); ++ } ++ } ++ ++ // i#80891: Character in the forbidden list sometimes appears at the start of line ++ { ++ aLocale.Language = "zh"; ++ aLocale.Country = "HK"; ++ ++ { ++ // Per the bug, the ideographic two-dot leader should be a forbidden character.
However, ++ // this change seems to have been reverted or broken at some point. ++ const OUString str = u"電話︰電話"_ustr; ++ i18n::LineBreakResults aResult ++ = m_xBreak->getLineBreak(str, 2, aLocale, 0, aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aResult.breakIndex); ++ } ++ } ++ + //See https://bz.apache.org/ooo/show_bug.cgi?id=19716 + { + aLocale.Language = "en"; +@@ -160,6 +344,20 @@ void TestBreakIterator::testLineBreaking() + CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex); + } + } ++ ++ // i#65267: Comma is badly broken at end of line ++ // - The word should be wrapped along with the comma ++ { ++ aLocale.Language = "de"; ++ aLocale.Country = "DE"; ++ ++ { ++ auto res = m_xBreak->getLineBreak("Wort -prinzessinnen, wort", ++ strlen("Wort -prinzessinnen,"), aLocale, 0, ++ aHyphOptions, aUserOptions); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex); ++ } ++ } + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=49629 +@@ -601,6 +799,174 @@ void TestBreakIterator::testWordBoundaries() + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + } ++ ++ // i#55778: Words containing numbers get broken up ++ { ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ static constexpr OUString aTest = u"first i18n third"_ustr; ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.endPos); ++ } ++ ++ // i#56347: "BreakIterator patch for Hungarian" ++ // Rules for Hungarian affixes after numbers and certain symbols ++ { ++ auto mode = i18n::WordType::DICTIONARY_WORD; ++ aLocale.Language = "hu"; ++ aLocale.Country = "HU"; ++ ++ OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); ++ } ++ ++ // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian) ++ // Rules for Hungarian affixes after numbers and certain symbols in edit mode. ++ // The patch was merged, but the original bug was never closed and the current behavior seems ++ // identical to the ICU default behavior. Added this test to ensure that doesn't change. 
++ { ++ auto mode = i18n::WordType::ANY_WORD; ++ aLocale.Language = "hu"; ++ aLocale.Country = "HU"; ++ ++ OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos); 
++ CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos); ++ ++ aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testSentenceBoundaries() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ // Trivial characteristic test for sentence boundary detection ++ { ++ OUString aTest("This is a sentence. 
This is a different sentence."); ++ ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 5, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 5, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 31, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 31, aLocale)); ++ } ++ ++ // i#24098: i18n API beginOfSentence/endOfSentence ++ // fix beginOfSentence, ... when cursor is on the beginning of the sentence ++ { ++ OUString aTest("This is a sentence. This is a different sentence."); ++ ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 20, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 20, aLocale)); ++ } ++ ++ // i#24098: i18n API beginOfSentence/endOfSentence ++ // "skip preceding space for beginOfSentence" ++ { ++ OUString aTest("This is a sentence.     This is a different sentence."); ++ ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 20, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 20, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale)); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale)); ++ } + } + + //See https://bugs.libreoffice.org/show_bug.cgi?id=40292 +@@ -1043,6 +1409,199 @@ void TestBreakIterator::testChinese() + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } + } ++ ++void TestBreakIterator::testLegacyDictWordPrepostDash_de_DE() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "de"; ++ aLocale.Country = "DE"; ++ ++ { ++ auto aTest = u"Arbeits- -nehmer"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale,
i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyDictWordPrepostDash_nds_DE() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "nds"; ++ aLocale.Country = "DE"; ++ ++ { ++ auto aTest = u"Arbeits- -nehmer"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyDictWordPrepostDash_nl_NL() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "nl"; ++ aLocale.Country = "NL"; ++ ++ { ++ auto aTest = u"Arbeits- -nehmer"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE() ++{ ++ lang::Locale aLocale; ++ aLocale.Language = "sv"; ++ aLocale.Country = "SE"; ++ ++ { ++ auto aTest = u"Arbeits- -nehmer"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, 
i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyHebrewQuoteInsideWord() ++{ ++ lang::Locale aLocale; ++ ++ aLocale.Language = "he"; ++ aLocale.Country = "IL"; ++ ++ { ++ auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; ++ ++ i18n::Boundary aBounds ++ = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacySurrogatePairs() ++{ ++ lang::Locale aLocale; ++ ++ aLocale.Language = "ja"; ++ aLocale.Country = "JP"; ++ ++ // i#75632: [surrogate pair] Japanese word break does not work properly for surrogate pairs. ++ // and many others to address bugs: i#75631 i#75633 i#75412 etc. ++ // ++ // BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test. 
++ { ++ const sal_Unicode buf[] = { u"X 𠮟 X" }; ++ OUString aTest(buf, SAL_N_ELEMENTS(buf)); ++ ++ auto aBounds ++ = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); ++ ++ aBounds ++ = m_xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, false); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); ++ CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); ++ } ++} ++ ++void TestBreakIterator::testLegacyWordCountCompat() ++{ ++ lang::Locale aLocale; ++ ++ aLocale.Language = "en"; ++ aLocale.Country = "US"; ++ ++ // i#80815: "Word count differs from MS Word" ++ // This is a characteristic test for word count using test data from the linked bug. 
++ { ++ const OUString str = u"" ++ "test data for word count issue #80815\n" ++ "fo\\\'sforos\n" ++ "archipi\\\'elago\n" ++ "do\\^me\n" ++ "f**k\n" ++ "\n" ++ "battery-driven\n" ++ "and/or\n" ++ "apple(s)\n" ++ "money+opportunity\n" ++ "Micro$oft\n" ++ "\n" ++ "300$\n" ++ "I(not you)\n" ++ "a****n\n" ++ "1+3=4\n" ++ "\n" ++ "aaaaaaa.aaaaaaa\n" ++ "aaaaaaa,aaaaaaa\n" ++ "aaaaaaa;aaaaaaa\n"_ustr; ++ ++ int num_words = 0; ++ sal_Int32 next_pos = 0; ++ int iter_guard = 0; ++ while (true) ++ { ++ CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100); ++ ++ auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT); ++ ++ if (aBounds.endPos < next_pos) ++ { ++ break; ++ } ++ ++ next_pos = aBounds.endPos; ++ ++num_words; ++ } ++ ++ CPPUNIT_ASSERT_EQUAL(23, num_words); ++ } ++} ++ + void TestBreakIterator::setUp() + { + BootstrapFixtureBase::setUp(); +diff --git a/i18npool/source/breakiterator/data/README b/i18npool/source/breakiterator/data/README +index 6246b80ae77f..76e3e37c3faf 100644 +--- a/i18npool/source/breakiterator/data/README ++++ b/i18npool/source/breakiterator/data/README +@@ -9,411 +9,108 @@ At various stages these copies have been customized and are now horribly out of + sync. It unclear which diffs from the base versions are deliberate and which + are now accidental :-( + +-We need to review the various issues referenced in the commits that caused +-customizations and see if they're still relevant or not, write regression tests +-for them, if any are still relevant then apply the changes back on top of the +-latest versions. ++The various issues and customizations have been reviewed, with tests written for ++customizations that are still relevant. However, these files are still extremely ++out-of-date and need to be refreshed. Relevant customizations should be reapplied ++on top of a current version. 
+ +-to-review, later are ok: +- +-commit e1ad946ef5db3f7c0a540207d0f0fd85799e3b66 +-Author: Release Engineers +-Date: Thu Aug 6 18:13:57 2009 +0000 +- +- CWS-TOOLING: integrate CWS tl73 +- 2009-07-31 15:29:33 +0200 tl r274535 : #i64400# dash/hyphen should not break words +- +-commit 9964a76ef58786bba47d409970512d7ded6c8889 +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:53:05 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.1.2); FILE ADDED +- 2008/04/25 17:06:26 khong 1.1.2.3: i55063, make period a sentence delimiter +- 2008/04/25 06:40:50 khong 1.1.2.2: i55063, make space as Thai sentence delimiter +- 2008/04/24 03:19:10 khong 1.1.2.1: i55063, set Thai letters as sentence delimiter for Thai and English mixed text +- +-commit e4a6e4284dae1ca6fbfa7d1e43690dbf87d796cd +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:52:44 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.9.12); FILE MERGED +- 2008/06/17 20:22:30 khong 1.9.12.2: i83229 fix the problem of leading hyphen for numbers +- 2008/04/23 06:20:16 khong 1.9.12.1: i72868, i80891, i83229, fix Chinese punctuations and hyphen for line breakiterator +- +-commit 55dff22611659a1567c968fbf9e512a2765ab62e +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:52:07 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.33.36); FILE MERGED +- 2008/06/05 22:18:29 khong 1.33.36.2: RESYNC: (1.33-1.35); FILE MERGED +- 2008/04/23 06:11:55 khong 1.33.36.1: i55063, enable language specific sentence breakiterator +- +-commit 1c2b8095631a3c2d2f396bf50a8f0c62f49be65c +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:51:12 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.12.140); FILE MERGED +- 2008/06/05 22:18:26 khong 1.12.140.2: RESYNC: (1.12-1.13); FILE MERGED +- 2008/04/23 06:04:53 khong 1.12.140.1: i87530 avoid breaking line before un-completed cell +- +-commit 9bbdb52df370c69c0f7eba387a2068ee80bd7994 +-Author: Rüdiger Timm +-Date: Wed Jul 2 07:50:43 2008 +0000 +- +- INTEGRATION: CWS i18n41 (1.25.2); FILE MERGED +- 2008/06/05 22:18:23 khong 1.25.2.2: RESYNC: (1.25-1.26); FILE 
MERGED +- 2008/04/23 06:09:02 khong 1.25.2.1: i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts +- +-commit 8dcdd3ca268f78295731b86797c2b8cd447ba667 +-Author: Kurt Zenker +-Date: Tue May 20 13:36:01 2008 +0000 +- +- INTEGRATION: CWS i18n43_DEV300 (1.33.38); FILE MERGED +- 2008/04/29 21:51:51 khong 1.33.38.1: #i88411# apply the patch from Coleman Kane to fix icu setBreakType issue +- +-commit bedef98c24ef9ada6aaffe9bc5284d9759a31a9a +-Author: Kurt Zenker +-Date: Wed Apr 2 08:49:09 2008 +0000 +- +- INTEGRATION: CWS i18n40 (1.2.314); FILE MERGED +- 2008/03/19 06:30:23 khong 1.2.314.2: #i80815# count dash like MS Word +- 2008/03/15 07:32:44 khong 1.2.314.1: #i80815# count punctuation as word +- +-commit 59144104b3f91a2e6ed816f0bde0fdb91ea218d7 +-Author: Kurt Zenker +-Date: Wed Apr 2 08:48:53 2008 +0000 +- +- INTEGRATION: CWS i18n40 (1.24.44); FILE MERGED +- 2008/03/19 18:56:42 khong 1.24.44.2: i80815 make word count feature like MS Word +- 2008/03/15 07:31:38 khong 1.24.44.1: #i80815# count punctuation as word +- +-commit 3f0b51776602c45e8aca991450fcbb30f2484ae5 +-Author: Vladimir Glazounov +-Date: Mon Jan 28 14:33:46 2008 +0000 +- +- INTEGRATION: CWS i18n39 (1.8.4); FILE MERGED +- 2007/12/12 17:45:45 khong 1.8.4.3: b6634800# fix line break problem of dot after letter and before number +- 2007/12/08 01:05:52 khong 1.8.4.2: #i83649# fixed the problem of line break between quotation mark and open bracket +- 2007/12/07 23:44:30 khong 1.8.4.1: #i83464# fix the problem of line break between letter and 1326 +- +-commit 5d8ef209b1f63d1c8ea5014bdbef96660b355423 +-Author: Vladimir Glazounov +-Date: Tue Oct 23 08:09:00 2007 +0000 +- +- INTEGRATION: CWS i18n38 (1.7.4); FILE MERGED +- 2007/09/19 00:08:04 khong 1.7.4.3: i81448 fixed dot line break issue +- 2007/09/10 23:57:12 khong 1.7.4.2: i81440 fix the problem of line break on punctuations +- 2007/09/10 22:55:46 khong 1.7.4.1: i81448 fix problem of line break on symbols +- +-commit 
a2f3b48cacfcef338ca5e37acde34c83876e082e +-Author: Vladimir Glazounov +-Date: Tue Oct 23 08:08:47 2007 +0000 +- +- INTEGRATION: CWS i18n38 (1.32.10); FILE MERGED +- 2007/09/18 20:32:39 khong 1.32.10.1: i81519 set break type icu breakiterator +- +-commit 1967d8fb182b3101dee4f715e78be384400bc1e8 +-Author: Kurt Zenker +-Date: Wed Sep 5 16:37:28 2007 +0000 +- +- INTEGRATION: CWS i18n37 (1.22.6); FILE MERGED +- 2007/09/03 18:27:39 khong 1.22.6.2: i8132 fixed a problem in skipping space for word breakiterator +- 2007/08/31 21:30:30 khong 1.22.6.1: i81158 fix skipping space problem +- +-commit d2c2baf1a31d281d20e8b4d4c806dda027b2d5a3 +-Author: Vladimir Glazounov +-Date: Tue Aug 28 11:46:45 2007 +0000 +- +- INTEGRATION: CWS i18n36_SRC680 (1.5.20.1.2); FILE MERGED +- 2007/08/22 17:12:36 khong 1.5.20.1.2.1: i80841 fix hyphen line break problem +- +-commit d56bedfb425cf77f176f143455e4a9fb6ce65540 +-Author: Vladimir Glazounov +-Date: Tue Aug 28 11:46:34 2007 +0000 +- +- INTEGRATION: CWS i18n36_SRC680 (1.21.2.1.2); FILE MERGED +- 2007/08/22 20:02:28 khong 1.21.2.1.2.2: i80923 fix infinite loop problem +- 2007/08/22 17:11:44 khong 1.21.2.1.2.1: i80923 fix a infinite loop +- +-commit 8a36b196925a5561eabde0a0ef293c73fcb5add3 +-Author: Ivo Hinkelmann +-Date: Fri Aug 17 13:58:48 2007 +0000 +- +- INTEGRATION: CWS i18n34 (1.5.22); FILE MERGED +- 2007/08/13 22:26:12 khong 1.5.22.1: i80548 i80645 fix dash and backslash issues in line breakiterator +- +-commit c00b2b49bad765144f90552139e63d87d520d1cf +-Author: Ivo Hinkelmann +-Date: Fri Aug 17 13:58:36 2007 +0000 +- +- INTEGRATION: CWS i18n34 (1.15.4); FILE MERGED +- 2007/08/13 22:33:38 khong 1.15.4.1: i86439 fix surrogate characters handling issues +- +-commit 3fc5fbc71d4c244d7c8002aa530481741e585bd4 +-Author: Ivo Hinkelmann +-Date: Fri Aug 17 13:58:23 2007 +0000 +- +- INTEGRATION: CWS i18n34 (1.31.4); FILE MERGED +- 2007/08/13 22:33:37 khong 1.31.4.1: i86439 fix surrogate characters handling issues +- +-commit 
ee44b43881e7c82c379931f111c452a477b73341 +-Author: Ivo Hinkelmann +-Date: Fri Aug 17 13:58:11 2007 +0000 +- +- INTEGRATION: CWS i18n34 (1.21.4); FILE MERGED +- 2007/08/14 08:38:53 khong 1.21.4.2: i86439 fix surrogate characters handling issues +- 2007/08/13 22:33:37 khong 1.21.4.1: i86439 fix surrogate characters handling issues +- +-commit f47369dbbc385f8968ad43e43cba293a29a4c2df +-Author: Jens-Heiner Rechtien +-Date: Tue Jul 31 16:09:13 2007 +0000 +- +- INTEGRATION: CWS i18n32 (1.29.14); FILE MERGED +- 2007/07/24 20:39:44 khong 1.29.14.1: #i79148# fix a local word breakiterator rules loading issue +- +-commit 2791553b4e3fc5e04b96d0b2fd119d9fba1946bc +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:51 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.14.60); FILE MERGED +- 2007/07/16 22:18:44 khong 1.14.60.4: i75631 i75632 i75633 i75412 handle surrogate pair characters +- 2007/07/13 20:37:32 khong 1.14.60.3: #i75632# use ICU characters properties +- 2007/07/04 01:17:22 khong 1.14.60.2: i75631 i75632 i75633 i75412 handle surrogate pair characters +- 2007/06/27 04:33:11 khong 1.14.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters +- +-commit 1c79a2bf1e89ac4eb409922ab7eb8ad3cacc688a +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:39 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.8.60); FILE MERGED +- 2007/06/27 04:33:11 khong 1.8.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters +- +-commit 517bbaddbaf81a5a6bb00979944cad13a1575d50 +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:27 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.28.14); FILE MERGED +- 2007/07/13 20:37:32 khong 1.28.14.5: #i75632# use ICU characters properties +- 2007/07/04 01:17:22 khong 1.28.14.4: i75631 i75632 i75633 i75412 handle surrogate pair characters +- 2007/06/27 23:25:58 khong 1.28.14.3: i75412 handle surrogate pair characters +- 2007/06/27 05:33:20 khong 1.28.14.2: RESYNC: (1.28-1.29); FILE MERGED +- 2007/06/27 04:33:11 khong 1.28.14.1: i75631 i75632 i75633 i75412 handle 
surrogate pair characters +- +-commit 0154e3492f2527535c0d648274e7ff674674318b +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:14 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.14.42); FILE MERGED +- 2007/06/27 05:33:03 khong 1.14.42.2: RESYNC: (1.14-1.15); FILE MERGED +- 2007/06/27 04:33:11 khong 1.14.42.1: i75631 i75632 i75633 i75412 handle surrogate pair characters +- +-commit e2a5a2532ee187669980adb7bfa747c7803c330a +-Author: Rüdiger Timm +-Date: Thu Jul 26 08:08:02 2007 +0000 +- +- INTEGRATION: CWS i18n31 (1.19.60); FILE MERGED +- 2007/07/13 20:37:32 khong 1.19.60.4: #i75632# use ICU characters properties +- 2007/07/04 01:17:22 khong 1.19.60.3: i75631 i75632 i75633 i75412 handle surrogate pair characters +- 2007/06/27 05:00:48 khong 1.19.60.2: i75231 handle surrogate pair characters +- 2007/06/27 04:33:11 khong 1.19.60.1: i75631 i75632 i75633 i75412 handle surrogate pair characters +- +-commit 80a26a7d4720b5b8cfa0acc624b28014c96d9948 +-Author: Jens-Heiner Rechtien +-Date: Tue Jun 26 16:41:02 2007 +0000 +- +- INTEGRATION: CWS ause081 (1.2.332); FILE MERGED +- 2007/06/21 10:53:19 hjs 1.2.332.1: #i78393# remove component_getDescriptionFunc from exports +- +-commit c2801db6b04bf6f0dbb07727c91b2c66e7e027b8 +-Author: Ivo Hinkelmann +-Date: Wed Jun 6 11:17:38 2007 +0000 +- +- INTEGRATION: CWS i18n30 (1.4.24); FILE MERGED +- 2007/05/08 21:32:18 khong 1.4.24.1: #i73903# update line breakiterator rule to icu3.6 style +- +-commit ea290668f78475c3b277c9e44bf5622ccb4dcec8 +-Author: Ivo Hinkelmann +-Date: Wed Jun 6 11:17:25 2007 +0000 +- +- INTEGRATION: CWS i18n30 (1.28.4); FILE MERGED +- 2007/05/08 21:47:00 khong 1.28.4.3: #i75412# remove fix from cws i18n30, move it to other cws to fix with other Japanese surrogate issues +- 2007/03/20 18:39:58 khong 1.28.4.2: #i72589# fixed BS problem for surrogate characters +- 2007/03/13 19:11:44 khong 1.28.4.1: #i75319# fixed ANY_WORD rule loading problem +- +-commit b6308a6e322fd4eaa7845793beb70900624f351c +-Author: Ivo Hinkelmann 
+-Date: Wed Jun 6 11:17:12 2007 +0000 +- +- INTEGRATION: CWS i18n30 (1.14.32); FILE MERGED +- 2007/05/08 21:44:15 khong 1.14.32.1: #i76706# fix infinite loop for CJK word breakiterator for text mixed with Latin and CJK characters +- +-commit e068e0e9aa9405ea4016ad19e9a963129adfed79 +-Author: Rüdiger Timm +-Date: Thu Jan 25 08:35:42 2007 +0000 +- +- INTEGRATION: CWS i18n28 (1.1.2); FILE ADDED +- 2006/12/06 05:52:39 khong 1.1.2.1: #i64400# add an optional breakiterator entry in localedata +- +-commit 8d6f35a46085bb420e8896505504b376d17b842a +-Author: Rüdiger Timm +-Date: Thu Jan 25 08:35:31 2007 +0000 +- +- INTEGRATION: CWS i18n28 (1.24.36); FILE MERGED +- 2006/12/19 17:27:58 khong 1.24.36.2: RESYNC: (1.24-1.25); FILE MERGED +- 2006/12/06 05:52:38 khong 1.24.36.1: #i64400# add an optional breakiterator entry in localedata +- +-commit 633d34fa33330339ab6795ce3703477216e0062e +-Author: Kurt Zenker +-Date: Tue Dec 12 15:14:36 2006 +0000 +- +- INTEGRATION: CWS icuupgrade (1.9.24); FILE MERGED +- 2006/10/11 06:11:11 khong 1.9.24.4: RESYNC: (1.10-1.11); FILE MERGED +- 2006/07/07 10:57:40 hdu 1.9.24.3: RESYNC: (1.9-1.10); FILE MERGED +- 2006/06/30 01:31:40 khong 1.9.24.2: #i53388# upgrade icu to 3.4.1 +- 2006/06/15 19:16:55 khong 1.9.24.1: #i60645# upgrade icu to 3.4.1 +- +-commit 5d46dabe95271c846601a2575d3304fd5b4b24f1 +-Author: Kurt Zenker +-Date: Tue Dec 12 15:14:05 2006 +0000 +- +- INTEGRATION: CWS icuupgrade (1.22.20); FILE MERGED +- 2006/11/11 07:12:47 khong 1.22.20.6: #142664# fix breakiterator crash problem +- 2006/10/11 06:10:51 khong 1.22.20.5: RESYNC: (1.23-1.24); FILE MERGED +- 2006/09/06 01:00:31 khong 1.22.20.4: #i60645# upgrade to icu 3.6 +- 2006/07/07 10:57:32 hdu 1.22.20.3: RESYNC: (1.22-1.23); FILE MERGED +- 2006/06/30 01:31:40 khong 1.22.20.2: #i53388# upgrade icu to 3.4.1 +- 2006/06/20 14:27:26 hdu 1.22.20.1: #i60645# fix crash when udata_open failed +- +-commit 7431d816cdfc47b08978c0afd1f6503644bb11b8 +-Author: Kurt Zenker +-Date: Mon Nov 6 13:40:05 
2006 +0000 +- +- INTEGRATION: CWS i18n27 (1.3.142); FILE MERGED +- 2006/10/10 21:10:57 khong 1.3.142.1: #i65267# fix line break rule +- +-commit d7471e1462ffd9baeb3449eb86ccbb649e32b233 +-Author: Kurt Zenker +-Date: Mon Nov 6 13:39:52 2006 +0000 +- +- INTEGRATION: CWS i18n27 (1.1.2); FILE ADDED +- 2006/10/10 21:08:55 khong 1.1.2.1: #i56348# add Hungarian word break rule for edit mode +- +-commit 1b65b0b886e2cb16382bc11770230fb6a140f33b +-Author: Jens-Heiner Rechtien +-Date: Tue Oct 24 12:53:13 2006 +0000 +- +- INTEGRATION: CWS tl29 (1.12.24); FILE MERGED +- 2006/09/20 01:24:53 khong 1.12.24.1: #i69482# fixed mismatch of nextWord and getWordBoundary +- +-commit 97d89862a2285071202cc8010d888ffcbf96279a +-Author: Jens-Heiner Rechtien +-Date: Thu Nov 17 19:30:35 2005 +0000 +- +- INTEGRATION: CWS i18n23 (1.20.22); FILE MERGED +- 2005/11/17 20:00:37 khong 1.20.22.3: RESYNC: (1.20-1.21); FILE MERGED +- 2005/11/17 19:45:05 khong 1.20.22.2: #i57866# merge cws i18n23 and thaiissues +- 2005/11/15 21:10:24 khong 1.20.22.1: #i57866# fix line breakiterator problem +- +-commit 05fadde6f025bcaafca4f3093e88be3cc1bb6836 +-Author: Oliver Bolte +-Date: Wed Nov 16 09:18:37 2005 +0000 +- +- INTEGRATION: CWS thaiissues (1.20.6); FILE MERGED +- 2005/10/26 20:42:40 khong 1.20.6.2: use icu thai linke break algorithm for thai breakiterator +- 2005/10/26 13:36:24 fme 1.20.6.1: #i55716# Handling of WORDJOINER +- +-commit a10b0e70c641d7438c557ef718c6942b3abffaec +-Author: Oliver Bolte +-Date: Wed Nov 16 09:18:25 2005 +0000 +- +- INTEGRATION: CWS thaiissues (1.8.6); FILE MERGED +- 2005/10/26 20:42:39 khong 1.8.6.1: use icu thai linke break algorithm for thai breakiterator +- +-commit 4a1f1586173839d532f90507c72306bc9e2aec56 +-Author: Oliver Bolte +-Date: Wed Nov 16 09:18:11 2005 +0000 +- +- INTEGRATION: CWS thaiissues (1.9.4); FILE MERGED +- 2005/10/28 17:54:39 khong 1.9.4.1: Fix a bug in ctl line break when there is word joiner character +- +-commit beb2a536738ba761a92f8266570f1859c85f94ae 
+-Author: Rüdiger Timm +-Date: Tue Nov 8 15:59:16 2005 +0000 +- +- INTEGRATION: CWS siloch (1.3.50); FILE MERGED +- 2005/10/26 10:55:05 er 1.3.50.1: #i56347# apply patch to recognize suffixes of numbers in Hungarian spellchecking; contributed by Nemeth Laszlo +- +-commit 939e7c2bc93c13b6740051beeb08c5883b65ffce +-Author: Kurt Zenker +-Date: Fri Nov 4 14:33:30 2005 +0000 +- +- INTEGRATION: CWS i18n21 (1.3.46); FILE MERGED +- 2005/10/21 00:35:09 khong 1.3.46.1: #i55778 reverse back last change, treat letter and number combination as one word. +- +-commit 51594ef552a872b9868e5c7a025a68665488a016 +-Author: Kurt Zenker +-Date: Fri Nov 4 14:33:16 2005 +0000 +- +- INTEGRATION: CWS i18n21 (1.2.2); FILE MERGED +- 2005/10/21 00:35:08 khong 1.2.2.1: #i55778 reverse back last change, treat letter and number combination as one word. +- +-commit f4fe39909c7ed645a8b387cf66de249572226ad6 +-Author: Kurt Zenker +-Date: Fri Nov 4 14:33:03 2005 +0000 +- +- INTEGRATION: CWS i18n21 (1.3.46); FILE MERGED +- 2005/10/21 00:35:08 khong 1.3.46.1: #i55778 reverse back last change, treat letter and number combination as one word. +- +-commit 7f8af14611e66655ea7354083eafd71afc9703e3 +-Author: Kurt Zenker +-Date: Fri Nov 4 14:32:41 2005 +0000 +- +- INTEGRATION: CWS i18n21 (1.4.46); FILE MERGED +- 2005/10/21 00:35:07 khong 1.4.46.1: #i55778 reverse back last change, treat letter and number combination as one word. +- +-commit 924e158b9d871fbf7500e9215540e26aa95b3b20 +-Author: Rüdiger Timm +-Date: Mon Oct 17 14:43:17 2005 +0000 +- +- INTEGRATION: CWS i18n20 (1.1.2); FILE ADDED +- 2005/09/22 23:47:49 khong 1.1.2.1: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +- +-commit a428a8927006a10ccfe7182e6fe5a8b677281eca +-Author: Rüdiger Timm +-Date: Mon Oct 17 14:42:30 2005 +0000 +- +- INTEGRATION: CWS i18n20 (1.18.32); FILE MERGED +- 2005/09/23 15:59:13 khong 1.18.32.6: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. 
+- 2005/09/23 08:09:54 khong 1.18.32.5: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +- 2005/09/23 07:38:03 khong 1.18.32.4: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule +- 2005/09/22 23:47:48 khong 1.18.32.3: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +- 2005/08/26 23:34:37 khong 1.18.32.2: #i50172# add cell breakiterator rule for Tamil +- 2005/08/26 23:31:59 khong 1.18.32.1: #i50172# add cell breakiterator rule for Tamil +- +-commit f518f78557931b81e06fd7b31bb22c6639e5e553 +-Author: Rüdiger Timm +-Date: Mon Oct 17 14:42:14 2005 +0000 +- +- INTEGRATION: CWS i18n20 (1.6.32); FILE MERGED +- 2005/09/23 15:59:13 khong 1.6.32.3: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. +- 2005/09/23 07:38:02 khong 1.6.32.2: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule +- 2005/09/22 23:47:48 khong 1.6.32.1: #i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. 
+- +-commit 9b870055ecd043d1d4fadeacd351f8739e1979a0 +-Author: Vladimir Glazounov +-Date: Fri Feb 25 09:08:13 2005 +0000 +- +- INTEGRATION: CWS i18n16 (1.16.22); FILE MERGED +- 2005/02/04 19:05:45 khong 1.16.22.3: #i41671# use ICU rules for Thai breakiterator +- 2005/01/24 21:56:34 khong 1.16.22.2: #i35285# merge cws i18n16 with top version 1.17 +- 2005/01/12 01:12:41 khong 1.16.22.1: #i35285# remove uprv_malloc, use udata_open for loading icu rule breakiterator +- +-commit 29b9e86f5dac388d7aaced24d3826ac9331b03e3 +-Author: Vladimir Glazounov +-Date: Fri Feb 25 09:07:59 2005 +0000 ++done, regression tests added: + +- INTEGRATION: CWS i18n16 (1.5.22); FILE MERGED +- 2005/02/04 19:05:45 khong 1.5.22.1: #i41671# use ICU rules for Thai breakiterator ++#112623# update Japanese word breakiterator dictionary ++#i50172# add cell breakiterator rule for Tamil ++#i80412# indic cursoring ++#i107843# em-dash/en-dash breakiterator fix for spell checking ++#i103552# Japanese word for 'shutdown' added to ja.dic ++#i113785# ligatures for spell checking will no longer break words ++An opening quote should not be counted as a word by word count tool (regression test in writer) ++fdo#31271 wrong line break with ( ++#i89042# word count fix (regression test is in writer) ++#i58513# add break iterator rules for Finish ++#i19716# fix wrong line break on bracket characters ++#i21290# extend Greek script type ++#i21907# fix isBeginWord and isEndWord problem ++#i85411# Apply patch for ZWSP ++#i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break ++#i13451# add '-' as midLetter for Catalan dictionary word breakiterator ++#i13494# fix word breakiterator rule to handle punctuations and signs correctly ++#i29548# Fix Thai word breakiterator problem ++#i11993# #i14904# fix word breakiterator issues ++#i64400# dash/hyphen should not break words (de/nds/nl/sv) ++#i22602# make dot stick on beginning of a word when doing line break ++#i24098# skip 
preceding space for beginOfSentence ++#i24098# fix beginOfSentence, which did not work correctly when cursor is on the beginning of the sentence ++#i51661# add quotation mark as middle letter for Hebrew in word breakiterator rule. ++#i50172# add cell breakiterator rule for Tamil ++#i55778# reverse back last change, treat letter and number combination as one word. ++#i56347# apply patch to recognize suffixes of numbers in Hungarian spellchecking ++#i56348# add Hungarian word break rule for edit mode ++#i65267# fix line break rule ++#i86439# many changes to implement, tweak, debug UTF-16 surrogate pair handling ++#i75631# " ++#i75632# " ++#i75633# " ++#i75412# " ++#i80645# fix backslash issues in line breakiterator ++#i80841# fix hyphen line break problem ++#i81448# fixed dot line break issue ++#i81448# fix the problem of line break on punctuations (commit message says i81440) ++#i81448# fix problem of line break on symbols ++#i83649# fixed the problem of line break between quotation mark and open bracket ++#i83464# fix the problem of line break between letter and 1326 ++b6634800# fix line break problem of dot after letter and before number ++#i83229# fix the problem of leading hyphen for numbers ++#i80815# count words like MS Word ++ ++likely superseded: ++ ++#i21392# Obscure line break behavior mismatch in string of symbols between MSO and LO. ++#i80548# "fix dash issues in line breakiterator" - fix no longer works ++#i72868# "fix Chinese punctuation for line breakiterator" - fix no longer works ++#i80891# "fix Chinese punctuation for line breakiterator" - fix no longer works ++ ++#i27711# Adding/tweaking/removing languages later added to ICU. ++#i33756# " ++#i41671# " ++#i41671# " ++#i55063# " ++#i24850# ICU upgrades, internal bug fixes, or other work-arounds. 
++#i24098# " ++#112772# " ++#i35285# " ++4a1f1586173839d532f90507c72306bc9e2aec56 " ++a10b0e70c641d7438c557ef718c6942b3abffaec " ++05fadde6f025bcaafca4f3093e88be3cc1bb6836 " ++#i57866# " ++#i57866# " ++#i69482# " ++#142664# " ++#i60645# " ++#i53388# " ++#i60645# " ++#i78393# " ++#i73903# " ++#i75412# " ++#i72589# " ++#i75319# " ++#i76706# " ++#i64400# " ++#i64400# " ++#i79148# " ++#i55063# " ++#i87530# " ++#i88041# " ++#i88411# " ++#i80923# " ++#i80923# " ++#i81519# " ++ ++ ++suspect: ++ ++ ++- The intentions behind the following commits are unclear, as the referenced bugs were in the ++StarOffice internal bug tracker. These changes are contemporaneous with TR14 Revision 17, and seem ++to be part of an effort to backport upstream rule changes across multiple language customizations. + + commit 746ea3d8c29b27b23af3433446f66db0ad3096d6 + Author: Oliver Bolte +@@ -436,108 +133,17 @@ Date: Tue Jan 11 10:18:51 2005 +0000 + INTEGRATION: CWS i18n15 (1.3.36); FILE MERGED + 2004/09/04 02:03:53 khong 1.3.36.1: #117685# make dictionary word contain only letter or only number, dot can be in middle or end of a word, but only one. 
+ +-commit e5a62ce85bebcc9fb2bf0e5b9aced5fc7748055b +-Author: Oliver Bolte +-Date: Tue Jan 11 10:18:37 2005 +0000 +- +- INTEGRATION: CWS i18n15 (1.16.4); FILE MERGED +- 2004/10/07 18:19:11 khong 1.16.4.1: #i33756# update Hungarian breakiterator +- +-commit d2a6a31e6981800c2a920f8c6ff901c341a0466e +-Author: Kurt Zenker +-Date: Fri Jul 30 13:38:57 2004 +0000 +- +- INTEGRATION: CWS i18n13 (1.8.92); FILE MERGED +- 2004/06/14 23:24:16 khong 1.8.92.2: #112772# Japanese word breakiterator is not correct +- 2004/06/11 19:23:04 khong 1.8.92.1: #112772# Japanese word breakiterator is not correct + +-commit d6b8dabc3dc4811e1152d411a8428ccb334d16ab +-Author: Kurt Zenker +-Date: Fri Jul 30 13:38:17 2004 +0000 +- +- INTEGRATION: CWS i18n13 (1.7.162); FILE MERGED +- 2004/06/11 19:23:04 khong 1.7.162.1: #112772# Japanese word breakiterator is not correct +- +-commit 9ea4c16a699ac7cf5e255a19653651ac993f022b +-Author: Kurt Zenker +-Date: Fri Jul 30 13:38:05 2004 +0000 +- +- INTEGRATION: CWS i18n13 (1.9.92); FILE MERGED +- 2004/06/11 19:23:04 khong 1.9.92.1: #112772# Japanese word breakiterator is not correct ++- The intention behind the following commit is unclear, as the bug references are incorrect and no ++good candidates were immediately apparent. Based on the text of the commit, however, it appears to ++be a simple bug fix for skipSpace(). This function has also had a great deal of churn since this ++commit, further suggesting it is no longer pertinent. + +-commit 2887ecb5554eee699e1dce4ffbc2dfcf71a54a41 ++commit 1967d8fb182b3101dee4f715e78be384400bc1e8 + Author: Kurt Zenker +-Date: Fri Jul 30 13:37:54 2004 +0000 +- +- INTEGRATION: CWS i18n13 (1.15.18); FILE MERGED +- 2004/06/17 20:29:38 khong 1.15.18.2: # +- 2004/06/02 04:54:24 khong 1.15.18.1: #i11993# fix getWordBoundary problem when position is on the end of the word. 
+- +-commit 606556eed208d1218f950df2200510a7e19af1d9 +-Author: Oliver Bolte +-Date: Fri May 28 15:33:28 2004 +0000 +- +- INTEGRATION: CWS i18n12 (1.1.2); FILE ADDED +- 2004/04/30 14:37:52 er 1.1.2.1: #i27711# Hungarian breakiterator (provided by Timar Andras) +- +-commit 9710ca90166c18c0a92f7f0246a7c2f7dae87ebc +-Author: Oliver Bolte +-Date: Fri May 28 15:33:17 2004 +0000 +- +- INTEGRATION: CWS i18n12 (1.4.22); FILE MERGED +- 2004/04/13 11:55:32 er 1.4.22.1: #i27711# Hungarian breakiterator +- +-commit b138663ef4f4ade38fb42f8a2f567527cf15949b +-Author: Oliver Bolte +-Date: Fri May 28 15:33:02 2004 +0000 +- +- INTEGRATION: CWS i18n12 (1.13.22); FILE MERGED +- 2004/04/30 11:25:47 er 1.13.22.2: RESYNC: (1.13-1.14); FILE MERGED +- 2004/04/13 11:55:32 er 1.13.22.1: #i27711# Hungarian breakiterator +- +-commit f5bc5f04e4de8fa502d498a99f4ef6a340d796c0 +-Author: Oliver Bolte +-Date: Wed Mar 17 08:02:14 2004 +0000 +- +- INTEGRATION: CWS i18n11 (1.13.14); FILE MERGED +- 2004/02/04 02:09:04 khong 1.13.14.2: #i24098# skip preceding space for beginOfSentence +- 2004/01/06 19:41:49 khong 1.13.14.1: #i24098# fix beginOfSentence, which did not work correctly when cursor is on the beginning of the sentence +- +-commit 16401a5b865b5da8a2dd70057e8b048e9b797d5a +-Author: Oliver Bolte +-Date: Wed Mar 17 08:02:01 2004 +0000 +- +- INTEGRATION: CWS i18n11 (1.12.14); FILE MERGED +- 2004/02/10 14:21:13 er 1.12.14.3: RESYNC: (1.12-1.13); FILE MERGED +- 2004/02/05 16:45:30 khong 1.12.14.2: #i24850# fix the problem in previousCharBlock, when target char block is in position 1 +- 2004/02/04 02:13:48 khong 1.12.14.1: #i24098# check boundary condition for Sentence, Script, CharBlock breakiterator +- +-commit 4da98b648497af30de0fcf1a16e649ce18b0564f +-Author: Jens-Heiner Rechtien +-Date: Mon Mar 8 16:17:05 2004 +0000 +- +- INTEGRATION: CWS i18n09 (1.2.2); FILE MERGED +- 2003/12/04 23:45:37 khong 1.2.2.3: #i22602# make dot stick on beginning of a word when doing line break +- 2003/12/04 23:12:37 
khong 1.2.2.2: #i21392# change line break rule to match with MS office ++Date: Wed Sep 5 16:37:28 2007 +0000 + +-done, regression tests added: ++ INTEGRATION: CWS i18n37 (1.22.6); FILE MERGED ++ 2007/09/03 18:27:39 khong 1.22.6.2: i8132 fixed a problem in skipping space for word breakiterator ++ 2007/08/31 21:30:30 khong 1.22.6.1: i81158 fix skipping space problem + +-#112623# update Japanese word breakiterator dictionary +-#i50172# add cell breakiterator rule for Tamil +-#i80412# indic cursoring +-#i107843# em-dash/en-dash breakiterator fix for spell checking +-#i103552# Japanese word for 'shutdown' added to ja.dic +-#i113785# ligatures for spell checking will no longer break words +-An opening quote should not be counted as a word by word count tool (regression test in writer) +-fdo#31271 wrong line break with ( +-#i89042# word count fix (regression test is in writer) +-#i58513# add break iterator rules for Finish +-#i19716# fix wrong line break on bracket characters +-#i21290# extend Greek script type +-#i21907# fix isBeginWord and isEndWord problem +-#i85411# Apply patch for ZWSP +-#i17155# fix line breakiterator rule to make slash and hyphen as part of word when doing line break +-#i13451# add '-' as midLetter for Catalan dictionary word breakiterator +-#i13494# fix word breakiterator rule to handle punctuations and signs correctly +-#i29548# Fix Thai word breakiterator problem +-#i11993# #i14904# fix word breakiterator issues +-- +2.39.2 + diff --git a/debian/patches/series b/debian/patches/series index 5ca46579bf..65bdb5df83 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -50,3 +50,7 @@ fix-system-abseil-build.diff fix-riscv64-bridge.diff pdfium-ports.diff split-sdbc-firebird-mariadb.diff +use-PyConfig.diff +reviewed-breakIterator-customizations.diff +breakiterator-updates.diff +icu-74.1.diff diff --git a/debian/patches/use-PyConfig.diff b/debian/patches/use-PyConfig.diff new file mode 100644 index 0000000000..2cf1270d21 --- /dev/null +++ 
b/debian/patches/use-PyConfig.diff @@ -0,0 +1,80 @@ +From da0e9240bf6505ac3a67ff985705950566c66144 Mon Sep 17 00:00:00 2001 +From: Ilmari Lauhakangas +Date: Thu, 21 Dec 2023 12:01:50 +0200 +Subject: tdf#158447 Use PyConfig for setting Python home directory with Python + >= 3.8 + +Change-Id: Ic5b7c60613b22f5215cb1a2a13fecf3e0946ca49 +Reviewed-on: https://gerrit.libreoffice.org/c/core/+/161089 +Reviewed-by: Ilmari Lauhakangas +Tested-by: Jenkins +Reviewed-by: Noel Grandin +Tested-by: Ilmari Lauhakangas +--- + pyuno/source/loader/pyuno_loader.cxx | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +diff --git a/pyuno/source/loader/pyuno_loader.cxx b/pyuno/source/loader/pyuno_loader.cxx +index 008d58634947..1e00773761e7 100644 +--- a/pyuno/source/loader/pyuno_loader.cxx ++++ b/pyuno/source/loader/pyuno_loader.cxx +@@ -114,7 +114,11 @@ static PyRef getObjectFromLoaderModule( const char * func ) + return object; + } + ++#if PY_VERSION_HEX >= 0x03080000 ++static void setPythonHome ( const OUString & pythonHome, PyConfig * config ) ++#else + static void setPythonHome ( const OUString & pythonHome ) ++#endif + { + OUString systemPythonHome; + osl_getSystemPathFromFileURL( pythonHome.pData, &(systemPythonHome.pData) ); +@@ -138,9 +142,11 @@ static void setPythonHome ( const OUString & pythonHome ) + PyErr_SetString(PyExc_SystemError, "python home path is too long"); + return; + } +-SAL_WNODEPRECATED_DECLARATIONS_PUSH +- Py_SetPythonHome(wide); // deprecated since python 3.11 +-SAL_WNODEPRECATED_DECLARATIONS_POP ++#if PY_VERSION_HEX >= 0x03080000 ++ config->home = wide; ++#else ++ Py_SetPythonHome(wide); ++#endif + } + + static void prependPythonPath( std::u16string_view pythonPathBootstrap ) +@@ -192,11 +198,17 @@ void pythonInit() { + if ( Py_IsInitialized()) // may be inited by getComponentContext() already + return; + ++#if PY_VERSION_HEX >= 0x03080000 ++ PyConfig config; ++#endif + OUString pythonPath; + OUString pythonHome; + OUString path( 
"$BRAND_BASE_DIR/" LIBO_ETC_FOLDER "/" SAL_CONFIGFILE("pythonloader.uno" )); + rtl::Bootstrap::expandMacros(path); //TODO: detect failure + rtl::Bootstrap bootstrap(path); ++#if PY_VERSION_HEX >= 0x03080000 ++ PyConfig_InitPythonConfig( &config ); ++#endif + + // look for pythonhome + bootstrap.getFrom( "PYUNO_LOADER_PYTHONHOME", pythonHome ); +@@ -205,7 +217,11 @@ void pythonInit() { + // pythonhome+pythonpath must be set before Py_Initialize(), otherwise there appear warning on the console + // sadly, there is no api for setting the pythonpath, we have to use the environment variable + if( !pythonHome.isEmpty() ) ++#if PY_VERSION_HEX >= 0x03080000 ++ setPythonHome( pythonHome, &config ); ++#else + setPythonHome( pythonHome ); ++#endif + + if( !pythonPath.isEmpty() ) + prependPythonPath( pythonPath ); +-- +cgit v1.2.3 + -- cgit v1.2.3