From 5b688b03a916a0f6127c7aba891bf613cff0de0b Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Wed, 17 Apr 2024 09:09:50 -0600 Subject: [PATCH] tdf#49885 BreakIterator rule upgrades This change re-bases the BreakIterator rule customizations on top of a clean copy of the ICU 74.2 rules. Change-Id: Iadcf16cab138cc6c869fac61ad64e996e65b5ae4 --- i18npool/CustomTarget_breakiterator.mk | 6 +- i18npool/qa/cppunit/test_breakiterator.cxx | 356 +++++---- .../source/breakiterator/data/dict_word.txt | 267 ++++--- .../breakiterator/data/dict_word_he.txt | 139 ---- .../breakiterator/data/dict_word_hu.txt | 324 +++++---- .../breakiterator/data/dict_word_nodash.txt | 147 ---- .../data/dict_word_prepostdash.txt | 288 +++++--- .../source/breakiterator/data/edit_word.txt | 261 ++++--- .../breakiterator/data/edit_word_he.txt | 142 ---- .../breakiterator/data/edit_word_hu.txt | 294 +++++--- i18npool/source/breakiterator/data/line.txt | 680 ++++++------------ i18npool/source/breakiterator/data/sent.txt | 128 ---- 12 files changed, 1307 insertions(+), 1725 deletions(-) delete mode 100644 i18npool/source/breakiterator/data/dict_word_he.txt delete mode 100644 i18npool/source/breakiterator/data/dict_word_nodash.txt delete mode 100644 i18npool/source/breakiterator/data/edit_word_he.txt delete mode 100644 i18npool/source/breakiterator/data/sent.txt diff --git a/i18npool/CustomTarget_breakiterator.mk b/i18npool/CustomTarget_breakiterator.mk index 8229a5e8f314..ef951142837a 100644 --- a/i18npool/CustomTarget_breakiterator.mk +++ b/i18npool/CustomTarget_breakiterator.mk @@ -45,16 +45,12 @@ endif i18npool_BRKTXTS := \ count_word.brk \ - $(call gb_Helper_optional_locale,he,dict_word_he.brk) \ $(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \ - dict_word_nodash.brk \ dict_word_prepostdash.brk \ dict_word.brk \ - $(call gb_Helper_optional_locale,he,edit_word_he.brk) \ $(call gb_Helper_optional_locale,hu,edit_word_hu.brk) \ edit_word.brk \ - line.brk \ - sent.brk + line.brk # 'gencmn', 
'genbrk' and 'genccode' are tools generated and delivered by icu project to process icu breakiterator rules. # The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools, diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index b33466bee46d..2a35b2eee58f 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -184,11 +184,10 @@ void TestBreakIterator::testLineBreaking() { // Per the bug, the line break should leave -bar clumped together on the next line. - // However, this change was reverted at some point. This test asserts the new behavior. i18n::LineBreakResults aResult = m_xBreak->getLineBreak( "foo -bar", strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions); CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash", - static_cast(5), aResult.breakIndex); + static_cast(4), aResult.breakIndex); } } @@ -198,11 +197,29 @@ void TestBreakIterator::testLineBreaking() aLocale.Country = "US"; { - // Here we want the line break to leave C:\Program Files\ on the first line + // Note that the current behavior deviates from the original fix for this bug. + // + // The original report was filed due to wrapping all of "\Program Files\aaaa" to the + // next line, even though only "aaaa" overflowed. The original fix was to simply make + // U+005C reverse solidus (backslash) a breaking character. + // + // However, the root cause for this bug was not the behavior of '\', but rather some + // other bug making all of "\Program Files\" behave like a single token, despite it + // even containing whitespace. + // + // Reverting to the ICU line rules fixes this root issue. Now, in the following, + // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also + // consistent with the behavior of other office programs. 
i18n::LineBreakResults aResult = m_xBreak->getLineBreak( "C:\\Program Files\\LibreOffice", strlen("C:\\Program Files\\Libre"), aLocale, 0, aHyphOptions, aUserOptions); - CPPUNIT_ASSERT_EQUAL(static_cast(17), aResult.breakIndex); + CPPUNIT_ASSERT_EQUAL(static_cast(11), aResult.breakIndex); + + // An identical result should be generated for solidus. + aResult = m_xBreak->getLineBreak( + "C:/Program Files/LibreOffice", strlen("C:/Program Files/Libre"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast(11), aResult.breakIndex); } } @@ -251,23 +268,125 @@ void TestBreakIterator::testLineBreaking() aLocale.Country = "US"; { + // The root cause for this bug was the Unicode standard introducing special treatment + // for '-' in a number range context. This change makes number ranges (e.g. "100-199") + // behave as if they are single tokens for the purposes of line breaking. Unfortunately, + // this caused a significant appearance change to existing documents. + // + // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping + // number ranges as a single token is consistent with other applications, including web + // browsers, and other office suites as mentioned in the bug discussion. Removing this + // customization seems like it would be a major change, however. + // // Here we want the line break to leave 100- clumped on the first line. 
+ i18n::LineBreakResults aResult = m_xBreak->getLineBreak( "word 100-199 word", strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions); - CPPUNIT_ASSERT_EQUAL(static_cast(9), aResult.breakIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex); + } + + { + // From the same bug: "the leading minus must stay with numbers and strings" + + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "range of -100.000 to 100.000", strlen("range of -1"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex); + + constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr; + aResult = m_xBreak->getLineBreak( + str, strlen("range of -"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex); } - } - // i#83649: Line break should be between typographical quote and left bracket - { aLocale.Language = "de"; aLocale.Country = "DE"; { - // Here we want the line break to leave »angetan werden« on the first line + // From the same bug: "the leading minus must stay with numbers and strings" + + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "EURO is -10,50", strlen("EURO is -1"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex); + + // Also the mathematical minus sign: + + constexpr OUString str = u"EURO is \u221210,50"_ustr; + aResult = m_xBreak->getLineBreak( + str, strlen("EURO is -"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex); + } + + { + // From the same bug: "the leading minus must stay with numbers and strings" + + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + "und -kosten", strlen("und -ko"), aLocale, 0, + aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex); + + // But not the non-breaking hyphen: + + constexpr OUString str = u"und \u2011"_ustr; + aResult = m_xBreak->getLineBreak( + str, strlen("und 
-ko"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex); + } + } + + // i#83649: "Line break should be between typographical quote and left bracket" + // - Actually: Spaces between quotation mark and opening punctuation not treated as a break. + // - Note that per the Unicode standard, prohibiting breaks in this context is intentional + // because it may cause issues in certain languages due to the various ways quotation + // characters are used. + // - We do it anyway by customizing the ICU line breaking rules. + { + { + // This uses the sample text provided in the bug report. Based on usage, it is assumed + // they were in the de_DE locale. + + aLocale.Language = "de"; + aLocale.Country = "DE"; + + // Per the bug report, it is expected that »angetan werden« remains on the first line. const OUString str = u"»angetan werden« [Passiv]"_ustr; i18n::LineBreakResults aResult = m_xBreak->getLineBreak( - str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions); + str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast(17), aResult.breakIndex); + + // The same result should be returned for this and the first case. + const OUString str2 = u"»angetan werden« Passiv"_ustr; + aResult = m_xBreak->getLineBreak( + str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast(17), aResult.breakIndex); + + // Under ICU rules, no amount of spaces would cause this to wrap. 
+ const OUString str3 = u"»angetan werden« [Passiv]"_ustr; + aResult = m_xBreak->getLineBreak( + str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast(20), aResult.breakIndex); + + // However, tabs will + const OUString str4 = u"»angetan werden«\t[Passiv]"_ustr; + aResult = m_xBreak->getLineBreak( + str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast(17), aResult.breakIndex); + } + + { + // The same behavior is seen in English + + aLocale.Language = "en"; + aLocale.Country = "US"; + + const OUString str = u"\"angetan werden\" [Passiv]"_ustr; + i18n::LineBreakResults aResult = m_xBreak->getLineBreak( + str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast(17), aResult.breakIndex); + + const OUString str2 = u"\"angetan werden\" Passiv"_ustr; + aResult = m_xBreak->getLineBreak( + str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions); CPPUNIT_ASSERT_EQUAL(static_cast(17), aResult.breakIndex); } } @@ -355,7 +474,7 @@ void TestBreakIterator::testLineBreaking() auto res = m_xBreak->getLineBreak("Wort -prinzessinnen, wort", strlen("Wort -prinzessinnen,"), aLocale, 0, aHyphOptions, aUserOptions); - CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex); } } } @@ -638,7 +757,8 @@ void TestBreakIterator::testWordBoundaries() CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i); } - //See https://bz.apache.org/ooo/show_bug.cgi?id=85411 + // i#85411: ZWSP should be a word separator for spellchecking + // - This fix was applied to both dict and edit customizations for (int j = 0; j < 3; ++j) { switch (j) @@ -660,21 +780,23 @@ void TestBreakIterator::testWordBoundaries() break; } - static constexpr OUString aTest = - u"I\u200Bwant\u200Bto\u200Bgo"_ustr; + static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr; sal_Int32 nPos = 0; - sal_Int32 aExpected[] = 
{1, 6, 9, 12}; + sal_Int32 aExpected[] = { 1, 6, 9, 12 }; size_t i = 0; do { CPPUNIT_ASSERT(i < std::size(aExpected)); - nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, - i18n::WordType::DICTIONARY_WORD, true).endPos; - CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos); + auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos); + auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos); + nPos = dwPos.endPos; ++i; - } - while (nPos++ < aTest.getLength()); + } while (nPos++ < aTest.getLength()); CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i); } @@ -814,121 +936,45 @@ void TestBreakIterator::testWordBoundaries() } // i#56347: "BreakIterator patch for Hungarian" - // Rules for Hungarian affixes after numbers and certain symbols - { - auto mode = i18n::WordType::DICTIONARY_WORD; - aLocale.Language = "hu"; - aLocale.Country = "HU"; - - OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; - - aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); - 
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); - } - // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian) - // Rules for Hungarian affixes after numbers and certain symbols in edit mode. - // The patch was merged, but the original bug was never closed and the current behavior seems - // identical to the ICU default behavior. Added this test to ensure that doesn't change. + // Rules for Hungarian affixes after numbers and certain symbols { - auto mode = i18n::WordType::ANY_WORD; aLocale.Language = "hu"; aLocale.Country = "HU"; OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr; - aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true); 
- CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos); - - aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); + for (auto mode : + { i18n::WordType::DICTIONARY_WORD, i18n::WordType::ANYWORD_IGNOREWHITESPACES }) + { + aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); - aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true); - CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); - CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); + aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos); + } } } @@ -967,6 +1013,56 @@ void TestBreakIterator::testSentenceBoundaries() CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale)); CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale)); } + + // i#55063: Sentence selection in Thai should select a space-delimited phrase. + // - This customization broke at some point. 
It works in an English locale in a synthetic test + // like this one, but does not work in the Thai locale, nor on Thai text in practice. + { + static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr; + + aLocale.Language = "en"; + aLocale.Country = "US"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale)); + + aLocale.Language = "th"; + aLocale.Country = "TH"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale)); + } + + // i#55063: Thai phrases should delimit English sentence selection. + // - This customization broke at some point. It works in an English locale in a synthetic test + // like this one, but does not work in the Thai locale, nor on Thai text in practice. + { + static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr; + + aLocale.Language = "en"; + aLocale.Country = "US"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale)); + + aLocale.Language = "th"; + aLocale.Country = "TH"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale)); + } + + // i#55063: Characteristic test for English text delimiting Thai phrases (sentences) + // - English text should not delimit Thai phrases. 
+ { + static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr; + + aLocale.Language = "en"; + aLocale.Country = "US"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale)); + + aLocale.Language = "th"; + aLocale.Country = "TH"; + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale)); + CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale)); + } } //See https://bugs.libreoffice.org/show_bug.cgi?id=40292 @@ -1501,6 +1597,7 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord() aLocale.Language = "he"; aLocale.Country = "IL"; + // i#51661: Add quotation mark as middle letter for Hebrew { auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; @@ -1514,6 +1611,21 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord() CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); } + + // i#51661: Add quotation mark as middle letter for Hebrew + { + auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr; + + i18n::Boundary aBounds = m_xBreak->getWordBoundary( + aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + } } void TestBreakIterator::testLegacySurrogatePairs() diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt index b1666f44daab..f804b0eec214 100644 --- a/i18npool/source/breakiterator/data/dict_word.txt +++ b/i18npool/source/breakiterator/data/dict_word.txt @@ -1,148 +1,199 @@ # -# Copyright (C) 2002-2003, International Business 
Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. - - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] - [:name = HYPHEN-MINUS:] ]; - -$SufixLetter = [:name= FULL STOP:]; - - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name 
= ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +$Han = [:Han:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with ICU originals +### - ICU is now more permissive about punctuation inside words. 
+### - For compatibility, exclude certain characters that were previously excluded. -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; -$Format = [[:Cf:] - $TheZWSP]; +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; +### END CUSTOMIZATION +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. 
-# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; -[[:P:][:S:]]*; +## ------------------------------------------------- +# Rule 3 - CR x LF # -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? 
-# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +$ZWJ $Extended_Pict; +# Rule 3d - Keep horizontal whitespace together. # -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. + +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. 
# +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanhi, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! 
($NonStarters* | \n \r) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/dict_word_he.txt b/i18npool/source/breakiterator/data/dict_word_he.txt deleted file mode 100644 index 40197d92a431..000000000000 --- a/i18npool/source/breakiterator/data/dict_word_he.txt +++ /dev/null @@ -1,139 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: dict_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Katakana - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]]; - -$SufixLetter = [:name= FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL 
AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? 
{200}; - -[[:P:][:S:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -# [:IDEOGRAPHIC:] $Extend* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; - -#!.*; -! ($NonStarters* | \n \r) .; - diff --git a/i18npool/source/breakiterator/data/dict_word_hu.txt b/i18npool/source/breakiterator/data/dict_word_hu.txt index b0a0276b36a8..88648e6e5716 100644 --- a/i18npool/source/breakiterator/data/dict_word_hu.txt +++ b/i18npool/source/breakiterator/data/dict_word_hu.txt @@ -1,176 +1,222 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. 
# -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. - - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - - -# Fix spelling of a)-ban, b)-ben, when the letter is a reference -# resulting bad word breaking "ban" and "ben" -# (reference fields are not expanded in spell checking, yet, only -# for grammar checking). 
- -$PrefixLetter = [[:name = RIGHT PARENTHESIS:]]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] - [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] - [:name = DIGIT ZERO:] - [:name = DIGIT ONE:] - [:name = DIGIT TWO:] - [:name = DIGIT THREE:] - [:name = DIGIT FOUR:] - [:name = DIGIT FIVE:] - [:name = DIGIT SIX:] - [:name = DIGIT SEVEN:] - [:name = DIGIT EIGHT:] - [:name = DIGIT NINE:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:] - [:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = EN DASH:] [:name = EM DASH:] - [:name = RIGHT DOUBLE QUOTATION MARK:] - [:name = LEFT PARENTHESIS:] - [:name = RIGHT PARENTHESIS:] - [:name = RIGHT SQUARE BRACKET:] - [:name = EXCLAMATION MARK:] - [:name = QUESTION MARK:] - [:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; - -$SufixLetter = [:name= FULL STOP:]; - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION 
+### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - +$Han = [:Han:]; + +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; + +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with ICU originals +### - ICU is now more permissive about punctuation inside words. +### - For compatibility, exclude certain characters that were previously excluded. 
+### tdf#116072: Extend MidLetter in Hungarian word breaking +### i#56347: BreakIterator patch for Hungarian +### i#56348: Special chars in first pos not handled by spell checking for Hungarian + +$Symbols_hu = [[:name = PERCENT SIGN:] + [:name = PER MILLE SIGN:] + [:name = PER TEN THOUSAND SIGN:] + [:name = SECTION SIGN:] + [:name = DEGREE SIGN:] + [:name = EURO SIGN:] + [:name = HYPHEN-MINUS:] + [:name = EN DASH:] + [:name = EM DASH:]]; + +#$ALetter = [\p{Word_Break = ALetter}]; +$ALetter = [\p{Word_Break = ALetter} $Symbols_hu]; + +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; + +$IncludedML_hu = [[:name = RIGHT DOUBLE QUOTATION MARK:] + [:name = LEFT PARENTHESIS:] + [:name = RIGHT PARENTHESIS:] + [:name = RIGHT SQUARE BRACKET:] + [:name = EXCLAMATION MARK:] + [:name = QUESTION MARK:] + $Symbols_hu]; + +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu]; + +### END CUSTOMIZATION + +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; + + +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. 
+ +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; + +# TODO: check if handling of katakana in dictionary makes rules incorrect/void + +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + + +## ------------------------------------------------- + +# Rule 3 - CR x LF +# +$CR $LF; -#################################################################################### +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Word Break Rules. Definitions and Rules specific to word break begin Here. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -#################################################################################### +$WSegSpace $WSegSpace; -$Format = [[:Cf:] - $TheZWSP]; +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. +$ExFm = [$Extend $Format $ZWJ]; +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words, + # with no special rule status value.
+$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +# rule 5 +# Do not break between most letters. # -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; -[[:P:][:S:]]*; +# rule 8 -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$Numeric $ExFm* $Numeric; -# -# Ideographic Characters. 
Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +# rule 9 -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanji, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
+# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | \n \r) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/dict_word_nodash.txt b/i18npool/source/breakiterator/data/dict_word_nodash.txt deleted file mode 100644 index 279cc50e5b66..000000000000 --- a/i18npool/source/breakiterator/data/dict_word_nodash.txt +++ /dev/null @@ -1,147 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: dict_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29. 
-# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ]; - -$SufixLetter = [:name= FULL STOP:]; - - -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. 
-# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200}; - -[[:P:][:S:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. 
-# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; - -#!.*; -! ($NonStarters* | \n \r) .; - diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt index fb29b478af21..b39503d1b405 100644 --- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt +++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt @@ -1,157 +1,221 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: dict_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. 
- - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; +############################################################################## -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION -# list of dashes or hyphens that should be accepted as part of the word if a single one of these -# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to -# be part of the word in order to have it properly spell checked etc. 
-$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] ]; +!!chain; +!!quoted_literals_only; -$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:] - [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] - [:name = HYPHEN-MINUS:] ]; +# +# Character Class Definitions. +# -$SufixLetter = [:name= FULL STOP:]; - +$Han = [:Han:]; -$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:] - [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:] - [:name = PRIME:]]; -$Numeric = [:LineBreak = Numeric:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### Unknown issue number: Dictionary words can contain hyphens +### tdf#49885: Sync custom BreakIterator rules with 
ICU originals +### - ICU is now more permissive about punctuation inside words. +### - For compatibility, exclude certain characters that were previously excluded. -$TheZWSP = \u200b; +$IncludedML = [:name = HYPHEN-MINUS:]; +$ExcludedML = [[:name = COLON:] + [:name = GREEK ANO TELEIA:] + [:name = PRESENTATION FORM FOR VERTICAL COLON:] + [:name = SMALL COLON:] + [:name = FULLWIDTH COLON:]]; -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; +### END CUSTOMIZATION +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages +### This part of the customization does not replace any rules. +$PrePostHyphen = [:name = HYPHEN-MINUS:]; -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### +### END CUSTOMIZATION -$Format = [[:Cf:] - $TheZWSP]; +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. 
-# +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$SufixLetterEx= $SufixLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; +## ------------------------------------------------- + +# Rule 3 - CR x LF # -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -# At most one leading or trailing dash/hyphen should be accepted as well. -# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to -# be part of the word in order to have it properly spell checked etc. -$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)* $PrePostDashHyphen?; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? 
{200}; +$WSegSpace $WSegSpace; -[[:P:][:S:]]*; +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$ExFm = [$Extend $Format $ZWJ]; -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.)
The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)?; + +### END CUSTOMIZATION + +# rule 6 and 7 + +### BEGIN CUSTOMIZATION +### Unknown issue number: Allow leading and trailing hyphens in certain languages + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; +($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)? {200}; + +### END CUSTOMIZATION + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanhi, changed +# from 300 to 400. 
+# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | \n \r) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/edit_word.txt b/i18npool/source/breakiterator/data/edit_word.txt index 92b344c19d41..14fc221aa96e 100644 --- a/i18npool/source/breakiterator/data/edit_word.txt +++ b/i18npool/source/breakiterator/data/edit_word.txt @@ -1,142 +1,199 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. 
# -# file: edit_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. - - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. 
+### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +$Han = [:Han:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidLetter = [\p{Word_Break = MidLetter}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This change subtracts undesired characters from the above families -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### +# $MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; -$Format = [[:Cf:] - $TheZWSP]; +# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; +### END CUSTOMIZATION +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; -# Rule 3: Treat a grapheme cluster as if it were a single character. 
-# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? 
$LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; -# Punctuations by themselves -[[:P:][:S:]-[:name = FULL STOP:]]*; -[[:name = FULL STOP:]]*; +## ------------------------------------------------- +# Rule 3 - CR x LF # -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +$ZWJ $Extended_Pict; +# Rule 3d - Keep horizontal whitespace together. # -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. + +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. 
+$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 + +($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanhi, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. 
+# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | \n \r) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This customization does not replace any rules. +[[:P:][:S:]-[:name = FULL STOP:]]*; # separate rule: a run of punctuation/symbols other than FULL STOP +[[:name = FULL STOP:]]*; # separate rule: a run of FULL STOP, kept apart from other punctuation +### END CUSTOMIZATION +# Rule 999 +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/edit_word_he.txt b/i18npool/source/breakiterator/data/edit_word_he.txt deleted file mode 100644 index 0b5908814e08..000000000000 --- a/i18npool/source/breakiterator/data/edit_word_he.txt +++ /dev/null @@ -1,142 +0,0 @@ -# -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: edit_word.txt -# -# ICU Word Break Rules -# See Unicode Standard Annex #29.
-# These rules are based on Version 4.0.0, dated 2003-04-17 -# - - - -#################################################################################### -# -# Character class definitions from TR 29 -# -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; - -# -# Character Class Definitions. -# The names are those from TR29. -# -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; - - - - -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. -# -#################################################################################### - -$Format = [[:Cf:] - $TheZWSP]; - - - -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# - - -# -# "Extended" definitions. 
Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; - - -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; - -# -# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; - -# Punctuations by themselves -[[:P:][:S:]-[:name = FULL STOP:]]*; -[[:name = FULL STOP:]]*; - -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; - -# -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; - -# -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. -# -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; - -# -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) -# - -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) 
The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. -# -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; - -#!.*; -! ($NonStarters* | \n \r) .; - diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt b/i18npool/source/breakiterator/data/edit_word_hu.txt index 4a08acab0029..389ad2bacc13 100644 --- a/i18npool/source/breakiterator/data/edit_word_hu.txt +++ b/i18npool/source/breakiterator/data/edit_word_hu.txt @@ -1,159 +1,215 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. -# All Rights Reserved. +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. # -# file: edit_word.txt +# file: word.txt # -# ICU Word Break Rules +# ICU Word Break Rules # See Unicode Standard Annex #29. -# These rules are based on Version 4.0.0, dated 2003-04-17 +# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 # +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. 
- - -#################################################################################### +############################################################################## # # Character class definitions from TR 29 # -#################################################################################### -$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] - [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] - [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; - -$Ideographic = [:Ideographic:]; -$Hangul = [:Script = HANGUL:]; - -$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] - [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] - [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] - [:name = DIGIT ZERO:] - [:name = DIGIT ONE:] - [:name = DIGIT TWO:] - [:name = DIGIT THREE:] - [:name = DIGIT FOUR:] - [:name = DIGIT FIVE:] - [:name = DIGIT SIX:] - [:name = DIGIT SEVEN:] - [:name = DIGIT EIGHT:] - [:name = DIGIT NINE:] - - $Ideographic - - $Katakana - - $Hangul - - [:Script = Thai:] - - [:Script = Lao:] - - [:Script = Hiragana:]]; - -$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] - [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] - [:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT SIGN:] - [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] - [:name = EN DASH:] [:name = EM DASH:] - [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; - -$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; -$Numeric = [:LineBreak = Numeric:]; - - -$TheZWSP = \u200b; +############################################################################## + +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule 
customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION + +!!chain; +!!quoted_literals_only; + # # Character Class Definitions. -# The names are those from TR29. # -$CR = \u000d; -$LF = \u000a; -$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; -$Extend = [[:Grapheme_Extend = TRUE:]]; +$Han = [:Han:]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}-$Han]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [\p{Extended_Pictographic}]; +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This change subtracts undesired characters from the above families +### i#56347: BreakIterator patch for Hungarian +### i#56348: Special chars in first pos not handled by spell checking for Hungarian -#################################################################################### -# -# Word Break Rules. Definitions and Rules specific to word break begin Here. 
-# -#################################################################################### +$Symbols_hu = [[:name = PERCENT SIGN:] + [:name = PER MILLE SIGN:] + [:name = PER TEN THOUSAND SIGN:] + [:name = SECTION SIGN:] + [:name = DEGREE SIGN:] + [:name = EURO SIGN:] + [:name = HYPHEN-MINUS:] + [:name = EN DASH:] + [:name = EM DASH:]]; -$Format = [[:Cf:] - $TheZWSP]; +# $ALetter = [\p{Word_Break = ALetter}]; +$ALetter = [\p{Word_Break = ALetter} $Symbols_hu]; +# $MidLetter = [\p{Word_Break = MidLetter}]; +$MidLetter = [\p{Word_Break = MidLetter} $Symbols_hu]; +# $MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]]; -# Rule 3: Treat a grapheme cluster as if it were a single character. -# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters -# because we don't need to find the boundaries between adjacent syllables - -# they won't be word boundaries. -# +# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]]; +### END CUSTOMIZATION -# -# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. -# -$ALetterEx = $ALetter $Extend*; -$NumericEx = $Numeric $Extend*; -$MidNumEx = $MidNum $Extend*; -$MidLetterEx = $MidLetter $Extend*; -$KatakanaEx = $Katakana $Extend*; -$IdeographicEx= $Ideographic $Extend*; -$HangulEx = $Hangul $Extend*; -$FormatEx = $Format $Extend*; +$Hiragana = [:Hiragana:]; +$Ideographic = [\p{Ideographic}]; -# -# Numbers. Rules 8, 11, 12 form the TR. -# -$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; -$NumberSequence {100}; +# Dictionary character set, for triggering language-based break engines. Currently +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode +# 5.0 or later as the definition of Complex_Context was corrected to include all +# characters requiring dictionary break. -# -# Words. Alpha-numerics. 
Rule 5, 6, 7, 9, 10 -# - must include at least one letter. -# - may include both letters and numbers. -# - may include MideLetter, MidNumber punctuation. -# -$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 -($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$KanaKanji $HangulSyllable]; +$dictionary = [$ComplexContext $dictionaryCJK]; -# Punctuations by themselves -[[:P:][:S:]-[:name = FULL STOP:]]*; -[[:name = FULL STOP:]]*; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void -# -# Do not break between Katakana. Rule #13. -# -$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; -[:Hiragana:] $Extend* {300}; +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + +## ------------------------------------------------- + +# Rule 3 - CR x LF # -# Ideographic Characters. Stand by themselves as words. -# Separated from the "Everything Else" rule, below, only so that they -# can be tagged with a return value. TODO: is this what we want? -# -$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; -$HangulEx ($FormatEx* $HangulEx)* {400}; +$CR $LF; +# Rule 3c Do not break within emoji zwj sequences. +# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed. # -# Everything Else, with no tag. -# Non-Control chars combine with $Extend (combining) chars. -# Controls are do not. +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. # -[^$Control [:Ideographic:]] $Extend*; -$CR $LF; +$WSegSpace $WSegSpace; + +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning +# of a region of Text. 
+ +$ExFm = [$Extend $Format $ZWJ]; + +^$ExFm+; # This rule fires only when there are format or extend characters at the + # start of text, or immediately following another boundary. It groups them, in + # the event there are more than one. + +[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words, + # with no special rule status value. + +$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but +$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character. +$HangulSyllable {200}; +$Hebrew_Letter $ExFm* {200}; +$Katakana $ExFm* {400}; # note: these status values override those from rule 5 +$Hiragana $ExFm* {400}; # by virtue of being numerically larger. +$Ideographic $ExFm* {400}; # # -# Reverse Rules. Back up over any of the chars that can group together. -# (Reverse rules do not need to be exact; they can back up too far, -# but must back up at least enough, and must stop on a boundary.) +# rule 5 +# Do not break between most letters. # +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 6 and 7 +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +# rule 7a +$Hebrew_Letter $ExFm* $Single_Quote {200}; + +# rule 7b and 7c +$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter; + +# rule 8 + +$Numeric $ExFm* $Numeric; + +# rule 9 -# NonStarters are the set of all characters that can appear at the 2nd - nth position of -# a word. (They may also be the first.) The reverse rule skips over these, until it -# reaches something that can only be the start (and probably only) char in a "word". -# A space or punctuation meets the test. 
+($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric; + +# rule 10 + +$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter); + +# rule 11 and 12 + +$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric; + +# rule 13 +# to be consistent with $KanaKanji $KanaKanhi, changed +# from 300 to 400. +# See also TestRuleStatus in intltest/rbbiapts.cpp +$Katakana $ExFm* $Katakana {400}; + +# rule 13a/b + +$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a) +$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a) +$Numeric $ExFm* $ExtendNumLet {100}; # (13a) +$Katakana $ExFm* $ExtendNumLet {400}; # (13a) +$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a) + +$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b) +$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b) +$ExtendNumLet $ExFm* $Numeric {100}; # (13b) +$ExtendNumLet $ExFm* $Katakana {400}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With incoming rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. # -$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; +^$Regional_Indicator $ExFm* $Regional_Indicator; -#!.*; -! ($NonStarters* | \n \r) .; +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; +$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found + +### BEGIN CUSTOMIZATION +### i#13494: For the purposes of editing, standalone punctuation should be treated as a word. +### This customization does not replace any rules. +[[:P:][:S:]-[:name = FULL STOP:]]*; # separate rule: a run of punctuation/symbols other than FULL STOP +[[:name = FULL STOP:]]*; # separate rule: a run of FULL STOP, kept apart from other punctuation +### END CUSTOMIZATION +# Rule 999 +# Match a single code point if no other rule applies.
+.; diff --git a/i18npool/source/breakiterator/data/line.txt b/i18npool/source/breakiterator/data/line.txt index ff3f3eafc42e..46a618c63cae 100644 --- a/i18npool/source/breakiterator/data/line.txt +++ b/i18npool/source/breakiterator/data/line.txt @@ -1,176 +1,116 @@ -# Copyright (c) 2002-2006 International Business Machines Corporation and +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (c) 2002-2016 International Business Machines Corporation and # others. All Rights Reserved. # # file: line.txt # # Line Breaking Rules -# Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0 -# http://www.unicode.org/reports/tr14/ - - +# Implement default line breaking as defined by +# Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/) +# for Unicode 14.0, with the following modification: +# +# Boundaries between hyphens and following letters are suppressed when +# there is a boundary preceding the hyphen. See rule 20.9 +# +# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict). +# It sets characters of class CJ to behave like NS. # # Character Classes defined by TR 14. # -!!chain; -!!LBCMNoChain; +### BEGIN CUSTOMIZATION +### This file contains LibreOffice-specific rule customizations. +### +### To aid future maintainability: +### - The change location should be bracketed by comments of this form. +### - The original rule should be commented out, and the modified rule placed alongside. +### - By doing this, maintainers can more easily compare to an upstream baseline. +### +### END CUSTOMIZATION - -!!lookAheadHardBreak; -# -# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere -# and only used for the line break rules. 
-# -# It is used in the implementation of the incredibly annoying rule LB 10 -# which says to treat any combining mark that is not attached to a base -# character as if it were of class AL (alphabetic). -# -# The problem occurs in the reverse rules. -# -# Consider a sequence like, with correct breaks as shown -# LF ID CM AL AL -# ^ ^ ^ -# Then consider the sequence without the initial ID (ideographic) -# LF CM AL AL -# ^ ^ -# Our CM, which in the first example was attached to the ideograph, -# is now unattached, becomes an alpha, and joins in with the other -# alphas. -# -# When iterating forwards, these sequences do not present any problems -# When iterating backwards, we need to look ahead when encountering -# a CM to see whether it attaches to something further on or not. -# (Look-ahead in a reverse rule is looking towards the start) -# -# If the CM is unattached, we need to force a break. -# -# !!lookAheadHardBreak forces the run time state machine to -# stop immediately when a look ahead rule ( '/' operator) matches, -# and set the match position to that of the look-ahead operator, -# no matter what other rules may be in play at the time. -# -# See rule LB 19 for an example. -# +!!chain; +!!quoted_literals_only; $AI = [:LineBreak = Ambiguous:]; -$DG = \u00B0; -$AL = [[:LineBreak = Alphabetic:] $DG]; +$AL = [:LineBreak = Alphabetic:]; $BA = [:LineBreak = Break_After:]; +$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA. 
$BB = [:LineBreak = Break_Before:]; $BK = [:LineBreak = Mandatory_Break:]; $B2 = [:LineBreak = Break_Both:]; $CB = [:LineBreak = Contingent_Break:]; $CJ = [:LineBreak = Conditional_Japanese_Starter:]; -$CL = [[:LineBreak = Close_Punctuation:] [:LineBreak = Close_Parenthesis:]]; # tdf#31271 -$CM = [:LineBreak = Combining_Mark:]; +$CL = [:LineBreak = Close_Punctuation:]; +# $CM = [:LineBreak = Combining_Mark:]; +$CP = [:LineBreak = Close_Parenthesis:]; $CR = [:LineBreak = Carriage_Return:]; +$EB = [:LineBreak = EB:]; +$EM = [:LineBreak = EM:]; $EX = [:LineBreak = Exclamation:]; $GL = [:LineBreak = Glue:]; $HL = [:LineBreak = Hebrew_Letter:]; $HY = [:LineBreak = Hyphen:]; $H2 = [:LineBreak = H2:]; $H3 = [:LineBreak = H3:]; -$ID = [[:LineBreak = Ideographic:] - [\ufe30]]; -$IN = [:LineBreak = Inseparable:]; -$IS = [[:LineBreak = Infix_Numeric:] [\ufe30]]; +$ID = [:LineBreak = Ideographic:]; +$IN = [:LineBreak = Inseperable:]; +$IS = [:LineBreak = Infix_Numeric:]; $JL = [:LineBreak = JL:]; $JV = [:LineBreak = JV:]; $JT = [:LineBreak = JT:]; $LF = [:LineBreak = Line_Feed:]; $NL = [:LineBreak = Next_Line:]; +# NS includes CJ for CSS strict line breaking. $NS = [[:LineBreak = Nonstarter:] $CJ]; $NU = [:LineBreak = Numeric:]; -$OP = [[:LineBreak = Open_Punctuation:] - $DG]; +$OP = [:LineBreak = Open_Punctuation:]; $PO = [:LineBreak = Postfix_Numeric:]; -$BS = \u005C; -$PR = [[:LineBreak = Prefix_Numeric:] - $BS]; +$PR = [:LineBreak = Prefix_Numeric:]; $QU = [:LineBreak = Quotation:]; +$RI = [:LineBreak = Regional_Indicator:]; $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; -$SY = [[:LineBreak = Break_Symbols:] $BS]; +$SY = [:LineBreak = Break_Symbols:]; $WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; +$ZWJ = [:LineBreak = ZWJ:]; + +# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14, +# without a formal name. 
Because ICU rules require multiple uses of the expressions, +# give them a single definition with a name + +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + +$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}]; + +# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly +# list it in the numerous rules that use CM. +# By LB1, SA characters with general category of Mn or Mc also resolve to CM. + +$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]]; +$CMX = [[$CM] - [$ZWJ]]; # Dictionary character set, for triggering language-based break engines. Currently -# limited to LineBreak=Complex_Context. Note that this set only works in Unicode -# 5.0 or later as the definition of Complex_Context was corrected to include all -# characters requiring dictionary break. +# limited to LineBreak=Complex_Context (SA). -$dictionary = [:LineBreak = Complex_Context:]; +$dictionary = [$SA]; # # Rule LB1. By default, treat AI (characters with ambiguous east Asian width), -# SA (South East Asian: Thai, Lao, Khmer) +# SA (Dictionary chars, excluding Mn and Mc) # SG (Unpaired Surrogates) # XX (Unknown, unassigned) # as $AL (Alphabetic) # -$ALPlus = [$AL $AI $SA $SG $XX]; - -# -# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
-# -$ALcm = $ALPlus $CM*; -$BAcm = $BA $CM*; -$BBcm = $BB $CM*; -$B2cm = $B2 $CM*; -$CLcm = $CL $CM*; -$EXcm = $EX $CM*; -$GLcm = $GL $CM*; -$HLcm = $HL $CM*; -$HYcm = $HY $CM*; -$H2cm = $H2 $CM*; -$H3cm = $H3 $CM*; -$IDcm = $ID $CM*; -$INcm = $IN $CM*; -$IScm = $IS $CM*; -$JLcm = $JL $CM*; -$JVcm = $JV $CM*; -$JTcm = $JT $CM*; -$NScm = $NS $CM*; -$NUcm = $NU $CM*; -$OPcm = $OP $CM*; -$POcm = $PO $CM*; -$PRcm = $PR $CM*; -$QUcm = $QU $CM*; -$SYcm = $SY $CM*; -$WJcm = $WJ $CM*; +$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]]; -## ------------------------------------------------- -!!forward; - -# -# Each class of character can stand by itself as an unbroken token, with trailing combining stuff -# -$ALPlus $CM+; -$BA $CM+; -$BB $CM+; -$B2 $CM+; -$CL $CM+; -$EX $CM+; -$GL $CM+; -$HL $CM+; -$HY $CM+; -$H2 $CM+; -$H3 $CM+; -$ID $CM+; -$IN $CM+; -$IS $CM+; -$JL $CM+; -$JV $CM+; -$JT $CM+; -$NS $CM+; -$NU $CM+; -$OP $CM+; -$PO $CM+; -$PR $CM+; -$QU $CM+; -$SY $CM+; -$WJ $CM+; +## ------------------------------------------------- # # CAN_CM is the set of characters that may combine with CM combining chars. @@ -186,19 +126,15 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. -# Chaining is disabled with CM because it causes other failures, -# so for this one case we need to manually list out longer sequences. # -$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP]; -$AL_FOLLOW_CM = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP]; -$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; # # Rule LB 4, 5 Mandatory (Hard) breaks. # $LB4Breaks = [$BK $CR $LF $NL]; -$LB4NonBreaks = [^$BK $CR $LF $NL]; +$LB4NonBreaks = [^$BK $CR $LF $NL $CM]; $CR $LF {100}; # @@ -206,91 +142,124 @@ $CR $LF {100}; # $LB4NonBreaks? 
$LB4Breaks {100}; # LB 5 do not break before hard breaks. $CAN_CM $CM* $LB4Breaks {100}; -$CM+ $LB4Breaks {100}; +^$CM+ $LB4Breaks {100}; # LB 7 x SP # x ZW $LB4NonBreaks [$SP $ZW]; $CAN_CM $CM* [$SP $ZW]; -$CM+ [$SP $ZW]; +^$CM+ [$SP $ZW]; # # LB 8 Break after zero width space +# ZW SP* ÷ # $LB8Breaks = [$LB4Breaks $ZW]; $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]]; +$ZW $SP* / [^$SP $ZW $LB4Breaks]; +# LB 8a ZWJ x Do not break Emoji ZWJ sequences. +# +$ZWJ [^$CM]; -# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL -# $CM not covered by the above needs to behave like $AL +# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL +# $CM not covered by the above needs to behave like $AL # See definition of $CAN_CM. $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules. -$CM+; +^$CM+; # # LB 11 Do not break before or after WORD JOINER & related characters. # -$CAN_CM $CM* $WJcm; -$LB8NonBreaks $WJcm; -$CM+ $WJcm; +$CAN_CM $CM* $WJ; +$LB8NonBreaks $WJ; +^$CM+ $WJ; -$WJcm [^$CAN_CM]; -$WJcm $CAN_CM $CM*; +$WJ $CM* .; # -# LB 12 Do not break before or after NBSP and related characters. +# LB 12 Do not break after NBSP and related characters. +# GL x # -# (!SP) x GL -[$LB8NonBreaks-$SP] $CM* $GLcm; -$CM+ $GLcm; +$GL $CM* .; -# GL x -$GLcm ($LB8Breaks | $SP); -$GLcm [$LB8NonBreaks-$SP] $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think we need this rule. - # All but $CM will chain off of preceding rule. - # $GLcm will pick up the CM case by itself. +# +# LB 12a Do not break before NBSP and related characters ... +# [^SP BA HY] x GL +# +[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL; +^$CM+ $GL; -# -# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces. +# LB 13 Don't break before ']' or '!' or '/', even after spaces. 
# $LB8NonBreaks $CL; $CAN_CM $CM* $CL; -$CM+ $CL; # by rule 10, stand-alone CM behaves as AL +^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL + +$LB8NonBreaks $CP; +$CAN_CM $CM* $CP; +^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $EX; $CAN_CM $CM* $EX; -$CM+ $EX; # by rule 10, stand-alone CM behaves as AL - -$LB8NonBreaks $IS; -$CAN_CM $CM* $IS; -$CM+ $IS; # by rule 10, stand-alone CM behaves as AL +^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL $LB8NonBreaks $SY; $CAN_CM $CM* $SY; -$CM+ $SY; # by rule 10, stand-alone CM behaves as AL +^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL # -# LB 14 Do not break after OP, even after spaced +# LB 14 Do not break after OP, even after spaces +# Note subtle interaction with "SP IS /" rules in LB14a. +# This rule consumes the SP, chaining happens on the IS, effectively overriding the SP IS rules, +# which is the desired behavior. +# +$OP $CM* $SP* .; + +$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL + # by rule 8, CM following a SP is stand-alone. + + +# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23" +# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations. +# See issue ICU-20303 + + +$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; +$SP $IS / [^ $CanFollowIS $NU $CM]; +$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; + # -$OPcm $SP* $CAN_CM $CM*; -$OPcm $SP* $CANT_CM; +# LB 14b Do not break before numeric separators (IS), even after spaces. + +[$LB8NonBreaks - $SP] $IS; +$SP $IS $CM* [$CanFollowIS {eof}]; +$SP $IS $CM* $ZWJ [^$CM $NU]; + +$CAN_CM $CM* $IS; +^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL -$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # LB 15 -# $QUcm $SP* $OPcm; + +### BEGIN CUSTOMIZATION +### i#83649: Allow line break between quote and opening punctuation.
+### This customization simply disables rule LB 15. +### +# $QU $CM* $SP* $OP; +### +### END CUSTOMIZATION # LB 16 -$CLcm $SP* $NScm; +($CL | $CP) $CM* $SP* $NS; # LB 17 -$B2cm $SP* $B2cm; +$B2 $CM* $SP* $B2; # # LB 18 Break after spaces. @@ -301,347 +270,134 @@ $LB18Breaks = [$LB8Breaks $SP]; # LB 19 # x QU -$LB18NonBreaks $CM* $QUcm; -$CM+ $QUcm; +$LB18NonBreaks $CM* $QU; +^$CM+ $QU; # QU x -$QUcm .?; -$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. - # TODO: I don't think this rule is needed. - +$QU $CM* .; # LB 20 # $CB # $CB - +# $LB20NonBreaks = [$LB18NonBreaks - $CB]; +# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen. +# Originally added as a Finnish tailoring, now promoted to default ICU behavior. +# Note: this is not default UAX-14 behaviour. See issue ICU-8151. +# +^($HY | $HH) $CM* $ALPlus; + # LB 21 x (BA | HY | NS) # BB x # -$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); +$LB20NonBreaks $CM* ($BA | $HY | $NS); -$BBcm [^$CB]; # $BB x -$BBcm $LB20NonBreaks $CM*; -# LB 21a Don't break after Hebrew + Hyphen -# HL (HY | BA) x -# -$HLcm ($HYcm | $BAcm) [^$CB]?; +^$CM+ ($BA | $HY | $NS); -# LB 22 -($ALcm | $HLcm) $INcm; -$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL -$IDcm $INcm; -$INcm $INcm; -$NUcm $INcm; +$BB $CM* [^$CB]; # $BB x +$BB $CM* $LB20NonBreaks; - -# $LB 23 -$IDcm $POcm; -$ALcm $NUcm; # includes $LB19 -$HLcm $NUcm; -$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL -$NUcm $ALcm; -$NUcm $HLcm; - -# -# LB 24 -# -$PRcm $IDcm; -$ALcm $PRcm; -$PRcm ($ALcm | $HLcm); -$POcm ($ALcm | $HLcm); - -# -# LB 25 Numbers. -# -($PRcm | $POcm)? ($OPcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? 
($PRcm | $POcm)?; - -# LB 26 Do not break a Korean syllable +# LB 21a Don't break after Hebrew + Hyphen +# HL (HY | BA) x # -$JLcm ($JLcm | $JVcm | $H2cm | $H3cm); -($JVcm | $H2cm) ($JVcm | $JTcm); -($JTcm | $H3cm) $JTcm; - -# LB 27 Treat korean Syllable Block the same as ID (don't break it) -($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm; -($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm; -$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm); +$HL $CM* ($HY | $BA) $CM* [^$CB]?; +# LB 21b (forward) Don't break between SY and HL +# (break between HL and SY already disallowed by LB 13 above) +$SY $CM* $HL; -# LB 28 Do not break between alphabetics +# LB 22 Do not break before ellipses # -($ALcm | $HLcm) ($ALcm | $HLcm); -$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL +$LB20NonBreaks $CM* $IN; +^$CM+ $IN; -# LB 29 -$IScm ($ALcm | $NUcm); +# LB 23 # -# Rule 30 Do not break between letters, numbers or ordinary symbols -# and opening or closing punctuation -# -($ALcm | $HLcm | $NUcm) $OPcm; -$CM+ $OPcm; -$CLcm ($ALcm | $HLcm | $NUcm); +($ALPlus | $HL) $CM* $NU; +^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL +$NU $CM* ($ALPlus | $HL); +# LB 23a # -# Reverse Rules. -# -## ------------------------------------------------- +$PR $CM* ($ID | $EB | $EM); +($ID | $EB | $EM) $CM* $PO; -!!reverse; - -$CM+ $ALPlus; -$CM+ $BA; -$CM+ $BB; -$CM+ $B2; -$CM+ $CL; -$CM+ $EX; -$CM+ $GL; -$CM+ $HL; -$CM+ $HY; -$CM+ $H2; -$CM+ $H3; -$CM+ $ID; -$CM+ $IN; -$CM+ $IS; -$CM+ $JL; -$CM+ $JV; -$CM+ $JT; -$CM+ $NS; -$CM+ $NU; -$CM+ $OP; -$CM+ $PO; -$CM+ $PR; -$CM+ $QU; -$CM+ $SY; -$CM+ $WJ; -$CM+; - - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [whatever] -# The CM needs to behave as an AL -# -$AL_FOLLOW $CM+ / ( - [$BK $CR $LF $NL $ZW {eof}] | - $SP+ $CM+ $SP | - $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break. - # LB14 says OP SP* x . 
- # becomes OP SP* x AL - # becomes OP SP* x CM+ AL_FOLLOW - # - # Further note: the $AL in [$AL {eof}] is only to work around - # a rule compiler bug which complains about - # empty sets otherwise. - -# -# Sequences of the form (shown forwards) -# [CANT_CM] [CM] [PR] -# The CM needs to behave as an AL -# This rule is concerned about getting the second of the two in place. -# - -[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}]; - - - -# LB 4, 5, 5 - -$LB4Breaks [$LB4NonBreaks-$CM]; -$LB4Breaks $CM+ $CAN_CM; -$LF $CR; - - -# LB 7 x SP -# x ZW -[$SP $ZW] [$LB4NonBreaks-$CM]; -[$SP $ZW] $CM+ $CAN_CM; -# LB 8 Break after zero width space - - -# LB 9,10 Combining marks. -# X $CM needs to behave like X, where X is not $SP or controls. -# $CM not covered by the above needs to behave like $AL -# Stick together any combining sequences that don't match other rules. -$CM+ $CAN_CM; - - -# LB 11 -$CM* $WJ $CM* $CAN_CM; -$CM* $WJ [$LB8NonBreaks-$CM]; - - $CANT_CM $CM* $WJ; -$CM* $CAN_CM $CM* $WJ; - -# LB 12 -# x GL # -$CM* $GL $CM* [$LB8NonBreaks-$CM-$SP]; +# LB 24 +# +($PR | $PO) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($PR | $PO); +^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL # -# GL x +# LB 25 Numbers. # -$CANT_CM $CM* $GL; -$CM* $CAN_CM $CM* $GL; +(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))* + ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?; +### BEGIN CUSTOMIZATION +### i#83229: Allow line break after hyphen in number range context. +### The default ICU rules treat number ranges (e.g. 100-199) as a single token. This change forces +### a break opportunity after the embedded '-', but only if followed by another numeral. +### +### This customization does not replace any existing rule. +### Maintainers: note that this rule should consist of two instances of the LB 25 numbers rule, +### separated by a hyphen and an explicit break. 
-# LB 13 -$CL $CM+ $CAN_CM; -$EX $CM+ $CAN_CM; -$IS $CM+ $CAN_CM; -$SY $CM+ $CAN_CM; +((($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))* + ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?) + ($HY $CM*) / +((($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))* + ($CM* ($CL | $CP))? ($CM* ($PR | $PO))?); -$CL [$LB8NonBreaks-$CM]; -$EX [$LB8NonBreaks-$CM]; -$IS [$LB8NonBreaks-$CM]; -$SY [$LB8NonBreaks-$CM]; +### END CUSTOMIZATION -# Rule 13 & 14 taken together for an edge case. -# Match this, shown forward -# OP SP+ ($CM+ behaving as $AL) (CL | EX | IS | IY) -# This really wants to chain at the $CM+ (which is acting as an $AL) -# except for $CM chaining being disabled. -[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP; +### TODO +### ((PrefixNumeric | PostfixNumeric) CombMark*) ? ((OpenPunc | Hyphen) CombMark*)? +### (InfixNumeric CombMark*)? Numeric (CombMark* (Numeric | BreakSym | InfixNumeric))* +### (CombMark* (ClosePunc | CloseParen))? (CombMark* (PrefixNumeric | PostfixNumeric))? -# LB 14 OP SP* x +# LB 26 Do not break a Korean syllable # -$CM* $CAN_CM $SP* $CM* $OP; - $CANT_CM $SP* $CM* $OP; -$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP - - $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP; -$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP; -$SY $CM $SP+ $OP; # TODO: Experiment. Remove. - - - -# LB 15 -# $CM* $OP $SP* $CM* $QU; - -# LB 16 -$CM* $NS $SP* $CM* $CL; +$JL $CM* ($JL | $JV | $H2 | $H3); +($JV | $H2) $CM* ($JV | $JT); +($JT | $H3) $CM* $JT; -# LB 17 -$CM* $B2 $SP* $CM* $B2; - -# LB 18 break after spaces -# Nothing explicit needed here. - - -# -# LB 19 -# -$CM* $QU $CM* $CAN_CM; # . x QU -$CM* $QU $LB18NonBreaks; +# LB 27 Treat korean Syllable Block the same as ID (don't break it) +($JL | $JV | $JT | $H2 | $H3) $CM* $PO; +$PR $CM* ($JL | $JV | $JT | $H2 | $H3); -$CM* $CAN_CM $CM* $QU; # QU x . - $CANT_CM $CM* $QU; - -# -# LB 20 Break before and after CB. -# nothing needed here. 
+# LB 28 Do not break between alphabetics # - -# LB 21 -$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS) - -$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . -[^$CB] $CM* $BB; # - -# LB21a -[^$CB] $CM* ($HY | $BA) $CM* $HL; - -# LB 22 -$CM* $IN $CM* ($ALPlus | $HL); -$CM* $IN $CM* $ID; -$CM* $IN $CM* $IN; -$CM* $IN $CM* $NU; - -# LB 23 -$CM* $PO $CM* $ID; -$CM* $NU $CM* ($ALPlus | $HL); -$CM* ($ALPlus | $HL) $CM* $NU; - -# LB 24 -$CM* $ID $CM* $PR; -$CM* $PR $CM* $ALPlus; -$CM* ($ALPlus | $HL) $CM* $PR; -$CM* ($ALPlus | $HL) $CM* $PO; - -$CM* $ALPlus $CM* ($IS | $SY | $HY)+ / $SP; -$CM* $NU+ $CM* $HY+ / $SP; - -# LB 25 -($CM* ($PR | $PO))? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP))? ($CM* ($PR | $PO))?; - -# LB 26 -$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL; -$CM* ($JT | $JV) $CM* ($H2 | $JV); -$CM* $JT $CM* ($H3 | $JT); - -# LB 27 -$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL); -$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR; - -# LB 28 -$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL); +($ALPlus | $HL) $CM* ($ALPlus | $HL); +^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL # LB 29 -$CM* ($NU | $ALPlus) $CM* $IS+ [^$SP]; +$IS $CM* ($ALPlus | $HL); # LB 30 -$CM* $OP $CM* ($ALPlus | $HL | $NU); -$CM* ($ALPlus | $HL | $NU) $CM* ($CL | $SY)+ [^$SP]; - - -## ------------------------------------------------- - -!!safe_reverse; - -# LB 7 -$CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; -$CM+ $SP / .; - -# LB 9 -$SP+ $CM* $OP; - -# LB 10 -$SP+ $CM* $QU; - -# LB 11 -$SP+ $CM* $CL; -$SP+ $CM* $B2; - -# LB 21 -$CM* ($HY | $BA) $CM* $HL; - -# LB 18 -($CM* ($IS | $SY))+ $CM* $NU; -$CL $CM* ($NU | $IS | $SY); - -# For dictionary-based break -$dictionary $dictionary; - -## ------------------------------------------------- - -!!safe_forward; - -# Skip forward over all character classes that are involved in -# rules containing patterns with possibly more than one char -# of context. 
-# -# It might be slightly more efficient to have specific rules -# instead of one generic one, but only if we could -# turn off rule chaining. We don't want to move more -# than necessary. -# -[$CM $OP $QU $CL $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $BA $dictionary]; -$dictionary $dictionary; - +($ALPlus | $HL | $NU) $CM* $OP30; +^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL. +$CP30 $CM* ($ALPlus | $HL | $NU); + +# LB 30a Do not break between regional indicators. Break after pairs of them. +# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; +# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' +# because of the chain-out behavior difference. The rule must chain out only from the [set characters], +# not from the preceding $RI or $CM, which it would be able to do if the set were optional. + +# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier. +$EB $CM* $EM; +$ExtPictUnassigned $CM* $EM; + +# LB 31 Break everywhere else. +# Match a single code point if no other rule applies. +.; diff --git a/i18npool/source/breakiterator/data/sent.txt b/i18npool/source/breakiterator/data/sent.txt deleted file mode 100644 index 7fada89e6278..000000000000 --- a/i18npool/source/breakiterator/data/sent.txt +++ /dev/null @@ -1,128 +0,0 @@ -# -# Copyright (C) 2002-2006, International Business Machines Corporation and others. -# All Rights Reserved. -# -# file: sent.txt -# -# ICU Sentence Break Rules -# See Unicode Standard Annex #29. 
-# These rules are based on SA 29 version 5.0.0 -# Includes post 5.0 changes to treat Japanese half width voicing marks -# as Grapheme Extend. -# - - -$VoiceMarks = [\uff9e\uff9f]; -$Thai = [:Script = Thai:]; - -# -# Character categories as defined in TR 29 -# -$Sep = [\p{Sentence_Break = Sep}]; -$Format = [\p{Sentence_Break = Format}]; -$Sp = [\p{Sentence_Break = Sp}]; -$Lower = [\p{Sentence_Break = Lower}]; -$Upper = [\p{Sentence_Break = Upper}]; -$OLetter = [\p{Sentence_Break = OLetter}-$VoiceMarks]; -$Numeric = [\p{Sentence_Break = Numeric}]; -$ATerm = [\p{Sentence_Break = ATerm}]; -$STerm = [\p{Sentence_Break = STerm}]; -$Close = [\p{Sentence_Break = Close}]; - -# -# Define extended forms of the character classes, -# incorporate grapheme cluster + format chars. -# Rules 4 and 5. - - -$CR = \u000d; -$LF = \u000a; -$Extend = [[:Grapheme_Extend = TRUE:]$VoiceMarks]; - -$SpEx = $Sp ($Extend | $Format)*; -$LowerEx = $Lower ($Extend | $Format)*; -$UpperEx = $Upper ($Extend | $Format)*; -$OLetterEx = $OLetter ($Extend | $Format)*; -$NumericEx = $Numeric ($Extend | $Format)*; -$ATermEx = $ATerm ($Extend | $Format)*; -$STermEx = $STerm ($Extend | $Format)*; -$CloseEx = $Close ($Extend | $Format)*; - - -## ------------------------------------------------- - -!!chain; -!!forward; - -# Rule 3 - break after separators. Keep CR/LF together. -# -$CR $LF; - -$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*; -$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*; - -# Rule 4 - Break after $Sep. -# Rule 5 - Ignore $Format and $Extend -# -[^$Sep]? ($Extend | $Format)*; - - -# Rule 6 -$ATermEx $NumericEx; - -# Rule 7 -$UpperEx $ATermEx $UpperEx; - -#Rule 8 -# Note: follows errata for Unicode 5.0 boundary rules. 
-$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*; -$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; - -# Rule 8a -($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx); - -#Rule 9, 10, 11 -($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?; - -#Rule 12 -[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai]; -[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100}; - -## ------------------------------------------------- - -!!reverse; - -$SpEx_R = ($Extend | $Format)* $Sp; -$ATermEx_R = ($Extend | $Format)* $ATerm; -$STermEx_R = ($Extend | $Format)* $STerm; -$CloseEx_R = ($Extend | $Format)* $Close; - -# -# Reverse rules. -# For now, use the old style inexact reverse rules, which are easier -# to write, but less efficient. -# TODO: exact reverse rules. It appears that exact reverse rules -# may require improving support for look-ahead breaks in the -# builder. Needs more investigation. -# - -[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; -#.*; - -# Explanation for this rule: -# -# It needs to back over -# The $Sep at which we probably begin -# All of the non $Sep chars leading to the preceding $Sep -# The preceding $Sep, which will be the second one that the rule matches. -# Any immediately preceding STerm or ATerm sequences. We need to see these -# to get the correct rule status when moving forwards again. -# -# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match -# the entire string. -# -# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be -# at the beginning of the string at this point, and we don't want to fail. -# Can only use {eof} once, and it is used later. -# - -- 2.39.2