Adding upstream version 4:24.2.0. (tag: upstream/4%24.2.0)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:54:39 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:54:39 +0000
commit: 267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree: 358c9467650e1d0a1d7227a21dac2e3d08b622b2 /i18npool/source/breakiterator/breakiterator_unicode.cxx
parent: Initial commit. (diff)
download: libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz
libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip
1 files changed, 608 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx
new file mode 100644
index 0000000000..4927a82293
--- /dev/null
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -0,0 +1,608 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <breakiterator_unicode.hxx>
+#include <cppuhelper/supportsservice.hxx>
+#include <localedata.hxx>
+#include <i18nlangtag/languagetag.hxx>
+#include <i18nlangtag/languagetagicu.hxx>
+#include <unicode/uchar.h>
+#include <unicode/locid.h>
+#include <unicode/rbbi.h>
+#include <unicode/udata.h>
+#include <rtl/strbuf.hxx>
+#include <rtl/ustring.hxx>
+
+#include <com/sun/star/i18n/BreakType.hpp>
+#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
+#include <com/sun/star/i18n/WordType.hpp>
+
+U_CDECL_BEGIN
+extern const char OpenOffice_dat[];
+U_CDECL_END
+
+using namespace ::com::sun::star;
+using namespace ::com::sun::star::i18n;
+using namespace ::com::sun::star::lang;
+
+namespace i18npool {
+
+// Per-thread cache of break iterators, filled and queried by
+// loadICUBreakIterator(). The cached values carry iterator state, which is
+// why every thread gets its own map.
+static thread_local BreakIterator_Unicode::BIMap theBIMap;
+
+// Sets the implementation name and the default line rule name; no iterator
+// slot is selected until loadICUBreakIterator() runs.
+BreakIterator_Unicode::BreakIterator_Unicode() :
+    cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ),
+    lineRule( "line" ),
+    icuBI( nullptr )
+{
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+BreakIterator_Unicode::~BreakIterator_Unicode() = default;
+
+namespace {
+
+/*
+    Thin subclass of icu::RuleBasedBreakIterator whose only member is a public
+    constructor that forwards a UDataMemory image and an error status to the
+    base class constructor.
+*/
+class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
+{
+public:
+    OOoRuleBasedBreakIterator(UDataMemory* image, UErrorCode& status)
+        : icu::RuleBasedBreakIterator(image, status)
+    {
+    }
+};
+
+}
+
+// Loads (on demand) the ICU break iterator matching the requested locale and
+// break type, and binds it to rText.
+//
+// rLocale     locale whose break iterator rules / ICU locale are used
+// rBreakType  one of the LOAD_*_BREAKITERATOR values; selects the member slot
+//             (character, words[nWordType], sentence or line) that icuBI is
+//             pointed at
+// nWordType   WordType constant; selects the words[] slot for
+//             LOAD_WORD_BREAKITERATOR and, except for ANY_WORD, overrides
+//             rule there. Always part of the cache key.
+// rule        name of the rule data to open from the "OpenOffice" data
+//             package, or nullptr to skip the rule based lookup
+// rText       text the selected iterator is set to
+//
+// Throws uno::RuntimeException if ICU reports an error or no iterator could
+// be obtained.
+void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
+        sal_Int16 rBreakType, sal_Int16 nWordType, const char *rule, const OUString& rText)
+{
+    bool bNewBreak = false;
+    UErrorCode status = U_ZERO_ERROR;
+    // Index into the sequence returned by getBreakIteratorRules(), see below.
+    sal_Int16 breakType = 0;
+    switch (rBreakType) {
+        case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
+        case LOAD_WORD_BREAKITERATOR:
+            assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
+            icuBI=&words[nWordType];
+            switch (nWordType) {
+                // ANY_WORD keeps the rule passed in by the caller and breakType 0.
+                case WordType::ANY_WORD: break; // odd but previous behavior
+                case WordType::ANYWORD_IGNOREWHITESPACES:
+                    breakType = 0; rule = "edit_word"; break;
+                case WordType::DICTIONARY_WORD:
+                    breakType = 1; rule = "dict_word"; break;
+                default:
+                case WordType::WORD_COUNT:
+                    breakType = 2; rule = "count_word"; break;
+            }
+            break;
+        case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
+        case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
+    }
+
+    // Using the cache map prevents accessing the file system for each
+    // udata_open() where ICU tries first files then data objects. And that for
+    // two fallbacks worst case... for each new allocated EditEngine, layout
+    // cell, ... *ouch*  Also non-rule locale based iterators can be mapped.
+    // This also speeds up loading iterators for alternating or generally more
+    // than one language/locale in that iterators are not constructed and
+    // destroyed en masse.
+    // Four possible keys, locale rule based with break type, locale rule based
+    // only, rule based only, locale based with break type. A fifth global key
+    // for the initial lookup.
+    // Multiple global keys may map to identical value data.
+    // All enums used here should be in the range 0..9 so assert that and avoid
+    // expensive numeric conversion in append() for faster construction of the
+    // always used global key.
+    assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
+    const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
+    OStringBuffer aKeyBuf(64);
+    aKeyBuf.append( aLangtagStr + ";" );
+    if (rule)
+        aKeyBuf.append(rule);
+    aKeyBuf.append(";" + OStringChar(static_cast<char>('0'+breakType)) + ";"
+        + OStringChar(static_cast<char>('0'+rBreakType)) + ";"
+        + OStringChar( static_cast<char>('0'+nWordType)));
+    // langtag;rule;breakType;rBreakType;nWordType
+    const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
+
+    // Only do the lookup/loading work if the selected slot does not already
+    // hold a usable iterator for exactly this global key.
+    if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
+    {
+
+        // First try: the global key in the per-thread cache.
+        auto aMapIt( theBIMap.find( aBIMapGlobalKey));
+        bool bInMap = (aMapIt != theBIMap.end());
+        if (bInMap)
+            icuBI->mpValue = aMapIt->second;
+        else
+            icuBI->mpValue.reset();
+
+        // Second try: a rule based iterator from the "OpenOffice" data. The
+        // do { } while (false) only exists so that "break" can leave early
+        // once a cached value was found under one of the narrower keys.
+        if (!bInMap && rule)
+            do
+            {
+                const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
+
+                status = U_ZERO_ERROR;
+                udata_setAppData("OpenOffice", OpenOffice_dat, &status);
+                if ( !U_SUCCESS(status) )
+                    throw uno::RuntimeException("udata_setAppData returned error " + OUString::createFromAscii(u_errorName(status)));
+
+                std::shared_ptr<OOoRuleBasedBreakIterator> rbi;
+
+                // The locale data names a rule for this breakType: use it.
+                if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
+                {
+                    // langtag;rule;breakType
+                    const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
+                    aMapIt = theBIMap.find( aBIMapRuleTypeKey);
+                    bInMap = (aMapIt != theBIMap.end());
+                    if (bInMap)
+                    {
+                        // Cached under the narrower key: also register it
+                        // under the global key for the next lookup.
+                        icuBI->mpValue = aMapIt->second;
+                        icuBI->maBIMapKey = aBIMapGlobalKey;
+                        theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+                        break;  // do
+                    }
+
+                    rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open("OpenOffice", "brk",
+                        OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
+
+                    if (U_SUCCESS(status))
+                    {
+                        icuBI->mpValue = std::make_shared<BI_ValueData>();
+                        icuBI->mpValue->mpBreakIterator = rbi;
+                        theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
+                    }
+                    else
+                    {
+                        rbi.reset();
+                    }
+                }
+                // Use icu's own breakiterator (created further below) for
+                // Thai, Lao, Tibetan, Dzongkha and Khmer; every other language
+                // tries the generic rule data here.
+                else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
+                {
+                    // language;rule (not langtag, unless we'd actually load such)
+                    OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
+                    const OString aBIMapRuleKey( aLanguage + ";" + rule);
+                    aMapIt = theBIMap.find( aBIMapRuleKey);
+                    bInMap = (aMapIt != theBIMap.end());
+                    if (bInMap)
+                    {
+                        icuBI->mpValue = aMapIt->second;
+                        icuBI->maBIMapKey = aBIMapGlobalKey;
+                        theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+                        break;  // do
+                    }
+
+                    // Try language specific rule data named "<rule>_<language>".
+                    status = U_ZERO_ERROR;
+                    OString aUDName = OString::Concat(rule) + "_" + aLanguage;
+                    UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
+                    if( U_SUCCESS(status) )
+                        rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
+                    if ( U_SUCCESS(status) )
+                    {
+                        icuBI->mpValue = std::make_shared<BI_ValueData>();
+                        icuBI->mpValue->mpBreakIterator = rbi;
+                        theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
+                    }
+                    else
+                    {
+                        rbi.reset();
+
+                        // Language specific data failed: fall back to the
+                        // rule data without language suffix.
+                        // ;rule (only)
+                        const OString aBIMapRuleOnlyKey( OString::Concat(";") + rule);
+                        aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
+                        bInMap = (aMapIt != theBIMap.end());
+                        if (bInMap)
+                        {
+                            icuBI->mpValue = aMapIt->second;
+                            icuBI->maBIMapKey = aBIMapGlobalKey;
+                            theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+                            break;  // do
+                        }
+
+                        status = U_ZERO_ERROR;
+                        pUData = udata_open("OpenOffice", "brk", rule, &status);
+                        if( U_SUCCESS(status) )
+                            rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
+                        if ( U_SUCCESS(status) )
+                        {
+                            icuBI->mpValue = std::make_shared<BI_ValueData>();
+                            icuBI->mpValue->mpBreakIterator = rbi;
+                            theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
+                        }
+                        else
+                        {
+                            rbi.reset();
+                        }
+                    }
+                }
+            } while (false);
+
+        // Last resort: no rule based iterator available, so ask ICU for its
+        // locale based iterator of the requested kind.
+        if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
+            do
+            {
+                // langtag;;;rBreakType (empty rule; empty breakType)
+                const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
+                aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
+                bInMap = (aMapIt != theBIMap.end());
+                if (bInMap)
+                {
+                    icuBI->mpValue = aMapIt->second;
+                    icuBI->maBIMapKey = aBIMapGlobalKey;
+                    theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+                    break;  // do
+                }
+
+                icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
+                std::shared_ptr< icu::BreakIterator > pBI;
+
+                status = U_ZERO_ERROR;
+                switch (rBreakType) {
+                    case LOAD_CHARACTER_BREAKITERATOR:
+                        pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
+                        break;
+                    case LOAD_WORD_BREAKITERATOR:
+                        pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
+                        break;
+                    case LOAD_SENTENCE_BREAKITERATOR:
+                        pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
+                        break;
+                    case LOAD_LINE_BREAKITERATOR:
+                        pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
+                        break;
+                }
+                if ( !U_SUCCESS(status) || !pBI ) {
+                    throw uno::RuntimeException("Failed to create ICU BreakIterator: error " + OUString::createFromAscii(u_errorName(status)));
+                }
+                icuBI->mpValue = std::make_shared<BI_ValueData>();
+                icuBI->mpValue->mpBreakIterator = pBI;
+                theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
+            } while (false);
+        if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
+            throw uno::RuntimeException("ICU BreakIterator is not properly initialized");
+        }
+        // Remember which key the slot now serves and make sure the value is
+        // reachable under the global key next time.
+        icuBI->maBIMapKey = aBIMapGlobalKey;
+        if (!bInMap)
+            theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+        bNewBreak=true;
+    }
+
+    // Nothing more to do if the iterator was kept and is already set to this
+    // very string buffer (compared by pData pointer, not by content).
+    if (!(bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData))
+        return;
+
+    const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
+
+    status = U_ZERO_ERROR;
+    icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
+
+    if (!U_SUCCESS(status))
+        throw uno::RuntimeException("utext_openUChars returned error " + OUString::createFromAscii(u_errorName(status)));
+
+    icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
+
+    if (!U_SUCCESS(status))
+        throw uno::RuntimeException("Failed to set text for ICU BreakIterator: error " + OUString::createFromAscii(u_errorName(status)));
+
+    // Keep a reference to the text the iterator is now set to; it is what the
+    // pData comparison above checks on the next call.
+    icuBI->mpValue->maICUText = rText;
+}
+
+// Moves forward from nStartPos by up to nCount steps and returns the new
+// position. In SKIPCELL mode a step is one boundary of the character break
+// iterator, otherwise one code point. nDone receives the number of steps
+// actually taken.
+sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
+        sal_Int32 nStartPos, const lang::Locale &rLocale,
+        sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
+{
+    nDone = 0;
+
+    if (nCharacterIteratorMode != CharacterIteratorMode::SKIPCELL)
+    {
+        // CHARACTER mode: walk code point by code point, never past the end.
+        while (nDone < nCount && nStartPos < Text.getLength())
+        {
+            Text.iterateCodePoints(&nStartPos);
+            ++nDone;
+        }
+        return nStartPos;
+    }
+
+    // CELL mode: let the character break iterator find the next boundary.
+    loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
+    icu::BreakIterator* const pCellIter = character.mpValue->mpBreakIterator.get();
+    while (nDone < nCount)
+    {
+        nStartPos = pCellIter->following(nStartPos);
+        if (nStartPos == icu::BreakIterator::DONE)
+            return Text.getLength();    // ran out of boundaries
+        ++nDone;
+    }
+    return nStartPos;
+}
+
+// Moves backward from nStartPos by up to nCount steps and returns the new
+// position. In SKIPCELL mode a step is one boundary of the character break
+// iterator, otherwise one code point. nDone receives the number of steps
+// actually taken.
+sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
+        sal_Int32 nStartPos, const lang::Locale& rLocale,
+        sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
+{
+    nDone = 0;
+
+    if (nCharacterIteratorMode != CharacterIteratorMode::SKIPCELL)
+    {
+        // CHARACTER mode (also used for BS deleting one char): step back one
+        // code point at a time, never before the start.
+        while (nDone < nCount && nStartPos > 0)
+        {
+            Text.iterateCodePoints(&nStartPos, -1);
+            ++nDone;
+        }
+        return nStartPos;
+    }
+
+    // CELL mode: let the character break iterator find the previous boundary.
+    loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
+    icu::BreakIterator* const pCellIter = character.mpValue->mpBreakIterator.get();
+    while (nDone < nCount)
+    {
+        nStartPos = pCellIter->preceding(nStartPos);
+        if (nStartPos == icu::BreakIterator::DONE)
+            return 0;   // ran out of boundaries
+        ++nDone;
+    }
+    return nStartPos;
+}
+
+
+// Returns the boundary of the word following nStartPos.
+//
+// Text       text to search in
+// nStartPos  position after which the next word boundary is looked up
+// rLocale    locale used to select the word break iterator
+// rWordType  WordType constant selecting the word break rules
+//
+// If there is no boundary after nStartPos inside the text, startPos and endPos
+// of the result are equal. May throw uno::RuntimeException from
+// loadICUBreakIterator().
+Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
+    const lang::Locale& rLocale, sal_Int16 rWordType )
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
+
+    Boundary rv;
+    rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
+    if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
+        // No further word: report an empty range at the position just found.
+        // This must read the local result, as previousWord() does.
+        rv.endPos = rv.startPos;
+    else {
+        // For these two word types a boundary that lands on white space is
+        // skipped, so the result starts at the next boundary instead.
+        if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
+             && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
+            || (rWordType == WordType::DICTIONARY_WORD
+                && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
+            rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
+
+        rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
+        if(rv.endPos == icu::BreakIterator::DONE)
+            rv.endPos = rv.startPos;
+    }
+    return rv;
+}
+
+
+// Returns the boundary of the word preceding nStartPos. If the iterator finds
+// no boundary before nStartPos, startPos and endPos of the result are both the
+// (negative) value it returned.
+Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale& rLocale, sal_Int16 rWordType)
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
+    icu::BreakIterator& rWordIter = *icuBI->mpValue->mpBreakIterator;
+
+    Boundary aWord;
+    aWord.startPos = rWordIter.preceding(nStartPos);
+    if (aWord.startPos < 0)
+    {
+        aWord.endPos = aWord.startPos;
+        return aWord;
+    }
+
+    // For these two word types a boundary that lands on white space is
+    // skipped, so the word starts one boundary further back.
+    bool bOnWhiteSpace = false;
+    switch (rWordType)
+    {
+        case WordType::ANYWORD_IGNOREWHITESPACES:
+            bOnWhiteSpace = u_isUWhiteSpace(Text.iterateCodePoints(&aWord.startPos, 0));
+            break;
+        case WordType::DICTIONARY_WORD:
+            bOnWhiteSpace = u_isWhitespace(Text.iterateCodePoints(&aWord.startPos, 0));
+            break;
+        default:
+            break;
+    }
+    if (bOnWhiteSpace)
+        aWord.startPos = rWordIter.preceding(aWord.startPos);
+
+    aWord.endPos = rWordIter.following(aWord.startPos);
+    if (aWord.endPos == icu::BreakIterator::DONE)
+        aWord.endPos = aWord.startPos;
+
+    return aWord;
+}
+
+
+// Returns the word boundaries around nPos. When nPos itself is a boundary,
+// bDirection decides whether the word after (true) or before (false) nPos is
+// reported; at position 0 the forward direction is always used, and at or
+// past the end of the text the backward one.
+Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
+        sal_Int16 rWordType, sal_Bool bDirection )
+{
+    loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
+    icu::BreakIterator& rWordIter = *icuBI->mpValue->mpBreakIterator;
+    const sal_Int32 nTextLen = Text.getLength();
+
+    Boundary aBound;
+    if (rWordIter.isBoundary(nPos))
+    {
+        aBound.startPos = nPos;
+        aBound.endPos = nPos;
+        const bool bLookForward = (bDirection || nPos == 0) && nPos < nTextLen;
+        if (bLookForward)
+            aBound.endPos = rWordIter.following(nPos);
+        else
+            aBound.startPos = rWordIter.preceding(nPos);
+    }
+    else if (nPos <= 0)
+    {
+        // At or before the start: the word begins at 0.
+        aBound.startPos = 0;
+        if (nTextLen)
+            aBound.endPos = rWordIter.following(sal_Int32(0));
+        else
+            aBound.endPos = 0;
+    }
+    else if (nPos >= nTextLen)
+    {
+        // At or past the end: the word ends at the text length.
+        aBound.startPos = rWordIter.preceding(nTextLen);
+        aBound.endPos = nTextLen;
+    }
+    else
+    {
+        // Inside a word: take the boundaries on either side.
+        aBound.startPos = rWordIter.preceding(nPos);
+        aBound.endPos = rWordIter.following(nPos);
+    }
+
+    // Collapse the range if the iterator could not supply one of the ends.
+    if (aBound.startPos == icu::BreakIterator::DONE)
+        aBound.startPos = aBound.endPos;
+    else if (aBound.endPos == icu::BreakIterator::DONE)
+        aBound.endPos = aBound.startPos;
+
+    return aBound;
+}
+
+
+// Returns the start position of the sentence containing nStartPos, with white
+// space at the beginning of that sentence skipped.
+sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale &rLocale )
+{
+    loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
+    icu::BreakIterator& rSentenceIter = *sentence.mpValue->mpBreakIterator;
+
+    const sal_Int32 nTextLen = Text.getLength();
+    // issue #i27703# treat end position as part of last sentence
+    if (nTextLen > 0 && nStartPos == nTextLen)
+        Text.iterateCodePoints(&nStartPos, -1);
+
+    if (!rSentenceIter.isBoundary(nStartPos))
+        nStartPos = rSentenceIter.preceding(nStartPos);
+
+    // Skip white space at the sentence start: consume code points while they
+    // are white space, then step back over the last one consumed.
+    sal_uInt32 nCodePoint = Text.iterateCodePoints(&nStartPos);
+    while (nStartPos < nTextLen && u_isWhitespace(nCodePoint))
+    {
+        nCodePoint = Text.iterateCodePoints(&nStartPos);
+    }
+    Text.iterateCodePoints(&nStartPos, -1);
+
+    return nStartPos;
+}
+
+// Returns the end position of the sentence containing nStartPos, with white
+// space directly before the following sentence boundary excluded.
+sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale &rLocale )
+{
+    loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
+
+    const sal_Int32 nTextLen = Text.getLength();
+    // issue #i27703# treat end position as part of last sentence
+    if (nTextLen > 0 && nStartPos == nTextLen)
+        Text.iterateCodePoints(&nStartPos, -1);
+
+    nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
+
+    // Walk backwards over trailing white space, pulling the result back with
+    // every white space code point passed.
+    sal_Int32 nScan = nStartPos;
+    while (nScan > 0)
+    {
+        if (!u_isWhitespace(Text.iterateCodePoints(&nScan, -1)))
+            break;
+        nStartPos = nScan;
+    }
+
+    return nStartPos;
+}
+
+// Determines where the line can be broken in Text at or before nStartPos.
+//
+// Text          text to break
+// nStartPos     position at which a break is wanted
+// rLocale       locale used to select the line (and word) break iterator
+// nMinBreakPos  a hyphenation whose position + 1 lies before this value is
+//               rejected (breakIndex is set to -1)
+// hOptions      hyphenation options; hyphenation is only attempted if
+//               hOptions.rHyphenator is set
+// (last param)  user options, not used by this implementation
+//
+// Returns a LineBreakResults with breakIndex and breakType filled in, plus
+// rHyphenatedWord when the hyphenator delivered a result. May throw
+// uno::RuntimeException from loadICUBreakIterator().
+LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
+        const OUString& Text, sal_Int32 nStartPos,
+        const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
+        const LineBreakHyphenationOptions& hOptions,
+        const LineBreakUserOptions& /*rOptions*/ )
+{
+    LineBreakResults lbr;
+
+    // At or past the end of the text: break at the end.
+    if (nStartPos >= Text.getLength()) {
+        lbr.breakIndex = Text.getLength();
+        lbr.breakType = BreakType::WORDBOUNDARY;
+        return lbr;
+    }
+
+    loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
+
+    icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
+    // The loop is repeated as long as a Word Joiner was found next to the
+    // break just computed (see the end of the loop body).
+    bool GlueSpace=true;
+    while (GlueSpace) {
+        // don't break with Slash U+002F SOLIDUS at end of line; see "else" below!
+        // preceding(nStartPos + 1) == nStartPos means nStartPos itself is a
+        // line boundary.
+        if (pLineBI->preceding(nStartPos + 1) == nStartPos
+                && (nStartPos == 0 || Text[nStartPos - 1] != '/'))
+        { //Line boundary break
+            lbr.breakIndex = nStartPos;
+            lbr.breakType = BreakType::WORDBOUNDARY;
+        } else if (hOptions.rHyphenator.is()) { //Hyphenation break
+            // Next line boundary after the one found by the condition above,
+            // or 0 if there is none.
+            sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
+            pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
+
+            // Step back over punctuation between the line boundary and
+            // nStartPos before asking for the word boundary.
+            sal_Int32 nStartPosWordEnd = nStartPos;
+            while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
+                nStartPosWordEnd --;
+
+            Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
+                WordType::DICTIONARY_WORD, false);
+
+            // From here on nStartPosWordEnd is reused: first as scan position
+            // over punctuation following the word, then as the number of such
+            // punctuation characters (capped by the hyphen index offset).
+            nStartPosWordEnd = wBoundary.endPos;
+            while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
+                nStartPosWordEnd ++;
+            nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
+            if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
+#define SPACE 0x0020
+            // Moves boundary_with_punctuation back over spaces (empty loop body).
+            // NOTE(review): the resulting value is not read again in this function.
+            while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
+            // Hyphenate the word; the maximum hyphen position is relative to
+            // the word start and reduced by the trailing punctuation count
+            // when the hyphen index coincides with the word end.
+            uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
+                        wBoundary.endPos - wBoundary.startPos), rLocale,
+                    static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
+            if (aHyphenatedWord.is()) {
+                lbr.rHyphenatedWord = aHyphenatedWord;
+                if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
+                    lbr.breakIndex = -1;
+                else
+                    lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
+                lbr.breakType = BreakType::HYPHENATION;
+
+                // check not optimal hyphenation of "word-word" (word with hyphens)
+                // i.e. prefer the line boundary if it lies after the
+                // hyphenation position.
+                if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
+                    lbr.breakIndex = pLineBI->current();
+                    lbr.breakType = BreakType::WORDBOUNDARY;
+                }
+
+            } else {
+                // No hyphenation possible: fall back to the previous line boundary.
+                lbr.breakIndex = pLineBI->preceding(nStartPos);
+                lbr.breakType = BreakType::WORDBOUNDARY;
+            }
+        } else { //word boundary break
+            lbr.breakIndex = pLineBI->preceding(nStartPos);
+            lbr.breakType = BreakType::WORDBOUNDARY;
+
+            // Special case for Slash U+002F SOLIDUS in URI and path names.
+            // TR14 defines that as SY: Symbols Allowing Break After (A).
+            // This is unwanted in paths, see also i#17155
+            if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
+            {
+                // Look backward and take any whitespace before as a break
+                // opportunity. This also glues something like "w/o".
+                // Avoid an overly long path and break it as was indicated.
+                // Overly long here is arbitrarily defined.
+                const sal_Int32 nOverlyLong = 66;
+                sal_Int32 nPos = lbr.breakIndex - 1;
+                while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
+                {
+                    if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
+                    {
+                        lbr.breakIndex = nPos + 1;
+                        break;
+                    }
+                }
+            }
+        }
+
+#define WJ 0x2060   // Word Joiner
+        // A Word Joiner at the break position, or among the white space
+        // directly before it, glues the neighbours together: in that case the
+        // search is repeated from the position before that run.
+        GlueSpace=false;
+        if (lbr.breakType == BreakType::WORDBOUNDARY) {
+            nStartPos = lbr.breakIndex;
+            if (nStartPos >= 0 && Text[nStartPos--] == WJ)
+                GlueSpace=true;
+            while (nStartPos >= 0 &&
+                    (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
+                if (Text[nStartPos--] == WJ)
+                    GlueSpace=true;
+            }
+            // Ran off the start of the text while gluing: break at 0.
+            if (GlueSpace && nStartPos < 0)  {
+                lbr.breakIndex = 0;
+                break;
+            }
+        }
+    }
+
+    return lbr;
+}
+
+// XServiceInfo: returns the implementation name stored in cBreakIterator.
+OUString SAL_CALL
+BreakIterator_Unicode::getImplementationName()
+{
+    OUString aImplName = OUString::createFromAscii(cBreakIterator);
+    return aImplName;
+}
+
+// XServiceInfo: delegates the service name check to cppu::supportsService().
+sal_Bool SAL_CALL
+BreakIterator_Unicode::supportsService(const OUString& rServiceName)
+{
+    const bool bSupported = cppu::supportsService(this, rServiceName);
+    return bSupported;
+}
+
+// XServiceInfo: the only supported service name is the implementation name.
+uno::Sequence< OUString > SAL_CALL
+BreakIterator_Unicode::getSupportedServiceNames()
+{
+    return uno::Sequence< OUString >{ OUString::createFromAscii(cBreakIterator) };
+}
+
+}
+
+// Exported component factory: creates a new BreakIterator_Unicode and returns
+// it acquired. Both arguments are ignored.
+extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
+com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
+    css::uno::XComponentContext *,
+    css::uno::Sequence<css::uno::Any> const &)
+{
+    auto* const pImpl = new i18npool::BreakIterator_Unicode();
+    return cppu::acquire(pImpl);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 05:54:39 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 05:54:39 +0000
commit	267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree	358c9467650e1d0a1d7227a21dac2e3d08b622b2 /i18npool/source/breakiterator/breakiterator_unicode.cxx
parent	Initial commit. (diff)
download	libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip