summaryrefslogtreecommitdiffstats
path: root/i18npool/source/breakiterator/breakiterator_unicode.cxx
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 05:54:39 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 05:54:39 +0000
commit267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree358c9467650e1d0a1d7227a21dac2e3d08b622b2 /i18npool/source/breakiterator/breakiterator_unicode.cxx
parentInitial commit. (diff)
downloadlibreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz
libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip
Adding upstream version 4:24.2.0.upstream/4%24.2.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'i18npool/source/breakiterator/breakiterator_unicode.cxx')
-rw-r--r--i18npool/source/breakiterator/breakiterator_unicode.cxx608
1 files changed, 608 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx
new file mode 100644
index 0000000000..4927a82293
--- /dev/null
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -0,0 +1,608 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <breakiterator_unicode.hxx>
+#include <cppuhelper/supportsservice.hxx>
+#include <localedata.hxx>
+#include <i18nlangtag/languagetag.hxx>
+#include <i18nlangtag/languagetagicu.hxx>
+#include <unicode/uchar.h>
+#include <unicode/locid.h>
+#include <unicode/rbbi.h>
+#include <unicode/udata.h>
+#include <rtl/strbuf.hxx>
+#include <rtl/ustring.hxx>
+
+#include <com/sun/star/i18n/BreakType.hpp>
+#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
+#include <com/sun/star/i18n/WordType.hpp>
+
+U_CDECL_BEGIN
+extern const char OpenOffice_dat[];
+U_CDECL_END
+
+using namespace ::com::sun::star;
+using namespace ::com::sun::star::i18n;
+using namespace ::com::sun::star::lang;
+
+namespace i18npool {
+
+// Cache map of breakiterators, stores state information so has to be
+// thread_local.
+thread_local static BreakIterator_Unicode::BIMap theBIMap;
+
+BreakIterator_Unicode::BreakIterator_Unicode()
+ : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
+ , lineRule( "line" )
+ , icuBI( nullptr )
+{
+}
+
+BreakIterator_Unicode::~BreakIterator_Unicode()
+{
+}
+
+namespace {
+
+/*
+ Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
+ setbreakType method.
+*/
+class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
+{
+ public:
+ OOoRuleBasedBreakIterator(UDataMemory* image,
+ UErrorCode &status)
+ : icu::RuleBasedBreakIterator(image, status)
+ { };
+
+};
+
+}
+
+// loading ICU breakiterator on demand.
+void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
+ sal_Int16 rBreakType, sal_Int16 nWordType, const char *rule, const OUString& rText)
+{
+ bool bNewBreak = false;
+ UErrorCode status = U_ZERO_ERROR;
+ sal_Int16 breakType = 0;
+ switch (rBreakType) {
+ case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
+ case LOAD_WORD_BREAKITERATOR:
+ assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
+ icuBI=&words[nWordType];
+ switch (nWordType) {
+ case WordType::ANY_WORD: break; // odd but previous behavior
+ case WordType::ANYWORD_IGNOREWHITESPACES:
+ breakType = 0; rule = "edit_word"; break;
+ case WordType::DICTIONARY_WORD:
+ breakType = 1; rule = "dict_word"; break;
+ default:
+ case WordType::WORD_COUNT:
+ breakType = 2; rule = "count_word"; break;
+ }
+ break;
+ case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
+ case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
+ }
+
+ // Using the cache map prevents accessing the file system for each
+ // udata_open() where ICU tries first files then data objects. And that for
+ // two fallbacks worst case... for each new allocated EditEngine, layout
+ // cell, ... *ouch* Also non-rule locale based iterators can be mapped.
+ // This also speeds up loading iterators for alternating or generally more
+ // than one language/locale in that iterators are not constructed and
+ // destroyed en masse.
+ // Four possible keys, locale rule based with break type, locale rule based
+ // only, rule based only, locale based with break type. A fifth global key
+ // for the initial lookup.
+ // Multiple global keys may map to identical value data.
+ // All enums used here should be in the range 0..9 so assert that and avoid
+ // expensive numeric conversion in append() for faster construction of the
+ // always used global key.
+ assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
+ const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
+ OStringBuffer aKeyBuf(64);
+ aKeyBuf.append( aLangtagStr + ";" );
+ if (rule)
+ aKeyBuf.append(rule);
+ aKeyBuf.append(";" + OStringChar(static_cast<char>('0'+breakType)) + ";"
+ + OStringChar(static_cast<char>('0'+rBreakType)) + ";"
+ + OStringChar( static_cast<char>('0'+nWordType)));
+ // langtag;rule;breakType;rBreakType;nWordType
+ const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
+
+ if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
+ {
+
+ auto aMapIt( theBIMap.find( aBIMapGlobalKey));
+ bool bInMap = (aMapIt != theBIMap.end());
+ if (bInMap)
+ icuBI->mpValue = aMapIt->second;
+ else
+ icuBI->mpValue.reset();
+
+ if (!bInMap && rule)
+ do
+ {
+ const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
+
+ status = U_ZERO_ERROR;
+ udata_setAppData("OpenOffice", OpenOffice_dat, &status);
+ if ( !U_SUCCESS(status) )
+ throw uno::RuntimeException("udata_setAppData returned error " + OUString::createFromAscii(u_errorName(status)));
+
+ std::shared_ptr<OOoRuleBasedBreakIterator> rbi;
+
+ if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
+ {
+ // langtag;rule;breakType
+ const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
+ aMapIt = theBIMap.find( aBIMapRuleTypeKey);
+ bInMap = (aMapIt != theBIMap.end());
+ if (bInMap)
+ {
+ icuBI->mpValue = aMapIt->second;
+ icuBI->maBIMapKey = aBIMapGlobalKey;
+ theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+ break; // do
+ }
+
+ rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open("OpenOffice", "brk",
+ OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
+
+ if (U_SUCCESS(status))
+ {
+ icuBI->mpValue = std::make_shared<BI_ValueData>();
+ icuBI->mpValue->mpBreakIterator = rbi;
+ theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
+ }
+ else
+ {
+ rbi.reset();
+ }
+ }
+ //use icu's breakiterator for Thai, Tibetan and Dzongkha
+ else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
+ {
+ // language;rule (not langtag, unless we'd actually load such)
+ OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
+ const OString aBIMapRuleKey( aLanguage + ";" + rule);
+ aMapIt = theBIMap.find( aBIMapRuleKey);
+ bInMap = (aMapIt != theBIMap.end());
+ if (bInMap)
+ {
+ icuBI->mpValue = aMapIt->second;
+ icuBI->maBIMapKey = aBIMapGlobalKey;
+ theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+ break; // do
+ }
+
+ status = U_ZERO_ERROR;
+ OString aUDName = OString::Concat(rule) + "_" + aLanguage;
+ UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
+ if( U_SUCCESS(status) )
+ rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
+ if ( U_SUCCESS(status) )
+ {
+ icuBI->mpValue = std::make_shared<BI_ValueData>();
+ icuBI->mpValue->mpBreakIterator = rbi;
+ theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
+ }
+ else
+ {
+ rbi.reset();
+
+ // ;rule (only)
+ const OString aBIMapRuleOnlyKey( OString::Concat(";") + rule);
+ aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
+ bInMap = (aMapIt != theBIMap.end());
+ if (bInMap)
+ {
+ icuBI->mpValue = aMapIt->second;
+ icuBI->maBIMapKey = aBIMapGlobalKey;
+ theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+ break; // do
+ }
+
+ status = U_ZERO_ERROR;
+ pUData = udata_open("OpenOffice", "brk", rule, &status);
+ if( U_SUCCESS(status) )
+ rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status);
+ if ( U_SUCCESS(status) )
+ {
+ icuBI->mpValue = std::make_shared<BI_ValueData>();
+ icuBI->mpValue->mpBreakIterator = rbi;
+ theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
+ }
+ else
+ {
+ rbi.reset();
+ }
+ }
+ }
+ } while (false);
+
+ if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
+ do
+ {
+ // langtag;;;rBreakType (empty rule; empty breakType)
+ const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
+ aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
+ bInMap = (aMapIt != theBIMap.end());
+ if (bInMap)
+ {
+ icuBI->mpValue = aMapIt->second;
+ icuBI->maBIMapKey = aBIMapGlobalKey;
+ theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+ break; // do
+ }
+
+ icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
+ std::shared_ptr< icu::BreakIterator > pBI;
+
+ status = U_ZERO_ERROR;
+ switch (rBreakType) {
+ case LOAD_CHARACTER_BREAKITERATOR:
+ pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
+ break;
+ case LOAD_WORD_BREAKITERATOR:
+ pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
+ break;
+ case LOAD_SENTENCE_BREAKITERATOR:
+ pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
+ break;
+ case LOAD_LINE_BREAKITERATOR:
+ pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
+ break;
+ }
+ if ( !U_SUCCESS(status) || !pBI ) {
+ throw uno::RuntimeException("Failed to create ICU BreakIterator: error " + OUString::createFromAscii(u_errorName(status)));
+ }
+ icuBI->mpValue = std::make_shared<BI_ValueData>();
+ icuBI->mpValue->mpBreakIterator = pBI;
+ theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
+ } while (false);
+ if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
+ throw uno::RuntimeException("ICU BreakIterator is not properly initialized");
+ }
+ icuBI->maBIMapKey = aBIMapGlobalKey;
+ if (!bInMap)
+ theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
+ bNewBreak=true;
+ }
+
+ if (!(bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData))
+ return;
+
+ const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
+
+ status = U_ZERO_ERROR;
+ icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
+
+ if (!U_SUCCESS(status))
+ throw uno::RuntimeException("utext_openUChars returned error " + OUString::createFromAscii(u_errorName(status)));
+
+ icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
+
+ if (!U_SUCCESS(status))
+ throw uno::RuntimeException("Failed to set text for ICU BreakIterator: error " + OUString::createFromAscii(u_errorName(status)));
+
+ icuBI->mpValue->maICUText = rText;
+}
+
+sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
+ sal_Int32 nStartPos, const lang::Locale &rLocale,
+ sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
+{
+ if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
+ loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
+ icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
+ for (nDone = 0; nDone < nCount; nDone++) {
+ nStartPos = pBI->following(nStartPos);
+ if (nStartPos == icu::BreakIterator::DONE)
+ return Text.getLength();
+ }
+ } else { // for CHARACTER mode
+ for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
+ Text.iterateCodePoints(&nStartPos);
+ }
+ return nStartPos;
+}
+
+sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
+ sal_Int32 nStartPos, const lang::Locale& rLocale,
+ sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
+{
+ if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
+ loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
+ icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
+ for (nDone = 0; nDone < nCount; nDone++) {
+ nStartPos = pBI->preceding(nStartPos);
+ if (nStartPos == icu::BreakIterator::DONE)
+ return 0;
+ }
+ } else { // for BS to delete one char and CHARACTER mode.
+ for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
+ Text.iterateCodePoints(&nStartPos, -1);
+ }
+ return nStartPos;
+}
+
+
+Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
+ const lang::Locale& rLocale, sal_Int16 rWordType )
+{
+ loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
+
+ Boundary rv;
+ rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
+ if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
+ rv.endPos = result.startPos;
+ else {
+ if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
+ && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
+ || (rWordType == WordType::DICTIONARY_WORD
+ && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
+ rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
+
+ rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
+ if(rv.endPos == icu::BreakIterator::DONE)
+ rv.endPos = rv.startPos;
+ }
+ return rv;
+}
+
+
+Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
+ const lang::Locale& rLocale, sal_Int16 rWordType)
+{
+ loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
+
+ Boundary rv;
+ rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
+ if( rv.startPos < 0)
+ rv.endPos = rv.startPos;
+ else {
+
+ if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
+ && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))
+ || (rWordType == WordType::DICTIONARY_WORD
+ && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0))))
+ rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
+
+ rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
+ if(rv.endPos == icu::BreakIterator::DONE)
+ rv.endPos = rv.startPos;
+ }
+ return rv;
+}
+
+
+Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
+ sal_Int16 rWordType, sal_Bool bDirection )
+{
+ loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
+ sal_Int32 len = Text.getLength();
+
+ Boundary rv;
+ if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
+ rv.startPos = rv.endPos = nPos;
+ if((bDirection || nPos == 0) && nPos < len) //forward
+ rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
+ else
+ rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
+ } else {
+ if(nPos <= 0) {
+ rv.startPos = 0;
+ rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
+ } else if(nPos >= len) {
+ rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
+ rv.endPos = len;
+ } else {
+ rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
+ rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
+ }
+ }
+ if (rv.startPos == icu::BreakIterator::DONE)
+ rv.startPos = rv.endPos;
+ else if (rv.endPos == icu::BreakIterator::DONE)
+ rv.endPos = rv.startPos;
+
+ return rv;
+}
+
+
+sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
+ const lang::Locale &rLocale )
+{
+ loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
+
+ sal_Int32 len = Text.getLength();
+ if (len > 0 && nStartPos == len)
+ Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
+ if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
+ nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
+
+ // skip preceding space.
+ sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
+ while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
+ Text.iterateCodePoints(&nStartPos, -1);
+
+ return nStartPos;
+}
+
+sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
+ const lang::Locale &rLocale )
+{
+ loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
+
+ sal_Int32 len = Text.getLength();
+ if (len > 0 && nStartPos == len)
+ Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
+ nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
+
+ sal_Int32 nPos=nStartPos;
+ while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
+
+ return nStartPos;
+}
+
+LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
+ const OUString& Text, sal_Int32 nStartPos,
+ const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
+ const LineBreakHyphenationOptions& hOptions,
+ const LineBreakUserOptions& /*rOptions*/ )
+{
+ LineBreakResults lbr;
+
+ if (nStartPos >= Text.getLength()) {
+ lbr.breakIndex = Text.getLength();
+ lbr.breakType = BreakType::WORDBOUNDARY;
+ return lbr;
+ }
+
+ loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
+
+ icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
+ bool GlueSpace=true;
+ while (GlueSpace) {
+ // don't break with Slash U+002F SOLIDUS at end of line; see "else" below!
+ if (pLineBI->preceding(nStartPos + 1) == nStartPos
+ && (nStartPos == 0 || Text[nStartPos - 1] != '/'))
+ { //Line boundary break
+ lbr.breakIndex = nStartPos;
+ lbr.breakType = BreakType::WORDBOUNDARY;
+ } else if (hOptions.rHyphenator.is()) { //Hyphenation break
+ sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
+ pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
+
+ sal_Int32 nStartPosWordEnd = nStartPos;
+ while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
+ nStartPosWordEnd --;
+
+ Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
+ WordType::DICTIONARY_WORD, false);
+
+ nStartPosWordEnd = wBoundary.endPos;
+ while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
+ nStartPosWordEnd ++;
+ nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
+ if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
+#define SPACE 0x0020
+ while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
+ uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
+ wBoundary.endPos - wBoundary.startPos), rLocale,
+ static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
+ if (aHyphenatedWord.is()) {
+ lbr.rHyphenatedWord = aHyphenatedWord;
+ if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
+ lbr.breakIndex = -1;
+ else
+ lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
+ lbr.breakType = BreakType::HYPHENATION;
+
+ // check not optimal hyphenation of "word-word" (word with hyphens)
+ if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
+ lbr.breakIndex = pLineBI->current();
+ lbr.breakType = BreakType::WORDBOUNDARY;
+ }
+
+ } else {
+ lbr.breakIndex = pLineBI->preceding(nStartPos);
+ lbr.breakType = BreakType::WORDBOUNDARY;
+ }
+ } else { //word boundary break
+ lbr.breakIndex = pLineBI->preceding(nStartPos);
+ lbr.breakType = BreakType::WORDBOUNDARY;
+
+ // Special case for Slash U+002F SOLIDUS in URI and path names.
+ // TR14 defines that as SY: Symbols Allowing Break After (A).
+ // This is unwanted in paths, see also i#17155
+ if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
+ {
+ // Look backward and take any whitespace before as a break
+ // opportunity. This also glues something like "w/o".
+ // Avoid an overly long path and break it as was indicated.
+ // Overly long here is arbitrarily defined.
+ const sal_Int32 nOverlyLong = 66;
+ sal_Int32 nPos = lbr.breakIndex - 1;
+ while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
+ {
+ if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
+ {
+ lbr.breakIndex = nPos + 1;
+ break;
+ }
+ }
+ }
+ }
+
+#define WJ 0x2060 // Word Joiner
+ GlueSpace=false;
+ if (lbr.breakType == BreakType::WORDBOUNDARY) {
+ nStartPos = lbr.breakIndex;
+ if (nStartPos >= 0 && Text[nStartPos--] == WJ)
+ GlueSpace=true;
+ while (nStartPos >= 0 &&
+ (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
+ if (Text[nStartPos--] == WJ)
+ GlueSpace=true;
+ }
+ if (GlueSpace && nStartPos < 0) {
+ lbr.breakIndex = 0;
+ break;
+ }
+ }
+ }
+
+ return lbr;
+}
+
+OUString SAL_CALL
+BreakIterator_Unicode::getImplementationName()
+{
+ return OUString::createFromAscii(cBreakIterator);
+}
+
+sal_Bool SAL_CALL
+BreakIterator_Unicode::supportsService(const OUString& rServiceName)
+{
+ return cppu::supportsService(this, rServiceName);
+}
+
+uno::Sequence< OUString > SAL_CALL
+BreakIterator_Unicode::getSupportedServiceNames()
+{
+ uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
+ return aRet;
+}
+
+}
+
+extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
+com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
+ css::uno::XComponentContext *,
+ css::uno::Sequence<css::uno::Any> const &)
+{
+ return cppu::acquire(new i18npool::BreakIterator_Unicode());
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */