diff options
Diffstat (limited to 'i18npool/source/breakiterator/breakiterator_unicode.cxx')
-rw-r--r-- | i18npool/source/breakiterator/breakiterator_unicode.cxx | 608 |
1 files changed, 608 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx new file mode 100644 index 0000000000..4927a82293 --- /dev/null +++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx @@ -0,0 +1,608 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <breakiterator_unicode.hxx> +#include <cppuhelper/supportsservice.hxx> +#include <localedata.hxx> +#include <i18nlangtag/languagetag.hxx> +#include <i18nlangtag/languagetagicu.hxx> +#include <unicode/uchar.h> +#include <unicode/locid.h> +#include <unicode/rbbi.h> +#include <unicode/udata.h> +#include <rtl/strbuf.hxx> +#include <rtl/ustring.hxx> + +#include <com/sun/star/i18n/BreakType.hpp> +#include <com/sun/star/i18n/CharacterIteratorMode.hpp> +#include <com/sun/star/i18n/WordType.hpp> + +U_CDECL_BEGIN +extern const char OpenOffice_dat[]; +U_CDECL_END + +using namespace ::com::sun::star; +using namespace ::com::sun::star::i18n; +using namespace ::com::sun::star::lang; + +namespace i18npool { + +// Cache map of breakiterators, stores state information so has to be +// thread_local. +thread_local static BreakIterator_Unicode::BIMap theBIMap; + +BreakIterator_Unicode::BreakIterator_Unicode() + : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name + , lineRule( "line" ) + , icuBI( nullptr ) +{ +} + +BreakIterator_Unicode::~BreakIterator_Unicode() +{ +} + +namespace { + +/* + Wrapper class to provide public access to the icu::RuleBasedBreakIterator's + setbreakType method. +*/ +class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator +{ + public: + OOoRuleBasedBreakIterator(UDataMemory* image, + UErrorCode &status) + : icu::RuleBasedBreakIterator(image, status) + { }; + +}; + +} + +// loading ICU breakiterator on demand. +void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale, + sal_Int16 rBreakType, sal_Int16 nWordType, const char *rule, const OUString& rText) +{ + bool bNewBreak = false; + UErrorCode status = U_ZERO_ERROR; + sal_Int16 breakType = 0; + switch (rBreakType) { + case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break; + case LOAD_WORD_BREAKITERATOR: + assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT); + icuBI=&words[nWordType]; + switch (nWordType) { + case WordType::ANY_WORD: break; // odd but previous behavior + case WordType::ANYWORD_IGNOREWHITESPACES: + breakType = 0; rule = "edit_word"; break; + case WordType::DICTIONARY_WORD: + breakType = 1; rule = "dict_word"; break; + default: + case WordType::WORD_COUNT: + breakType = 2; rule = "count_word"; break; + } + break; + case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break; + case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break; + } + + // Using the cache map prevents accessing the file system for each + // udata_open() where ICU tries first files then data objects. And that for + // two fallbacks worst case... for each new allocated EditEngine, layout + // cell, ... *ouch* Also non-rule locale based iterators can be mapped. + // This also speeds up loading iterators for alternating or generally more + // than one language/locale in that iterators are not constructed and + // destroyed en masse. + // Four possible keys, locale rule based with break type, locale rule based + // only, rule based only, locale based with break type. A fifth global key + // for the initial lookup. + // Multiple global keys may map to identical value data. + // All enums used here should be in the range 0..9 so assert that and avoid + // expensive numeric conversion in append() for faster construction of the + // always used global key. + assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9); + const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8()); + OStringBuffer aKeyBuf(64); + aKeyBuf.append( aLangtagStr + ";" ); + if (rule) + aKeyBuf.append(rule); + aKeyBuf.append(";" + OStringChar(static_cast<char>('0'+breakType)) + ";" + + OStringChar(static_cast<char>('0'+rBreakType)) + ";" + + OStringChar( static_cast<char>('0'+nWordType))); + // langtag;rule;breakType;rBreakType;nWordType + const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear()); + + if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) + { + + auto aMapIt( theBIMap.find( aBIMapGlobalKey)); + bool bInMap = (aMapIt != theBIMap.end()); + if (bInMap) + icuBI->mpValue = aMapIt->second; + else + icuBI->mpValue.reset(); + + if (!bInMap && rule) + do + { + const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale); + + status = U_ZERO_ERROR; + udata_setAppData("OpenOffice", OpenOffice_dat, &status); + if ( !U_SUCCESS(status) ) + throw uno::RuntimeException("udata_setAppData returned error " + OUString::createFromAscii(u_errorName(status))); + + std::shared_ptr<OOoRuleBasedBreakIterator> rbi; + + if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty()) + { + // langtag;rule;breakType + const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType)); + aMapIt = theBIMap.find( aBIMapRuleTypeKey); + bInMap = (aMapIt != theBIMap.end()); + if (bInMap) + { + icuBI->mpValue = aMapIt->second; + icuBI->maBIMapKey = aBIMapGlobalKey; + theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); + break; // do + } + + rbi = std::make_shared<OOoRuleBasedBreakIterator>(udata_open("OpenOffice", "brk", + OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status); + + if (U_SUCCESS(status)) + { + icuBI->mpValue = std::make_shared<BI_ValueData>(); + icuBI->mpValue->mpBreakIterator = rbi; + theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue)); + } + else + { + rbi.reset(); + } + } + //use icu's breakiterator for Thai, Tibetan and Dzongkha + else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km") + { + // language;rule (not langtag, unless we'd actually load such) + OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8()); + const OString aBIMapRuleKey( aLanguage + ";" + rule); + aMapIt = theBIMap.find( aBIMapRuleKey); + bInMap = (aMapIt != theBIMap.end()); + if (bInMap) + { + icuBI->mpValue = aMapIt->second; + icuBI->maBIMapKey = aBIMapGlobalKey; + theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); + break; // do + } + + status = U_ZERO_ERROR; + OString aUDName = OString::Concat(rule) + "_" + aLanguage; + UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status); + if( U_SUCCESS(status) ) + rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status); + if ( U_SUCCESS(status) ) + { + icuBI->mpValue = std::make_shared<BI_ValueData>(); + icuBI->mpValue->mpBreakIterator = rbi; + theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue)); + } + else + { + rbi.reset(); + + // ;rule (only) + const OString aBIMapRuleOnlyKey( OString::Concat(";") + rule); + aMapIt = theBIMap.find( aBIMapRuleOnlyKey); + bInMap = (aMapIt != theBIMap.end()); + if (bInMap) + { + icuBI->mpValue = aMapIt->second; + icuBI->maBIMapKey = aBIMapGlobalKey; + theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); + break; // do + } + + status = U_ZERO_ERROR; + pUData = udata_open("OpenOffice", "brk", rule, &status); + if( U_SUCCESS(status) ) + rbi = std::make_shared<OOoRuleBasedBreakIterator>( pUData, status); + if ( U_SUCCESS(status) ) + { + icuBI->mpValue = std::make_shared<BI_ValueData>(); + icuBI->mpValue->mpBreakIterator = rbi; + theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue)); + } + else + { + rbi.reset(); + } + } + } + } while (false); + + if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) + do + { + // langtag;;;rBreakType (empty rule; empty breakType) + const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType)); + aMapIt = theBIMap.find( aBIMapLocaleTypeKey); + bInMap = (aMapIt != theBIMap.end()); + if (bInMap) + { + icuBI->mpValue = aMapIt->second; + icuBI->maBIMapKey = aBIMapGlobalKey; + theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); + break; // do + } + + icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale))); + std::shared_ptr< icu::BreakIterator > pBI; + + status = U_ZERO_ERROR; + switch (rBreakType) { + case LOAD_CHARACTER_BREAKITERATOR: + pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) ); + break; + case LOAD_WORD_BREAKITERATOR: + pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) ); + break; + case LOAD_SENTENCE_BREAKITERATOR: + pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) ); + break; + case LOAD_LINE_BREAKITERATOR: + pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) ); + break; + } + if ( !U_SUCCESS(status) || !pBI ) { + throw uno::RuntimeException("Failed to create ICU BreakIterator: error " + OUString::createFromAscii(u_errorName(status))); + } + icuBI->mpValue = std::make_shared<BI_ValueData>(); + icuBI->mpValue->mpBreakIterator = pBI; + theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue)); + } while (false); + if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) { + throw uno::RuntimeException("ICU BreakIterator is not properly initialized"); + } + icuBI->maBIMapKey = aBIMapGlobalKey; + if (!bInMap) + theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue)); + bNewBreak=true; + } + + if (!(bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData)) + return; + + const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr()); + + status = U_ZERO_ERROR; + icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status); + + if (!U_SUCCESS(status)) + throw uno::RuntimeException("utext_openUChars returned error " + OUString::createFromAscii(u_errorName(status))); + + icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status); + + if (!U_SUCCESS(status)) + throw uno::RuntimeException("Failed to set text for ICU BreakIterator: error " + OUString::createFromAscii(u_errorName(status))); + + icuBI->mpValue->maICUText = rText; +} + +sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text, + sal_Int32 nStartPos, const lang::Locale &rLocale, + sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone ) +{ + if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode + loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text); + icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get(); + for (nDone = 0; nDone < nCount; nDone++) { + nStartPos = pBI->following(nStartPos); + if (nStartPos == icu::BreakIterator::DONE) + return Text.getLength(); + } + } else { // for CHARACTER mode + for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++) + Text.iterateCodePoints(&nStartPos); + } + return nStartPos; +} + +sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text, + sal_Int32 nStartPos, const lang::Locale& rLocale, + sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone ) +{ + if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode + loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text); + icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get(); + for (nDone = 0; nDone < nCount; nDone++) { + nStartPos = pBI->preceding(nStartPos); + if (nStartPos == icu::BreakIterator::DONE) + return 0; + } + } else { // for BS to delete one char and CHARACTER mode. + for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++) + Text.iterateCodePoints(&nStartPos, -1); + } + return nStartPos; +} + + +Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos, + const lang::Locale& rLocale, sal_Int16 rWordType ) +{ + loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text); + + Boundary rv; + rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos); + if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE ) + rv.endPos = result.startPos; + else { + if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES + && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0))) + || (rWordType == WordType::DICTIONARY_WORD + && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)))) + rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos); + + rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos); + if(rv.endPos == icu::BreakIterator::DONE) + rv.endPos = rv.startPos; + } + return rv; +} + + +Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos, + const lang::Locale& rLocale, sal_Int16 rWordType) +{ + loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text); + + Boundary rv; + rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos); + if( rv.startPos < 0) + rv.endPos = rv.startPos; + else { + + if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES + && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0))) + || (rWordType == WordType::DICTIONARY_WORD + && u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)))) + rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos); + + rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos); + if(rv.endPos == icu::BreakIterator::DONE) + rv.endPos = rv.startPos; + } + return rv; +} + + +Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale, + sal_Int16 rWordType, sal_Bool bDirection ) +{ + loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text); + sal_Int32 len = Text.getLength(); + + Boundary rv; + if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) { + rv.startPos = rv.endPos = nPos; + if((bDirection || nPos == 0) && nPos < len) //forward + rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos); + else + rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos); + } else { + if(nPos <= 0) { + rv.startPos = 0; + rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0; + } else if(nPos >= len) { + rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len); + rv.endPos = len; + } else { + rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos); + rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos); + } + } + if (rv.startPos == icu::BreakIterator::DONE) + rv.startPos = rv.endPos; + else if (rv.endPos == icu::BreakIterator::DONE) + rv.endPos = rv.startPos; + + return rv; +} + + +sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos, + const lang::Locale &rLocale ) +{ + loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text); + + sal_Int32 len = Text.getLength(); + if (len > 0 && nStartPos == len) + Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence + if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos)) + nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos); + + // skip preceding space. + sal_uInt32 ch = Text.iterateCodePoints(&nStartPos); + while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos); + Text.iterateCodePoints(&nStartPos, -1); + + return nStartPos; +} + +sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos, + const lang::Locale &rLocale ) +{ + loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text); + + sal_Int32 len = Text.getLength(); + if (len > 0 && nStartPos == len) + Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence + nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos); + + sal_Int32 nPos=nStartPos; + while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos; + + return nStartPos; +} + +LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak( + const OUString& Text, sal_Int32 nStartPos, + const lang::Locale& rLocale, sal_Int32 nMinBreakPos, + const LineBreakHyphenationOptions& hOptions, + const LineBreakUserOptions& /*rOptions*/ ) +{ + LineBreakResults lbr; + + if (nStartPos >= Text.getLength()) { + lbr.breakIndex = Text.getLength(); + lbr.breakType = BreakType::WORDBOUNDARY; + return lbr; + } + + loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text); + + icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get(); + bool GlueSpace=true; + while (GlueSpace) { + // don't break with Slash U+002F SOLIDUS at end of line; see "else" below! + if (pLineBI->preceding(nStartPos + 1) == nStartPos + && (nStartPos == 0 || Text[nStartPos - 1] != '/')) + { //Line boundary break + lbr.breakIndex = nStartPos; + lbr.breakType = BreakType::WORDBOUNDARY; + } else if (hOptions.rHyphenator.is()) { //Hyphenation break + sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0; + pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word" + + sal_Int32 nStartPosWordEnd = nStartPos; + while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation + nStartPosWordEnd --; + + Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale, + WordType::DICTIONARY_WORD, false); + + nStartPosWordEnd = wBoundary.endPos; + while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation + nStartPosWordEnd ++; + nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos; + if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos; +#define SPACE 0x0020 + while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE); + uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos, + wBoundary.endPos - wBoundary.startPos), rLocale, + static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions); + if (aHyphenatedWord.is()) { + lbr.rHyphenatedWord = aHyphenatedWord; + if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos ) + lbr.breakIndex = -1; + else + lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos(); + lbr.breakType = BreakType::HYPHENATION; + + // check not optimal hyphenation of "word-word" (word with hyphens) + if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) { + lbr.breakIndex = pLineBI->current(); + lbr.breakType = BreakType::WORDBOUNDARY; + } + + } else { + lbr.breakIndex = pLineBI->preceding(nStartPos); + lbr.breakType = BreakType::WORDBOUNDARY; + } + } else { //word boundary break + lbr.breakIndex = pLineBI->preceding(nStartPos); + lbr.breakType = BreakType::WORDBOUNDARY; + + // Special case for Slash U+002F SOLIDUS in URI and path names. + // TR14 defines that as SY: Symbols Allowing Break After (A). + // This is unwanted in paths, see also i#17155 + if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/') + { + // Look backward and take any whitespace before as a break + // opportunity. This also glues something like "w/o". + // Avoid an overly long path and break it as was indicated. + // Overly long here is arbitrarily defined. + const sal_Int32 nOverlyLong = 66; + sal_Int32 nPos = lbr.breakIndex - 1; + while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong) + { + if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1))) + { + lbr.breakIndex = nPos + 1; + break; + } + } + } + } + +#define WJ 0x2060 // Word Joiner + GlueSpace=false; + if (lbr.breakType == BreakType::WORDBOUNDARY) { + nStartPos = lbr.breakIndex; + if (nStartPos >= 0 && Text[nStartPos--] == WJ) + GlueSpace=true; + while (nStartPos >= 0 && + (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) { + if (Text[nStartPos--] == WJ) + GlueSpace=true; + } + if (GlueSpace && nStartPos < 0) { + lbr.breakIndex = 0; + break; + } + } + } + + return lbr; +} + +OUString SAL_CALL +BreakIterator_Unicode::getImplementationName() +{ + return OUString::createFromAscii(cBreakIterator); +} + +sal_Bool SAL_CALL +BreakIterator_Unicode::supportsService(const OUString& rServiceName) +{ + return cppu::supportsService(this, rServiceName); +} + +uno::Sequence< OUString > SAL_CALL +BreakIterator_Unicode::getSupportedServiceNames() +{ + uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) }; + return aRet; +} + +} + +extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * +com_sun_star_i18n_BreakIterator_Unicode_get_implementation( + css::uno::XComponentContext *, + css::uno::Sequence<css::uno::Any> const &) +{ + return cppu::acquire(new i18npool::BreakIterator_Unicode()); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |