diff options
Diffstat (limited to 'i18npool/source/breakiterator/breakiterator_cjk.cxx')
-rw-r--r-- | i18npool/source/breakiterator/breakiterator_cjk.cxx | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx b/i18npool/source/breakiterator/breakiterator_cjk.cxx new file mode 100644 index 000000000..baf302e6e --- /dev/null +++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx @@ -0,0 +1,188 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <breakiterator_cjk.hxx> +#include <localedata.hxx> + +#include <com/sun/star/i18n/BreakType.hpp> +#include <com/sun/star/i18n/ScriptType.hpp> + +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::i18n; +using namespace ::com::sun::star::lang; + +namespace i18npool { + +// ---------------------------------------------------- +// class BreakIterator_CJK +// ----------------------------------------------------; + +BreakIterator_CJK::BreakIterator_CJK() +{ + cBreakIterator = "com.sun.star.i18n.BreakIterator_CJK"; +} + +Boundary SAL_CALL +BreakIterator_CJK::previousWord(const OUString& text, sal_Int32 anyPos, + const css::lang::Locale& nLocale, sal_Int16 wordType) +{ + if (m_xDict) { + result = m_xDict->previousWord(text, anyPos, wordType); + // #109813# for non-CJK, single character word, fallback to ICU breakiterator. + if (result.endPos - result.startPos != 1 || + getScriptType(text, result.startPos) == ScriptType::ASIAN) + return result; + result = BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true); + if (result.endPos < anyPos) + return result; + } + return BreakIterator_Unicode::previousWord(text, anyPos, nLocale, wordType); +} + +Boundary SAL_CALL +BreakIterator_CJK::nextWord(const OUString& text, sal_Int32 anyPos, + const css::lang::Locale& nLocale, sal_Int16 wordType) +{ + if (m_xDict) { + result = m_xDict->nextWord(text, anyPos, wordType); + // #109813# for non-CJK, single character word, fallback to ICU breakiterator. + if (result.endPos - result.startPos != 1 || + getScriptType(text, result.startPos) == ScriptType::ASIAN) + return result; + result = BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true); + if (result.startPos > anyPos) + return result; + } + return BreakIterator_Unicode::nextWord(text, anyPos, nLocale, wordType); +} + +Boundary SAL_CALL +BreakIterator_CJK::getWordBoundary( const OUString& text, sal_Int32 anyPos, + const css::lang::Locale& nLocale, sal_Int16 wordType, sal_Bool bDirection ) +{ + if (m_xDict) { + result = m_xDict->getWordBoundary(text, anyPos, wordType, bDirection); + // #109813# for non-CJK, single character word, fallback to ICU breakiterator. + if (result.endPos - result.startPos != 1 || + getScriptType(text, result.startPos) == ScriptType::ASIAN) + return result; + } + return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection); +} + +namespace { +bool isHangul( sal_Unicode cCh ) +{ + return (cCh >= 0xAC00 && cCh <= 0xD7AF) || (cCh >= 0x1100 && cCh <= 0x11FF) || + (cCh >= 0xA960 && cCh <= 0xA97F) || (cCh >= 0xD7B0 && cCh <= 0xD7FF) || + (cCh >= 0x3130 && cCh <= 0x318F); +} +} + +LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak( + const OUString& Text, sal_Int32 nStartPos, + const css::lang::Locale& /*rLocale*/, sal_Int32 /*nMinBreakPos*/, + const LineBreakHyphenationOptions& /*hOptions*/, + const LineBreakUserOptions& bOptions ) +{ + LineBreakResults lbr; + + const sal_Int32 nOldStartPos = nStartPos; + + if (bOptions.allowPunctuationOutsideMargin && + nStartPos != Text.getLength() && + hangingCharacters.indexOf(Text[nStartPos]) != -1 && + (Text.iterateCodePoints( &nStartPos ), nStartPos == Text.getLength())) { + ; // do nothing + } else if (bOptions.applyForbiddenRules && 0 < nStartPos && nStartPos < Text.getLength()) { + + while (nStartPos > 0 && + (bOptions.forbiddenBeginCharacters.indexOf(Text[nStartPos]) != -1 || + bOptions.forbiddenEndCharacters.indexOf(Text[nStartPos-1]) != -1)) + Text.iterateCodePoints( &nStartPos, -1); + } + + // Prevent cutting Korean words in the middle. + if (nOldStartPos == nStartPos && nStartPos < Text.getLength() + && isHangul(Text[nStartPos])) + { + while ( nStartPos >= 0 && isHangul( Text[nStartPos] ) ) + --nStartPos; + + // beginning of the last Korean word. + if ( nStartPos < nOldStartPos ) + ++nStartPos; + + if ( nStartPos == 0 ) + nStartPos = nOldStartPos; + } + + lbr.breakIndex = nStartPos; + lbr.breakType = BreakType::WORDBOUNDARY; + return lbr; +} + +#define LOCALE(language, country) css::lang::Locale(language, country, OUString()) +// ---------------------------------------------------- +// class BreakIterator_zh +// ----------------------------------------------------; +BreakIterator_zh::BreakIterator_zh() +{ + m_xDict = std::make_unique<xdictionary>("zh"); + assert(hangingCharacters.pData); + hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("zh", "CN")); + cBreakIterator = "com.sun.star.i18n.BreakIterator_zh"; +} + +// ---------------------------------------------------- +// class BreakIterator_zh_TW +// ----------------------------------------------------; +BreakIterator_zh_TW::BreakIterator_zh_TW() +{ + m_xDict = std::make_unique<xdictionary>("zh"); + assert(hangingCharacters.pData); + hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("zh", "TW")); + cBreakIterator = "com.sun.star.i18n.BreakIterator_zh_TW"; +} + +// ---------------------------------------------------- +// class BreakIterator_ja +// ----------------------------------------------------; +BreakIterator_ja::BreakIterator_ja() +{ + m_xDict = std::make_unique<xdictionary>("ja"); + m_xDict->setJapaneseWordBreak(); + assert(hangingCharacters.pData); + hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("ja", "JP")); + cBreakIterator = "com.sun.star.i18n.BreakIterator_ja"; +} + +// ---------------------------------------------------- +// class BreakIterator_ko +// ----------------------------------------------------; +BreakIterator_ko::BreakIterator_ko() +{ + assert(hangingCharacters.pData); + hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("ko", "KR")); + cBreakIterator = "com.sun.star.i18n.BreakIterator_ko"; +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |