diff options
Diffstat (limited to 'i18npool/source/breakiterator/xdictionary.cxx')
-rw-r--r-- | i18npool/source/breakiterator/xdictionary.cxx | 483
1 files changed, 483 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/xdictionary.cxx b/i18npool/source/breakiterator/xdictionary.cxx
new file mode 100644
index 000000000..3e1398422
--- /dev/null
+++ b/i18npool/source/breakiterator/xdictionary.cxx
@@ -0,0 +1,483 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <config_folders.h>
+
+#include <osl/file.h>
+#include <osl/module.h>
+#include <osl/mutex.hxx>
+#include <rtl/ustrbuf.hxx>
+#include <rtl/bootstrap.hxx>
+#include <com/sun/star/i18n/ScriptType.hpp>
+#include <com/sun/star/i18n/WordType.hpp>
+#include <xdictionary.hxx>
+#include <unicode/uchar.h>
+#include <string.h>
+#include <breakiteratorImpl.hxx>
+
+using namespace com::sun::star::i18n;
+
+namespace i18npool {
+
+#ifdef DICT_JA_ZH_IN_DATAFILE
+
+#elif !defined DISABLE_DYNLOADING
+
+extern "C" { static void thisModule() {} }
+
+#else
+
+extern "C" {
+
+sal_uInt8* getExistMark_ja();
+sal_Int16* getIndex1_ja();
+sal_Int32* getIndex2_ja();
+sal_Int32* getLenArray_ja();
+sal_Unicode* getDataArea_ja();
+
+sal_uInt8* getExistMark_zh();
+sal_Int16* getIndex1_zh();
+sal_Int32* getIndex2_zh();
+sal_Int32* getLenArray_zh();
+sal_Unicode* getDataArea_zh();
+
+}
+
+#endif
+
+xdictionary::xdictionary(const char *lang) :
+    boundary(),
+    japaneseWordBreak( false )
+{
+
+#ifdef DICT_JA_ZH_IN_DATAFILE
+
+    if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
+    {
+        OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
+        rtl::Bootstrap::expandMacros(sUrl);
+
+        if( strcmp( lang, "ja" ) == 0 )
+            sUrl += "ja.data";
+        else if( strcmp( lang, "zh" ) == 0 )
+            sUrl += "zh.data";
+
+        oslFileHandle aFileHandle;
+        sal_uInt64 nFileSize;
+        char *pMapping;
+        if( osl_openFile( sUrl.pData, &aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
+            osl_getFileSize( aFileHandle, &nFileSize) == osl_File_E_None &&
+            osl_mapFile( aFileHandle, (void **) &pMapping, nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
+        {
+            // We have the offsets to the parts of the file at its end, see gendict.cxx
+            sal_Int64 *pEOF = (sal_Int64*)(pMapping + nFileSize);
+
+            data.existMark = (sal_uInt8*) (pMapping + pEOF[-1]);
+            data.index2 = (sal_Int32*) (pMapping + pEOF[-2]);
+            data.index1 = (sal_Int16*) (pMapping + pEOF[-3]);
+            data.lenArray = (sal_Int32*) (pMapping + pEOF[-4]);
+            data.dataArea = (sal_Unicode*) (pMapping + pEOF[-5]);
+        }
+    }
+
+#elif !defined DISABLE_DYNLOADING
+
+    initDictionaryData( lang );
+
+#else
+
+    if( strcmp( lang, "ja" ) == 0 ) {
+        data.existMark = getExistMark_ja();
+        data.index1 = getIndex1_ja();
+        data.index2 = getIndex2_ja();
+        data.lenArray = getLenArray_ja();
+        data.dataArea = getDataArea_ja();
+    }
+    else if( strcmp( lang, "zh" ) == 0 ) {
+        data.existMark = getExistMark_zh();
+        data.index1 = getIndex1_zh();
+        data.index2 = getIndex2_zh();
+        data.lenArray = getLenArray_zh();
+        data.dataArea = getDataArea_zh();
+    }
+
+#endif
+
+    for (WordBreakCache & i : cache)
+        i.size = 0;
+
+    japaneseWordBreak = false;
+}
+
+xdictionary::~xdictionary()
+{
+    for (const WordBreakCache & i : cache) {
+        if (i.size > 0) {
+            delete [] i.contents;
+            delete [] i.wordboundary;
+        }
+    }
+}
+
+namespace {
+    struct datacache {
+        oslModule mhModule;
+        OString maLang;
+        xdictionarydata maData;
+    };
+}
+
+#if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
+
+void xdictionary::initDictionaryData(const char *pLang)
+{
+    // Global cache, never released for performance
+    static std::vector< datacache > aLoadedCache;
+
+    osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
+    for(const datacache & i : aLoadedCache)
+    {
+        if( i.maLang == pLang )
+        {
+            data = i.maData;
+            return;
+        }
+    }
+
+    // otherwise add to the cache, positive or negative.
+    datacache aEntry;
+    aEntry.maLang = OString( pLang, strlen( pLang ) );
+
+#ifdef SAL_DLLPREFIX
+    OString sModuleName = // mostly "lib*.so" (with * == dict_zh)
+        SAL_DLLPREFIX
+#else
+    OString sModuleName = // mostly "*.dll" (with * == dict_zh)
+#endif
+        "dict_" + rtl::OStringView(pLang) + SAL_DLLEXTENSION;
+    aEntry.mhModule = osl_loadModuleRelativeAscii( &thisModule, sModuleName.getStr(), SAL_LOADMODULE_DEFAULT );
+    if( aEntry.mhModule ) {
+        oslGenericFunction func;
+        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
+        aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
+        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
+        aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
+        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
+        aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
+        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
+        aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
+        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
+        aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
+    }
+
+    data = aEntry.maData;
+    aLoadedCache.push_back( aEntry );
+}
+
+#endif
+
+void xdictionary::setJapaneseWordBreak()
+{
+    japaneseWordBreak = true;
+}
+
+bool xdictionary::exists(const sal_uInt32 c)
+{
+    // 0x1FFF is the hardcoded limit in gendict for data.existMarks
+    bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
+    if (!exist && japaneseWordBreak)
+        return BreakIteratorImpl::getScriptClass(c) == css::i18n::ScriptType::ASIAN;
+    else
+        return exist;
+}
+
+sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
+{
+    if ( !data.index1 ) return 0;
+
+    sal_Int16 idx = data.index1[str[0] >> 8];
+
+    if (idx == 0xFF) return 0;
+
+    idx = (idx<<8) | (str[0]&0xff);
+
+    sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
+
+    if (begin == 0) return 0;
+
+    str++; sLen--; // first character is not stored in the dictionary
+    for (sal_uInt32 i = end; i > begin; i--) {
+        sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
+        if (sLen >= len) {
+            const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
+            sal_Int32 pos = 0;
+
+            while (pos < len && dstr[pos] == str[pos]) { pos++; }
+
+            if (pos == len)
+                return len + 1;
+        }
+    }
+    return 0;
+}
+
+
+/*
+ * c-tor
+ */
+
+WordBreakCache::WordBreakCache() :
+    length( 0 ),
+    contents( nullptr ),
+    wordboundary( nullptr ),
+    size( 0 )
+{
+}
+
+/*
+ * Compare two unicode string,
+ */
+
+bool WordBreakCache::equals(const sal_Unicode* str, Boundary const & boundary)
+{
+    // Different length, different string.
+    if (length != boundary.endPos - boundary.startPos) return false;
+
+    for (sal_Int32 i = 0; i < length; i++)
+        if (contents[i] != str[i + boundary.startPos]) return false;
+
+    return true;
+}
+
+
+/*
+ * Retrieve the segment containing the character at pos.
+ * @param pos : Position of the given character.
+ * @return true if CJK.
+ */
+bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
+    Boundary& segBoundary)
+{
+    sal_Int32 indexUtf16;
+
+    if (segmentCachedString.pData != rText.pData) {
+        // Cache the passed text so we can avoid regenerating the segment if it's the same
+        // (pData is refcounted and assigning the OUString references it, which ensures that
+        // the object is the same if we get the same pointer back later)
+        segmentCachedString = rText;
+    } else {
+        // If pos is within the cached boundary, use that boundary
+        if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
+            segBoundary.startPos = segmentCachedBoundary.startPos;
+            segBoundary.endPos = segmentCachedBoundary.endPos;
+            indexUtf16 = segmentCachedBoundary.startPos;
+            rText.iterateCodePoints(&indexUtf16);
+            return segmentCachedBoundary.endPos > indexUtf16;
+        }
+    }
+
+    segBoundary.endPos = segBoundary.startPos = pos;
+
+    indexUtf16 = pos;
+    while (indexUtf16 > 0)
+    {
+        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
+        if (u_isWhitespace(ch) || exists(ch))
+            segBoundary.startPos = indexUtf16;
+        else
+            break;
+    }
+
+    indexUtf16 = pos;
+    while (indexUtf16 < rText.getLength())
+    {
+        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
+        if (u_isWhitespace(ch) || exists(ch))
+            segBoundary.endPos = indexUtf16;
+        else
+            break;
+    }
+
+    // Cache the calculated boundary
+    segmentCachedBoundary.startPos = segBoundary.startPos;
+    segmentCachedBoundary.endPos = segBoundary.endPos;
+
+    indexUtf16 = segBoundary.startPos;
+    rText.iterateCodePoints(&indexUtf16);
+    return segBoundary.endPos > indexUtf16;
+}
+
+#define KANJA 1
+#define KATAKANA 2
+#define HIRAKANA 3
+
+static sal_Int16 JapaneseCharType(sal_Unicode c)
+{
+    if (0x3041 <= c && c <= 0x309e)
+        return HIRAKANA;
+    if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
+        return KATAKANA;
+    return KANJA;
+}
+
+WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary const & wordBoundary)
+{
+    WordBreakCache& rCache = cache[text[0] & 0x1f];
+
+    if (rCache.size != 0 && rCache.equals(text, wordBoundary))
+        return rCache;
+
+    sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
+
+    if (rCache.size == 0 || len > rCache.size) {
+        if (rCache.size != 0) {
+            delete [] rCache.contents;
+            delete [] rCache.wordboundary;
+            rCache.size = len;
+        }
+        else
+            rCache.size = std::max<sal_Int32>(len, DEFAULT_SIZE);
+        rCache.contents = new sal_Unicode[rCache.size + 1];
+        rCache.wordboundary = new sal_Int32[rCache.size + 2];
+    }
+    rCache.length = len;
+    memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
+    *(rCache.contents + len) = 0x0000;
+    // reset the wordboundary in cache
+    memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
+
+    sal_Int32 i = 0; // loop variable
+    while (rCache.wordboundary[i] < rCache.length) {
+        len = 0;
+        // look the continuous white space as one word and cache it
+        while (u_isWhitespace(static_cast<sal_uInt32>(text[wordBoundary.startPos + rCache.wordboundary[i] + len])))
+            len ++;
+
+        if (len == 0) {
+            const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
+            sal_Int32 slen = rCache.length - rCache.wordboundary[i];
+            sal_Int16 type = 0, count = 0;
+            for (;len == 0 && slen > 0; str++, slen--) {
+                len = getLongestMatch(str, slen);
+                if (len == 0) {
+                    if (!japaneseWordBreak) {
+                        len = 1;
+                    } else {
+                        if (count == 0)
+                            type = JapaneseCharType(*str);
+                        else if (type != JapaneseCharType(*str))
+                            break;
+                        count++;
+                    }
+                }
+            }
+            if (count)
+            {
+                rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
+                i++;
+            }
+        }
+
+        if (len) {
+            rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
+            i++;
+        }
+    }
+    rCache.wordboundary[i + 1] = rCache.length + 1;
+
+    return rCache;
+}
+
+Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
+{
+    // looking for the first non-whitespace character from anyPos
+    sal_uInt32 ch = 0;
+    if (anyPos > 0)
+        ch = rText.iterateCodePoints(&anyPos, -1);
+
+    while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
+
+    return getWordBoundary(rText, anyPos, wordType, true);
+}
+
+Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
+{
+    boundary = getWordBoundary(rText, anyPos, wordType, true);
+    anyPos = boundary.endPos;
+    const sal_Int32 nLen = rText.getLength();
+    if (anyPos < nLen) {
+        // looking for the first non-whitespace character from anyPos
+        sal_uInt32 ch = rText.iterateCodePoints(&anyPos);
+        while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos);
+        if (anyPos > 0)
+            rText.iterateCodePoints(&anyPos, -1);
+    }
+
+    return getWordBoundary(rText, anyPos, wordType, true);
+}
+
+Boundary const & xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
+{
+    const sal_Unicode *text=rText.getStr();
+    sal_Int32 len=rText.getLength();
+    if (anyPos >= len || anyPos < 0) {
+        boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
+    } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
+        WordBreakCache& aCache = getCache(text, boundary);
+        sal_Int32 i = 0;
+
+        while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
+
+        sal_Int32 startPos = aCache.wordboundary[i - 1];
+        // if bDirection is false
+        if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
+        {
+            sal_Int32 indexUtf16 = anyPos-1;
+            sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
+            if (u_isWhitespace(ch))
+                i--;
+        }
+
+        boundary.endPos = boundary.startPos;
+        boundary.endPos += aCache.wordboundary[i];
+        boundary.startPos += aCache.wordboundary[i-1];
+
+    } else {
+        boundary.startPos = anyPos;
+        if (anyPos < len) rText.iterateCodePoints(&anyPos);
+        boundary.endPos = std::min(anyPos, len);
+    }
+    if (wordType == WordType::WORD_COUNT) {
+        // skip punctuation for word count.
+        while (boundary.endPos < len)
+        {
+            sal_Int32 indexUtf16 = boundary.endPos;
+            if (u_ispunct(rText.iterateCodePoints(&indexUtf16)))
+                boundary.endPos = indexUtf16;
+            else
+                break;
+        }
+    }
+
+    return boundary;
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */