Diffstat (limited to 'i18npool/source/breakiterator/xdictionary.cxx')
-rw-r--r--  i18npool/source/breakiterator/xdictionary.cxx  483
1 files changed, 483 insertions, 0 deletions
diff --git a/i18npool/source/breakiterator/xdictionary.cxx b/i18npool/source/breakiterator/xdictionary.cxx
new file mode 100644
index 000000000..3e1398422
--- /dev/null
+++ b/i18npool/source/breakiterator/xdictionary.cxx
@@ -0,0 +1,483 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <config_folders.h>
+
+#include <osl/file.h>
+#include <osl/module.h>
+#include <osl/mutex.hxx>
+#include <rtl/ustrbuf.hxx>
+#include <rtl/bootstrap.hxx>
+#include <com/sun/star/i18n/ScriptType.hpp>
+#include <com/sun/star/i18n/WordType.hpp>
+#include <xdictionary.hxx>
+#include <unicode/uchar.h>
+#include <string.h>
+#include <breakiteratorImpl.hxx>
+
+using namespace com::sun::star::i18n;
+
+namespace i18npool {
+
+#ifdef DICT_JA_ZH_IN_DATAFILE
+
+#elif !defined DISABLE_DYNLOADING
+
+extern "C" { static void thisModule() {} }
+
+#else
+
+extern "C" {
+
+sal_uInt8* getExistMark_ja();
+sal_Int16* getIndex1_ja();
+sal_Int32* getIndex2_ja();
+sal_Int32* getLenArray_ja();
+sal_Unicode* getDataArea_ja();
+
+sal_uInt8* getExistMark_zh();
+sal_Int16* getIndex1_zh();
+sal_Int32* getIndex2_zh();
+sal_Int32* getLenArray_zh();
+sal_Unicode* getDataArea_zh();
+
+}
+
+#endif
+
+xdictionary::xdictionary(const char *lang) :
+ boundary(),
+ japaneseWordBreak( false )
+{
+
+#ifdef DICT_JA_ZH_IN_DATAFILE
+
+ if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
+ {
+ OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
+ rtl::Bootstrap::expandMacros(sUrl);
+
+ if( strcmp( lang, "ja" ) == 0 )
+ sUrl += "ja.data";
+ else if( strcmp( lang, "zh" ) == 0 )
+ sUrl += "zh.data";
+
+ oslFileHandle aFileHandle;
+ sal_uInt64 nFileSize;
+ char *pMapping;
+ if( osl_openFile( sUrl.pData, &aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
+ osl_getFileSize( aFileHandle, &nFileSize) == osl_File_E_None &&
+ osl_mapFile( aFileHandle, (void **) &pMapping, nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
+ {
+ // We have the offsets to the parts of the file at its end, see gendict.cxx
+ sal_Int64 *pEOF = (sal_Int64*)(pMapping + nFileSize);
+
+ data.existMark = (sal_uInt8*) (pMapping + pEOF[-1]);
+ data.index2 = (sal_Int32*) (pMapping + pEOF[-2]);
+ data.index1 = (sal_Int16*) (pMapping + pEOF[-3]);
+ data.lenArray = (sal_Int32*) (pMapping + pEOF[-4]);
+ data.dataArea = (sal_Unicode*) (pMapping + pEOF[-5]);
+ }
+ }
+
+#elif !defined DISABLE_DYNLOADING
+
+ initDictionaryData( lang );
+
+#else
+
+ if( strcmp( lang, "ja" ) == 0 ) {
+ data.existMark = getExistMark_ja();
+ data.index1 = getIndex1_ja();
+ data.index2 = getIndex2_ja();
+ data.lenArray = getLenArray_ja();
+ data.dataArea = getDataArea_ja();
+ }
+ else if( strcmp( lang, "zh" ) == 0 ) {
+ data.existMark = getExistMark_zh();
+ data.index1 = getIndex1_zh();
+ data.index2 = getIndex2_zh();
+ data.lenArray = getLenArray_zh();
+ data.dataArea = getDataArea_zh();
+ }
+
+#endif
+
+ for (WordBreakCache & i : cache)
+ i.size = 0;
+
+ japaneseWordBreak = false;
+}
+
+xdictionary::~xdictionary()
+{
+ for (const WordBreakCache & i : cache) {
+ if (i.size > 0) {
+ delete [] i.contents;
+ delete [] i.wordboundary;
+ }
+ }
+}
+
+namespace {
+ struct datacache {
+ oslModule mhModule;
+ OString maLang;
+ xdictionarydata maData;
+ };
+}
+
+#if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
+
+void xdictionary::initDictionaryData(const char *pLang)
+{
+ // Global cache, never released for performance
+ static std::vector< datacache > aLoadedCache;
+
+ osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
+ for(const datacache & i : aLoadedCache)
+ {
+        if( i.maLang == pLang )
+ {
+ data = i.maData;
+ return;
+ }
+ }
+
+    // Not cached yet: load it and record the result, successful or not, so that
+    // a missing dictionary is not probed again.
+ datacache aEntry;
+ aEntry.maLang = OString( pLang, strlen( pLang ) );
+
+#ifdef SAL_DLLPREFIX
+ OString sModuleName = // mostly "lib*.so" (with * == dict_zh)
+ SAL_DLLPREFIX
+#else
+ OString sModuleName = // mostly "*.dll" (with * == dict_zh)
+#endif
+ "dict_" + rtl::OStringView(pLang) + SAL_DLLEXTENSION;
+ aEntry.mhModule = osl_loadModuleRelativeAscii( &thisModule, sModuleName.getStr(), SAL_LOADMODULE_DEFAULT );
+ if( aEntry.mhModule ) {
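+        // Resolve the table accessor functions exported by the dictionary
+        // library; the returned pointers stay valid because the module is
+        // kept loaded in the global cache.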
+ oslGenericFunction func;
+ func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
+ aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
+ func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
+ aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
+ func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
+ aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
+ func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
+ aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
+ func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
+ aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
+ }
+
+ data = aEntry.maData;
+ aLoadedCache.push_back( aEntry );
+}
+
+#endif
+
+void xdictionary::setJapaneseWordBreak()
+{
+ japaneseWordBreak = true;
+}
+
+bool xdictionary::exists(const sal_uInt32 c)
+{
+ // 0x1FFF is the hardcoded limit in gendict for data.existMarks
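+    // existMark is a bitmap with one bit per code point: byte index c>>3, bit c&7.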
+ bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
+ if (!exist && japaneseWordBreak)
+ return BreakIteratorImpl::getScriptClass(c) == css::i18n::ScriptType::ASIAN;
+ else
+ return exist;
+}
+
+sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
+{
+ if ( !data.index1 ) return 0;
+
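+    // Two-level index: index1 is keyed by the high byte of the first character
+    // (0xFF meaning no entries); combined with the low byte it selects a slot in
+    // index2, and index2[idx]..index2[idx+1] delimits the candidate entries in
+    // lenArray (cumulative offsets) and dataArea (the stored characters).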
+ sal_Int16 idx = data.index1[str[0] >> 8];
+
+ if (idx == 0xFF) return 0;
+
+ idx = (idx<<8) | (str[0]&0xff);
+
+ sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
+
+ if (begin == 0) return 0;
+
+ str++; sLen--; // first character is not stored in the dictionary
+ for (sal_uInt32 i = end; i > begin; i--) {
+ sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
+ if (sLen >= len) {
+ const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
+ sal_Int32 pos = 0;
+
+ while (pos < len && dstr[pos] == str[pos]) { pos++; }
+
+ if (pos == len)
+ return len + 1;
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * c-tor
+ */
+
+WordBreakCache::WordBreakCache() :
+ length( 0 ),
+ contents( nullptr ),
+ wordboundary( nullptr ),
+ size( 0 )
+{
+}
+
+/*
+ * Compare the cached string with the given text between boundary.startPos and boundary.endPos.
+ */
+
+bool WordBreakCache::equals(const sal_Unicode* str, Boundary const & boundary)
+{
+ // Different length, different string.
+ if (length != boundary.endPos - boundary.startPos) return false;
+
+ for (sal_Int32 i = 0; i < length; i++)
+ if (contents[i] != str[i + boundary.startPos]) return false;
+
+ return true;
+}
+
+
+/*
+ * Retrieve the segment containing the character at pos.
+ * @param pos : Position of the given character.
+ * @return true if the segment spans more than one code point, i.e. dictionary breaking applies.
+ */
+bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
+ Boundary& segBoundary)
+{
+ sal_Int32 indexUtf16;
+
+ if (segmentCachedString.pData != rText.pData) {
+ // Cache the passed text so we can avoid regenerating the segment if it's the same
+ // (pData is refcounted and assigning the OUString references it, which ensures that
+ // the object is the same if we get the same pointer back later)
+ segmentCachedString = rText;
+ } else {
+ // If pos is within the cached boundary, use that boundary
+ if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
+ segBoundary.startPos = segmentCachedBoundary.startPos;
+ segBoundary.endPos = segmentCachedBoundary.endPos;
+ indexUtf16 = segmentCachedBoundary.startPos;
+ rText.iterateCodePoints(&indexUtf16);
+ return segmentCachedBoundary.endPos > indexUtf16;
+ }
+ }
+
+ segBoundary.endPos = segBoundary.startPos = pos;
+
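+    // Grow the segment backwards from pos while the preceding code points are
+    // white space or present in the dictionary.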
+ indexUtf16 = pos;
+ while (indexUtf16 > 0)
+ {
+ sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
+ if (u_isWhitespace(ch) || exists(ch))
+ segBoundary.startPos = indexUtf16;
+ else
+ break;
+ }
+
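+    // Grow the segment forwards from pos in the same way.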
+ indexUtf16 = pos;
+ while (indexUtf16 < rText.getLength())
+ {
+ sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
+ if (u_isWhitespace(ch) || exists(ch))
+ segBoundary.endPos = indexUtf16;
+ else
+ break;
+ }
+
+ // Cache the calculated boundary
+ segmentCachedBoundary.startPos = segBoundary.startPos;
+ segmentCachedBoundary.endPos = segBoundary.endPos;
+
+ indexUtf16 = segBoundary.startPos;
+ rText.iterateCodePoints(&indexUtf16);
+ return segBoundary.endPos > indexUtf16;
+}
+
+#define KANJA 1
+#define KATAKANA 2
+#define HIRAKANA 3
+
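+// Rough classification by Unicode range: hiragana (U+3041..U+309E), katakana
+// (U+30A1..U+30FE, plus half-width U+FF65..U+FF9F); everything else is KANJA.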
+static sal_Int16 JapaneseCharType(sal_Unicode c)
+{
+ if (0x3041 <= c && c <= 0x309e)
+ return HIRAKANA;
+ if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
+ return KATAKANA;
+ return KANJA;
+}
+
+WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary const & wordBoundary)
+{
+ WordBreakCache& rCache = cache[text[0] & 0x1f];
+
+ if (rCache.size != 0 && rCache.equals(text, wordBoundary))
+ return rCache;
+
+ sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
+
+ if (rCache.size == 0 || len > rCache.size) {
+ if (rCache.size != 0) {
+ delete [] rCache.contents;
+ delete [] rCache.wordboundary;
+ rCache.size = len;
+ }
+ else
+ rCache.size = std::max<sal_Int32>(len, DEFAULT_SIZE);
+ rCache.contents = new sal_Unicode[rCache.size + 1];
+ rCache.wordboundary = new sal_Int32[rCache.size + 2];
+ }
+ rCache.length = len;
+ memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
+ *(rCache.contents + len) = 0x0000;
+ // reset the wordboundary in cache
+ memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
+
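+    // Fill wordboundary[] with the offsets of successive word starts within the
+    // cached segment: a run of white space counts as one word; otherwise the
+    // longest dictionary match is taken, falling back to a single character or,
+    // in Japanese mode, to a run of characters of the same script type.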
+ sal_Int32 i = 0; // loop variable
+ while (rCache.wordboundary[i] < rCache.length) {
+ len = 0;
+        // treat a run of consecutive white space as one word and cache it
+ while (u_isWhitespace(static_cast<sal_uInt32>(text[wordBoundary.startPos + rCache.wordboundary[i] + len])))
+ len ++;
+
+ if (len == 0) {
+ const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
+ sal_Int32 slen = rCache.length - rCache.wordboundary[i];
+ sal_Int16 type = 0, count = 0;
+ for (;len == 0 && slen > 0; str++, slen--) {
+ len = getLongestMatch(str, slen);
+ if (len == 0) {
+ if (!japaneseWordBreak) {
+ len = 1;
+ } else {
+ if (count == 0)
+ type = JapaneseCharType(*str);
+ else if (type != JapaneseCharType(*str))
+ break;
+ count++;
+ }
+ }
+ }
+ if (count)
+ {
+ rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
+ i++;
+ }
+ }
+
+ if (len) {
+ rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
+ i++;
+ }
+ }
+ rCache.wordboundary[i + 1] = rCache.length + 1;
+
+ return rCache;
+}
+
+Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
+{
+ // looking for the first non-whitespace character from anyPos
+ sal_uInt32 ch = 0;
+ if (anyPos > 0)
+        ch = rText.iterateCodePoints(&anyPos, -1);
+
+ while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
+
+ return getWordBoundary(rText, anyPos, wordType, true);
+}
+
+Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
+{
+ boundary = getWordBoundary(rText, anyPos, wordType, true);
+ anyPos = boundary.endPos;
+ const sal_Int32 nLen = rText.getLength();
+ if (anyPos < nLen) {
+ // looking for the first non-whitespace character from anyPos
+ sal_uInt32 ch = rText.iterateCodePoints(&anyPos);
+ while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos);
+ if (anyPos > 0)
+ rText.iterateCodePoints(&anyPos, -1);
+ }
+
+ return getWordBoundary(rText, anyPos, wordType, true);
+}
+
+Boundary const & xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
+{
+ const sal_Unicode *text=rText.getStr();
+ sal_Int32 len=rText.getLength();
+ if (anyPos >= len || anyPos < 0) {
+ boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
+ } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
+ WordBreakCache& aCache = getCache(text, boundary);
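+        // Find the cached boundary pair enclosing anyPos; the word runs from
+        // wordboundary[i-1] to wordboundary[i], offsets relative to the segment start.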
+ sal_Int32 i = 0;
+
+ while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
+
+ sal_Int32 startPos = aCache.wordboundary[i - 1];
+        // when going backwards and anyPos falls exactly on a word start that is
+        // preceded by white space, step back to the previous word instead
+ if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
+ {
+ sal_Int32 indexUtf16 = anyPos-1;
+ sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
+ if (u_isWhitespace(ch))
+ i--;
+ }
+
+ boundary.endPos = boundary.startPos;
+ boundary.endPos += aCache.wordboundary[i];
+ boundary.startPos += aCache.wordboundary[i-1];
+
+ } else {
+ boundary.startPos = anyPos;
+ if (anyPos < len) rText.iterateCodePoints(&anyPos);
+ boundary.endPos = std::min(anyPos, len);
+ }
+ if (wordType == WordType::WORD_COUNT) {
+ // skip punctuation for word count.
+ while (boundary.endPos < len)
+ {
+ sal_Int32 indexUtf16 = boundary.endPos;
+ if (u_ispunct(rText.iterateCodePoints(&indexUtf16)))
+ boundary.endPos = indexUtf16;
+ else
+ break;
+ }
+ }
+
+ return boundary;
+}
+
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */