summaryrefslogtreecommitdiffstats
path: root/lingucomponent/source/thesaurus/libnth/nthesimp.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'lingucomponent/source/thesaurus/libnth/nthesimp.cxx')
-rw-r--r--lingucomponent/source/thesaurus/libnth/nthesimp.cxx571
1 files changed, 571 insertions, 0 deletions
diff --git a/lingucomponent/source/thesaurus/libnth/nthesimp.cxx b/lingucomponent/source/thesaurus/libnth/nthesimp.cxx
new file mode 100644
index 000000000..ea3e3af8d
--- /dev/null
+++ b/lingucomponent/source/thesaurus/libnth/nthesimp.cxx
@@ -0,0 +1,571 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <com/sun/star/uno/Reference.h>
+#include <cppuhelper/factory.hxx>
+#include <cppuhelper/supportsservice.hxx>
+#include <cppuhelper/weak.hxx>
+#include <com/sun/star/linguistic2/LinguServiceManager.hpp>
+#include <com/sun/star/linguistic2/XLinguProperties.hpp>
+#include <com/sun/star/linguistic2/XSpellChecker1.hpp>
+#include <i18nlangtag/languagetag.hxx>
+#include <tools/debug.hxx>
+#include <comphelper/lok.hxx>
+#include <comphelper/processfactory.hxx>
+#include <comphelper/sequence.hxx>
+#include <osl/mutex.hxx>
+#include <osl/thread.h>
+#include <unotools/lingucfg.hxx>
+#include <unotools/resmgr.hxx>
+
+#include <rtl/string.hxx>
+#include <rtl/textenc.h>
+
+#include <svtools/strings.hrc>
+
+#include "nthesimp.hxx"
+#include <linguistic/misc.hxx>
+#include "nthesdta.hxx"
+
+#include <vector>
+#include <numeric>
+#include <set>
+#include <string.h>
+
+// XML-header to query SPELLML support
+constexpr OUStringLiteral SPELLML_SUPPORT = u"<?xml?>";
+
+using namespace osl;
+using namespace com::sun::star;
+using namespace com::sun::star::beans;
+using namespace com::sun::star::lang;
+using namespace com::sun::star::uno;
+using namespace com::sun::star::linguistic2;
+using namespace linguistic;
+
+static uno::Reference< XLinguServiceManager2 > GetLngSvcMgr_Impl()
+{
+ uno::Reference< XComponentContext > xContext( comphelper::getProcessComponentContext() );
+ uno::Reference< XLinguServiceManager2 > xRes = LinguServiceManager::create( xContext ) ;
+ return xRes;
+}
+
+Thesaurus::Thesaurus() :
+ aEvtListeners ( GetLinguMutex() ), pPropHelper(nullptr), bDisposing(false),
+ prevLocale(LANGUAGE_DONTKNOW)
+{
+}
+
+Thesaurus::~Thesaurus()
+{
+ mvThesInfo.clear();
+ if (pPropHelper)
+ {
+ pPropHelper->RemoveAsPropListener();
+ }
+}
+
+PropertyHelper_Thesaurus& Thesaurus::GetPropHelper_Impl()
+{
+ if (!pPropHelper)
+ {
+ Reference< XLinguProperties > xPropSet = GetLinguProperties();
+
+ pPropHelper = new PropertyHelper_Thesaurus( static_cast<XThesaurus *>(this), xPropSet );
+ pPropHelper->AddAsPropListener(); //! after a reference is established
+ }
+ return *pPropHelper;
+}
+
+Sequence< Locale > SAL_CALL Thesaurus::getLocales()
+{
+ MutexGuard aGuard( GetLinguMutex() );
+
+ // this routine should return the locales supported by the installed
+ // dictionaries.
+ if (mvThesInfo.empty())
+ {
+ SvtLinguConfig aLinguCfg;
+
+ // get list of dictionaries-to-use
+ std::vector< SvtLinguConfigDictionaryEntry > aDics;
+ uno::Sequence< OUString > aFormatList;
+ aLinguCfg.GetSupportedDictionaryFormatsFor( "Thesauri",
+ "org.openoffice.lingu.new.Thesaurus", aFormatList );
+ for (const auto& rFormat : std::as_const(aFormatList))
+ {
+ std::vector< SvtLinguConfigDictionaryEntry > aTmpDic(
+ aLinguCfg.GetActiveDictionariesByFormat( rFormat ) );
+ aDics.insert( aDics.end(), aTmpDic.begin(), aTmpDic.end() );
+ }
+
+ //!! for compatibility with old dictionaries (the ones not using extensions
+ //!! or new configuration entries, but still using the dictionary.lst file)
+ //!! Get the list of old style spell checking dictionaries to use...
+ std::vector< SvtLinguConfigDictionaryEntry > aOldStyleDics(
+ GetOldStyleDics( "THES" ) );
+
+ // to prefer dictionaries with configuration entries we will only
+ // use those old style dictionaries that add a language that
+ // is not yet supported by the list of new style dictionaries
+ MergeNewStyleDicsAndOldStyleDics( aDics, aOldStyleDics );
+
+ if (!aDics.empty())
+ {
+ // get supported locales from the dictionaries-to-use...
+ std::set<OUString> aLocaleNamesSet;
+ for (auto const& dict : aDics)
+ {
+ for (const auto& rLocaleName : dict.aLocaleNames)
+ {
+ if (!comphelper::LibreOfficeKit::isAllowlistedLanguage(rLocaleName))
+ continue;
+
+ aLocaleNamesSet.insert( rLocaleName );
+ }
+ }
+ // ... and add them to the resulting sequence
+ std::vector<Locale> aLocalesVec;
+ aLocalesVec.reserve(aLocaleNamesSet.size());
+
+ std::transform(aLocaleNamesSet.begin(), aLocaleNamesSet.end(), std::back_inserter(aLocalesVec),
+ [](const OUString& localeName) -> Locale { return LanguageTag::convertToLocale(localeName); });
+
+ aSuppLocales = comphelper::containerToSequence(aLocalesVec);
+
+ //! For each dictionary and each locale we need a separate entry.
+ //! If this results in more than one dictionary per locale than (for now)
+ //! it is undefined which dictionary gets used.
+ //! In the future the implementation should support using several dictionaries
+ //! for one locale.
+ sal_Int32 numthes = std::accumulate(aDics.begin(), aDics.end(), 0,
+ [](const sal_Int32 nSum, const SvtLinguConfigDictionaryEntry& dict) {
+ return nSum + dict.aLocaleNames.getLength(); });
+
+ // add dictionary information
+ mvThesInfo.resize(numthes);
+
+ sal_Int32 k = 0;
+ for (auto const& dict : aDics)
+ {
+ if (dict.aLocaleNames.hasElements() &&
+ dict.aLocations.hasElements())
+ {
+ // currently only one language per dictionary is supported in the actual implementation...
+ // Thus here we work-around this by adding the same dictionary several times.
+ // Once for each of its supported locales.
+ for (const auto& rLocaleName : dict.aLocaleNames)
+ {
+ LanguageTag aLanguageTag(rLocaleName);
+ mvThesInfo[k].aEncoding = RTL_TEXTENCODING_DONTKNOW;
+ mvThesInfo[k].aLocale = aLanguageTag.getLocale();
+ mvThesInfo[k].aCharSetInfo.reset( new CharClass( std::move(aLanguageTag) ) );
+ // also both files have to be in the same directory and the
+ // file names must only differ in the extension (.aff/.dic).
+ // Thus we use the first location only and strip the extension part.
+ OUString aLocation = dict.aLocations[0];
+ sal_Int32 nPos = aLocation.lastIndexOf( '.' );
+ aLocation = aLocation.copy( 0, nPos );
+ mvThesInfo[k].aName = aLocation;
+
+ ++k;
+ }
+ }
+ }
+ DBG_ASSERT( k == numthes, "index mismatch?" );
+ }
+ else
+ {
+ /* no dictionary found so register no dictionaries */
+ mvThesInfo.clear();
+ aSuppLocales.realloc(0);
+ }
+ }
+
+ return aSuppLocales;
+}
+
+sal_Bool SAL_CALL Thesaurus::hasLocale(const Locale& rLocale)
+{
+ MutexGuard aGuard( GetLinguMutex() );
+
+ if (!aSuppLocales.hasElements())
+ getLocales();
+
+ return comphelper::findValue(aSuppLocales, rLocale) != -1;
+}
+
+Sequence < Reference < css::linguistic2::XMeaning > > SAL_CALL Thesaurus::queryMeanings(
+ const OUString& qTerm, const Locale& rLocale,
+ const css::uno::Sequence< css::beans::PropertyValue >& rProperties)
+{
+ MutexGuard aGuard( GetLinguMutex() );
+
+ uno::Sequence< Reference< XMeaning > > aMeanings( 1 );
+ uno::Sequence< Reference< XMeaning > > noMeanings( 0 );
+ uno::Reference< XLinguServiceManager2 > xLngSvcMgr( GetLngSvcMgr_Impl() );
+ uno::Reference< XSpellChecker1 > xSpell;
+
+ OUString aRTerm(qTerm);
+ OUString aPTerm(qTerm);
+ CapType ct = CapType::UNKNOWN;
+ sal_Int32 stem = 0;
+ sal_Int32 stem2 = 0;
+
+ LanguageType nLanguage = LinguLocaleToLanguage( rLocale );
+
+ if (LinguIsUnspecified( nLanguage) || aRTerm.isEmpty())
+ return noMeanings;
+
+ if (!hasLocale( rLocale ))
+#ifdef LINGU_EXCEPTIONS
+ throw( IllegalArgumentException() );
+#else
+ return noMeanings;
+#endif
+
+ if (prevTerm == qTerm && prevLocale == nLanguage)
+ return prevMeanings;
+
+ mentry * pmean = nullptr;
+ sal_Int32 nmean = 0;
+
+ PropertyHelper_Thesaurus &rHelper = GetPropHelper();
+ rHelper.SetTmpPropVals( rProperties );
+
+ MyThes * pTH = nullptr;
+ rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
+ CharClass * pCC = nullptr;
+
+ // find the first thesaurus that matches the locale
+ for (size_t i =0; i < mvThesInfo.size(); i++)
+ {
+ if (rLocale == mvThesInfo[i].aLocale)
+ {
+ // open up and initialize this thesaurus if need be
+ if (!mvThesInfo[i].aThes)
+ {
+ OUString datpath = mvThesInfo[i].aName + ".dat";
+ OUString idxpath = mvThesInfo[i].aName + ".idx";
+ OUString ndat;
+ OUString nidx;
+ osl::FileBase::getSystemPathFromFileURL(datpath,ndat);
+ osl::FileBase::getSystemPathFromFileURL(idxpath,nidx);
+
+#if defined(_WIN32)
+ // MyThes waits UTF-8 encoded paths with \\?\ long path prefix.
+ OString aTmpidx = Win_AddLongPathPrefix(OUStringToOString(nidx, RTL_TEXTENCODING_UTF8));
+ OString aTmpdat = Win_AddLongPathPrefix(OUStringToOString(ndat, RTL_TEXTENCODING_UTF8));
+#else
+ OString aTmpidx(OU2ENC(nidx,osl_getThreadTextEncoding()));
+ OString aTmpdat(OU2ENC(ndat,osl_getThreadTextEncoding()));
+#endif
+
+ mvThesInfo[i].aThes.reset( new MyThes(aTmpidx.getStr(),aTmpdat.getStr()) );
+ mvThesInfo[i].aEncoding = getTextEncodingFromCharset(mvThesInfo[i].aThes->get_th_encoding());
+ }
+ pTH = mvThesInfo[i].aThes.get();
+ eEnc = mvThesInfo[i].aEncoding;
+ pCC = mvThesInfo[i].aCharSetInfo.get();
+
+ if (pTH)
+ break;
+ }
+ }
+
+ // we don't want to work with a default text encoding since following incorrect
+ // results may occur only for specific text and thus may be hard to notice.
+ // Thus better always make a clean exit here if the text encoding is in question.
+ // Hopefully something not working at all will raise proper attention quickly. ;-)
+ DBG_ASSERT( eEnc != RTL_TEXTENCODING_DONTKNOW, "failed to get text encoding! (maybe incorrect encoding string in file)" );
+ if (eEnc == RTL_TEXTENCODING_DONTKNOW)
+ return noMeanings;
+
+ while (pTH)
+ {
+ // convert word to all lower case for searching
+ if (!stem)
+ ct = capitalType(aRTerm, pCC);
+ OUString nTerm(makeLowerCase(aRTerm, pCC));
+ OString aTmp( OU2ENC(nTerm, eEnc) );
+ nmean = pTH->Lookup(aTmp.getStr(),aTmp.getLength(),&pmean);
+
+ if (nmean)
+ aMeanings.realloc( nmean );
+
+ mentry * pe = pmean;
+ OUString codeTerm = qTerm;
+ Reference< XSpellAlternatives > xTmpRes2;
+
+ if (stem)
+ {
+ xTmpRes2 = xSpell->spell( "<?xml?><query type='analyze'><word>" +
+ aPTerm + "</word></query>", static_cast<sal_uInt16>(nLanguage), rProperties );
+ if (xTmpRes2.is())
+ {
+ Sequence<OUString>seq = xTmpRes2->getAlternatives();
+ if (seq.hasElements())
+ {
+ codeTerm = seq[0];
+ stem2 = 1;
+ }
+ }
+ }
+
+ for (int j = 0; j < nmean; j++)
+ {
+ int count = pe->count;
+ if (count)
+ {
+ Sequence< OUString > aStr( count );
+ OUString *pStr = aStr.getArray();
+
+ for (int i=0; i < count; i++)
+ {
+ OUString sTerm(pe->psyns[i],strlen(pe->psyns[i]),eEnc );
+ sal_Int32 catpos = sTerm.indexOf('(');
+ OUString catst;
+ if (catpos > 2)
+ {
+ // remove category name for affixation and casing
+ catst = OUString::Concat(" ") + sTerm.subView(catpos);
+ sTerm = sTerm.copy(0, catpos);
+ sTerm = sTerm.trim();
+ }
+ // generate synonyms with affixes
+ if (stem && stem2)
+ {
+ Reference< XSpellAlternatives > xTmpRes = xSpell->spell( "<?xml?><query type='generate'><word>" +
+ sTerm + "</word>" + codeTerm + "</query>", static_cast<sal_uInt16>(nLanguage), rProperties );
+ if (xTmpRes.is())
+ {
+ Sequence<OUString>seq = xTmpRes->getAlternatives();
+ if (seq.hasElements())
+ sTerm = seq[0];
+ }
+ }
+
+ CapType ct1 = capitalType(sTerm, pCC);
+ if (CapType::MIXED == ct1)
+ ct = ct1;
+ OUString cTerm;
+ switch (ct)
+ {
+ case CapType::ALLCAP:
+ cTerm = makeUpperCase(sTerm, pCC);
+ break;
+ case CapType::INITCAP:
+ cTerm = makeInitCap(sTerm, pCC);
+ break;
+ default:
+ cTerm = sTerm;
+ break;
+ }
+ OUString aAlt( cTerm + catst);
+ pStr[i] = aAlt;
+ }
+ rtl::Reference<Meaning> pMn = new Meaning(aRTerm);
+ OUString dTerm(pe->defn,strlen(pe->defn),eEnc );
+ pMn->SetMeaning(dTerm);
+ pMn->SetSynonyms(aStr);
+ Reference<XMeaning>* pMeaning = aMeanings.getArray();
+ pMeaning[j] = pMn;
+ }
+ pe++;
+ }
+ pTH->CleanUpAfterLookup(&pmean,nmean);
+
+ if (nmean)
+ {
+ prevTerm = qTerm;
+ prevMeanings = aMeanings;
+ prevLocale = nLanguage;
+ return aMeanings;
+ }
+
+ if (stem || !xLngSvcMgr.is())
+ return noMeanings;
+ stem = 1;
+
+ xSpell.set( xLngSvcMgr->getSpellChecker(), UNO_QUERY );
+ if (!xSpell.is() || !xSpell->isValid( SPELLML_SUPPORT, static_cast<sal_uInt16>(nLanguage), rProperties ))
+ return noMeanings;
+ Reference< XSpellAlternatives > xTmpRes = xSpell->spell( "<?xml?><query type='stem'><word>" +
+ aRTerm + "</word></query>", static_cast<sal_uInt16>(nLanguage), rProperties );
+ if (xTmpRes.is())
+ {
+ Sequence<OUString>seq = xTmpRes->getAlternatives();
+ if (seq.hasElements())
+ {
+ aRTerm = seq[0]; // XXX Use only the first stem
+ continue;
+ }
+ }
+
+ // stem the last word of the synonym (for categories after affixation)
+ aRTerm = aRTerm.trim();
+ sal_Int32 pos = aRTerm.lastIndexOf(' ');
+ if (!pos)
+ return noMeanings;
+ xTmpRes = xSpell->spell( OUString::Concat("<?xml?><query type='stem'><word>") +
+ aRTerm.subView(pos + 1) + "</word></query>", static_cast<sal_uInt16>(nLanguage), rProperties );
+ if (xTmpRes.is())
+ {
+ Sequence<OUString>seq = xTmpRes->getAlternatives();
+ if (seq.hasElements())
+ {
+ aPTerm = aRTerm.copy(pos + 1);
+ aRTerm = aRTerm.subView(0, pos + 1) + seq[0];
+#if 0
+ for (int i = 0; i < seq.getLength(); i++)
+ {
+ OString o = OUStringToOString(seq[i], RTL_TEXTENCODING_UTF8);
+ fprintf(stderr, "%d: %s\n", i + 1, o.pData->buffer);
+ }
+#endif
+ continue;
+ }
+ }
+ break;
+ }
+ return noMeanings;
+}
+
+OUString SAL_CALL Thesaurus::getServiceDisplayName(const Locale& rLocale)
+{
+ std::locale loc(Translate::Create("svt", LanguageTag(rLocale)));
+ return Translate::get(STR_DESCRIPTION_MYTHES, loc);
+}
+
+void SAL_CALL Thesaurus::initialize( const Sequence< Any >& rArguments )
+{
+ MutexGuard aGuard( GetLinguMutex() );
+
+ if (pPropHelper)
+ return;
+
+ sal_Int32 nLen = rArguments.getLength();
+ // Accept one of two args so we can be compatible with the call site in GetAvailLocales()
+ // linguistic module
+ if (1 == nLen || 2 == nLen)
+ {
+ Reference< XLinguProperties > xPropSet;
+ rArguments.getConstArray()[0] >>= xPropSet;
+ assert(xPropSet);
+
+ //! Pointer allows for access of the non-UNO functions.
+ //! And the reference to the UNO-functions while increasing
+ //! the ref-count and will implicitly free the memory
+ //! when the object is no longer used.
+ pPropHelper = new PropertyHelper_Thesaurus( static_cast<XThesaurus *>(this), xPropSet );
+ pPropHelper->AddAsPropListener(); //! after a reference is established
+ }
+ else
+ OSL_FAIL( "wrong number of arguments in sequence" );
+}
+
+OUString Thesaurus::makeLowerCase(const OUString& aTerm, CharClass const * pCC)
+{
+ if (pCC)
+ return pCC->lowercase(aTerm);
+ return aTerm;
+}
+
+OUString Thesaurus::makeUpperCase(const OUString& aTerm, CharClass const * pCC)
+{
+ if (pCC)
+ return pCC->uppercase(aTerm);
+ return aTerm;
+}
+
+OUString Thesaurus::makeInitCap(const OUString& aTerm, CharClass const * pCC)
+{
+ sal_Int32 tlen = aTerm.getLength();
+ if (pCC && tlen)
+ {
+ OUString bTemp = aTerm.copy(0,1);
+ if (tlen > 1)
+ {
+ return ( pCC->uppercase(bTemp, 0, 1)
+ + pCC->lowercase(aTerm,1,(tlen-1)) );
+ }
+
+ return pCC->uppercase(bTemp, 0, 1);
+ }
+ return aTerm;
+}
+
+void SAL_CALL Thesaurus::dispose()
+{
+ MutexGuard aGuard( GetLinguMutex() );
+
+ if (!bDisposing)
+ {
+ bDisposing = true;
+ EventObject aEvtObj( static_cast<XThesaurus *>(this) );
+ aEvtListeners.disposeAndClear( aEvtObj );
+ if (pPropHelper)
+ {
+ pPropHelper->RemoveAsPropListener();
+ delete pPropHelper;
+ pPropHelper = nullptr;
+ }
+ }
+}
+
+void SAL_CALL Thesaurus::addEventListener( const Reference< XEventListener >& rxListener )
+{
+ MutexGuard aGuard( GetLinguMutex() );
+
+ if (!bDisposing && rxListener.is())
+ aEvtListeners.addInterface( rxListener );
+}
+
+void SAL_CALL Thesaurus::removeEventListener( const Reference< XEventListener >& rxListener )
+{
+ MutexGuard aGuard( GetLinguMutex() );
+
+ if (!bDisposing && rxListener.is())
+ aEvtListeners.removeInterface( rxListener );
+}
+
+// Service specific part
+OUString SAL_CALL Thesaurus::getImplementationName()
+{
+ return "org.openoffice.lingu.new.Thesaurus";
+}
+
+sal_Bool SAL_CALL Thesaurus::supportsService( const OUString& ServiceName )
+{
+ return cppu::supportsService(this, ServiceName);
+}
+
+Sequence< OUString > SAL_CALL Thesaurus::getSupportedServiceNames()
+{
+ return { SN_THESAURUS };
+}
+
+extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
+lingucomponent_Thesaurus_get_implementation(
+ css::uno::XComponentContext* , css::uno::Sequence<css::uno::Any> const&)
+{
+ return cppu::acquire(new Thesaurus());
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */