diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 16:51:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 16:51:28 +0000 |
commit | 940b4d1848e8c70ab7642901a68594e8016caffc (patch) | |
tree | eb72f344ee6c3d9b80a7ecc079ea79e9fba8676d /lingucomponent/source/languageguessing | |
parent | Initial commit. (diff) | |
download | libreoffice-940b4d1848e8c70ab7642901a68594e8016caffc.tar.xz libreoffice-940b4d1848e8c70ab7642901a68594e8016caffc.zip |
Adding upstream version 1:7.0.4.upstream/1%7.0.4upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lingucomponent/source/languageguessing')
-rw-r--r-- | lingucomponent/source/languageguessing/guess.cxx | 103 | ||||
-rw-r--r-- | lingucomponent/source/languageguessing/guess.hxx | 56 | ||||
-rw-r--r-- | lingucomponent/source/languageguessing/guesslang.component | 25 | ||||
-rw-r--r-- | lingucomponent/source/languageguessing/guesslang.cxx | 360 | ||||
-rw-r--r-- | lingucomponent/source/languageguessing/simpleguesser.cxx | 227 | ||||
-rw-r--r-- | lingucomponent/source/languageguessing/simpleguesser.hxx | 112 |
6 files changed, 883 insertions, 0 deletions
diff --git a/lingucomponent/source/languageguessing/guess.cxx b/lingucomponent/source/languageguessing/guess.cxx new file mode 100644 index 000000000..45700ff7d --- /dev/null +++ b/lingucomponent/source/languageguessing/guess.cxx @@ -0,0 +1,103 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <sal/config.h> + +#include <cassert> +#include <iostream> +#include <string.h> + +#ifdef SYSTEM_LIBEXTTEXTCAT +#include <libexttextcat/textcat.h> +#else +#include <textcat.h> +#endif + +#include "guess.hxx" + +/* Old textcat.h versions defined bad spelled constants. */ +#ifndef TEXTCAT_RESULT_UNKNOWN_STR +#define TEXTCAT_RESULT_UNKNOWN_STR _TEXTCAT_RESULT_UNKOWN +#endif + +#ifndef TEXTCAT_RESULT_SHORT_STR +#define TEXTCAT_RESULT_SHORT_STR _TEXTCAT_RESULT_SHORT +#endif + +using namespace std; + +Guess::Guess() + : language_str(DEFAULT_LANGUAGE) + , country_str(DEFAULT_COUNTRY) +{ +} + +/* +* this use a char * string to build the guess object +* a string like those is made as : [language-country-encoding]... +* +*/ +Guess::Guess(const char * guess_str) + : language_str(DEFAULT_LANGUAGE) + , country_str(DEFAULT_COUNTRY) +{ + //if the guess is not like "UNKNOWN" or "SHORT", go into the brackets + if(strcmp(guess_str + 1, TEXTCAT_RESULT_UNKNOWN_STR) == 0 + || strcmp(guess_str + 1, TEXTCAT_RESULT_SHORT_STR) == 0) + return; + + // From how this ctor is called from SimpleGuesser::GuessLanguage and + // SimpleGuesser::GetManagedLanguages in + // lingucomponent/source/languageguessing/simpleguesser.cxx, guess_str must start with "[": + assert(guess_str[0] == GUESS_SEPARATOR_OPEN); + auto const start = guess_str + 1; + // Only look at the prefix of guess_str, delimited by the next "]" or "[" or end-of-string; + // split it into at most three segments separated by "-" (where excess occurrences of "-" + // would become part of the third segment), like "en-US-utf8"; the first segment denotes the + // language; if there are three segments, the second denotes the country and the third the + // encoding; otherwise, the second segment, if any (e.g., in "haw-utf8"), denotes the + // encoding: + char const * dash1 = nullptr; + char const * dash2 = nullptr; + auto p = start; + for (;; ++p) { + auto const c = *p; + if (c == '\0' || c == GUESS_SEPARATOR_OPEN || c == GUESS_SEPARATOR_CLOSE) { + break; + } + if (c == GUESS_SEPARATOR_SEP) { + if (dash1 == nullptr) { + dash1 = p; + } else { + dash2 = p; + // The encoding is ignored, so we can stop as soon as we found the second "-": + break; + } + } + } + auto const langLen = (dash1 == nullptr ? p : dash1) - start; + if (langLen != 0) { // if not we use the default value + language_str.assign(start, langLen); + } + if (dash2 != nullptr) { + country_str.assign(dash1 + 1, dash2 - (dash1 + 1)); + } +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lingucomponent/source/languageguessing/guess.hxx b/lingucomponent/source/languageguessing/guess.hxx new file mode 100644 index 000000000..e68d852a5 --- /dev/null +++ b/lingucomponent/source/languageguessing/guess.hxx @@ -0,0 +1,56 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ +#ifndef INCLUDED_LINGUCOMPONENT_SOURCE_LANGUAGEGUESSING_GUESS_HXX +#define INCLUDED_LINGUCOMPONENT_SOURCE_LANGUAGEGUESSING_GUESS_HXX + +#define GUESS_SEPARATOR_OPEN '[' +#define GUESS_SEPARATOR_CLOSE ']' +#define GUESS_SEPARATOR_SEP '-' +#define DEFAULT_LANGUAGE "" +#define DEFAULT_COUNTRY "" +#define DEFAULT_ENCODING "" + +#include <string> + +using namespace std; + +class Guess final { + public: + + /** + * Default init + */ + Guess(); + + /** + * Init from a string like [en-UK-utf8] and the rank + */ + Guess(const char * guess_str); + + const string& GetLanguage() const { return language_str;} + const string& GetCountry() const { return country_str;} + + private: + string language_str; + string country_str; +}; + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lingucomponent/source/languageguessing/guesslang.component b/lingucomponent/source/languageguessing/guesslang.component new file mode 100644 index 000000000..8e2f2c001 --- /dev/null +++ b/lingucomponent/source/languageguessing/guesslang.component @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + --> + +<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@" + prefix="guesslang" xmlns="http://openoffice.org/2010/uno-components"> + <implementation name="com.sun.star.lingu2.LanguageGuessing"> + <service name="com.sun.star.linguistic2.LanguageGuessing"/> + </implementation> +</component> diff --git a/lingucomponent/source/languageguessing/guesslang.cxx b/lingucomponent/source/languageguessing/guesslang.cxx new file mode 100644 index 000000000..7e9d1999b --- /dev/null +++ b/lingucomponent/source/languageguessing/guesslang.cxx @@ -0,0 +1,360 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <iostream> + +#include <osl/file.hxx> +#include <tools/debug.hxx> + +#include <sal/config.h> +#include <cppuhelper/factory.hxx> +#include <cppuhelper/implementationentry.hxx> +#include <cppuhelper/implbase.hxx> +#include <cppuhelper/supportsservice.hxx> + +#include "simpleguesser.hxx" +#include "guess.hxx" + +#include <com/sun/star/lang/IllegalArgumentException.hpp> +#include <com/sun/star/lang/XServiceInfo.hpp> +#include <com/sun/star/linguistic2/XLanguageGuessing.hpp> +#include <unotools/pathoptions.hxx> +#include <osl/thread.h> + +#include <sal/macros.h> + +#ifdef SYSTEM_LIBEXTTEXTCAT +#include <libexttextcat/textcat.h> +#else +#include <textcat.h> +#endif + +using namespace ::osl; +using namespace ::cppu; +using namespace ::com::sun::star; +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::lang; +using namespace ::com::sun::star::linguistic2; + +#define SERVICENAME "com.sun.star.linguistic2.LanguageGuessing" +#define IMPLNAME "com.sun.star.lingu2.LanguageGuessing" + +static Sequence< OUString > getSupportedServiceNames_LangGuess_Impl() +{ + Sequence<OUString> names { SERVICENAME }; + return names; +} + +static OUString getImplementationName_LangGuess_Impl() +{ + return IMPLNAME; +} + +static osl::Mutex & GetLangGuessMutex() +{ + static osl::Mutex aMutex; + return aMutex; +} + +namespace { + +class LangGuess_Impl : + public ::cppu::WeakImplHelper< + XLanguageGuessing, + XServiceInfo > +{ + SimpleGuesser m_aGuesser; + bool m_bInitialized; + + virtual ~LangGuess_Impl() override {} + void EnsureInitialized(); + +public: + LangGuess_Impl(); + LangGuess_Impl(const LangGuess_Impl&) = delete; + LangGuess_Impl& operator=(const LangGuess_Impl&) = delete; + + // XServiceInfo implementation + virtual OUString SAL_CALL getImplementationName( ) override; + virtual sal_Bool SAL_CALL supportsService( const OUString& ServiceName ) override; + virtual Sequence< OUString > SAL_CALL getSupportedServiceNames( ) override; + + // XLanguageGuessing implementation + virtual css::lang::Locale SAL_CALL guessPrimaryLanguage( const OUString& aText, ::sal_Int32 nStartPos, ::sal_Int32 nLen ) override; + virtual void SAL_CALL disableLanguages( const css::uno::Sequence< css::lang::Locale >& aLanguages ) override; + virtual void SAL_CALL enableLanguages( const css::uno::Sequence< css::lang::Locale >& aLanguages ) override; + virtual css::uno::Sequence< css::lang::Locale > SAL_CALL getAvailableLanguages( ) override; + virtual css::uno::Sequence< css::lang::Locale > SAL_CALL getEnabledLanguages( ) override; + virtual css::uno::Sequence< css::lang::Locale > SAL_CALL getDisabledLanguages( ) override; + + // implementation specific + /// @throws RuntimeException + void SetFingerPrintsDB( const OUString &fileName ); +}; + +} + +LangGuess_Impl::LangGuess_Impl() : + m_bInitialized( false ) +{ +} + +void LangGuess_Impl::EnsureInitialized() +{ + if (m_bInitialized) + return; + + // set this to true at the very start to prevent loops because of + // implicitly called functions below + m_bInitialized = true; + + // set default fingerprint path to where those get installed + OUString aPhysPath; + OUString aURL( SvtPathOptions().GetFingerprintPath() ); + osl::FileBase::getSystemPathFromFileURL( aURL, aPhysPath ); +#ifdef _WIN32 + aPhysPath += "\\"; +#else + aPhysPath += "/"; +#endif + + SetFingerPrintsDB( aPhysPath ); + +#if !defined(EXTTEXTCAT_VERSION_MAJOR) + + // disable currently not functional languages... + struct LangCountry + { + const char *pLang; + const char *pCountry; + }; + LangCountry aDisable[] = + { + // not functional in modified libtextcat, but fixed in >= libexttextcat 3.1.0 + // which is the first with EXTTEXTCAT_VERSION_MAJOR defined + {"sco", ""}, {"zh", "CN"}, {"zh", "TW"}, {"ja", ""}, {"ko", ""}, + {"ka", ""}, {"hi", ""}, {"mr", ""}, {"ne", ""}, {"sa", ""}, + {"ta", ""}, {"th", ""}, {"qu", ""}, {"yi", ""} + }; + sal_Int32 nNum = SAL_N_ELEMENTS(aDisable); + Sequence< Locale > aDisableSeq( nNum ); + Locale *pDisableSeq = aDisableSeq.getArray(); + for (sal_Int32 i = 0; i < nNum; ++i) + { + Locale aLocale; + aLocale.Language = OUString::createFromAscii( aDisable[i].pLang ); + aLocale.Country = OUString::createFromAscii( aDisable[i].pCountry ); + pDisableSeq[i] = aLocale; + } + disableLanguages( aDisableSeq ); + DBG_ASSERT( nNum == getDisabledLanguages().getLength(), "size mismatch" ); +#endif +} + +Locale SAL_CALL LangGuess_Impl::guessPrimaryLanguage( + const OUString& rText, + ::sal_Int32 nStartPos, + ::sal_Int32 nLen ) +{ + osl::MutexGuard aGuard( GetLangGuessMutex() ); + + EnsureInitialized(); + + if (nStartPos < 0 || nLen < 0 || nStartPos + nLen > rText.getLength()) + throw lang::IllegalArgumentException(); + + OString o( OUStringToOString( rText.copy(nStartPos, nLen), RTL_TEXTENCODING_UTF8 ) ); + Guess g = m_aGuesser.GuessPrimaryLanguage(o.getStr()); + lang::Locale aRes; + aRes.Language = OUString::createFromAscii( g.GetLanguage().c_str() ); + aRes.Country = OUString::createFromAscii( g.GetCountry().c_str() ); + return aRes; +} + +#define DEFAULT_CONF_FILE_NAME "fpdb.conf" + +void LangGuess_Impl::SetFingerPrintsDB( + const OUString &filePath ) +{ + //! text encoding for file name / path needs to be in the same encoding the OS uses + OString path = OUStringToOString( filePath, osl_getThreadTextEncoding() ); + OString conf_file_path = path + DEFAULT_CONF_FILE_NAME; + + m_aGuesser.SetDBPath(conf_file_path.getStr(), path.getStr()); +} + +uno::Sequence< Locale > SAL_CALL LangGuess_Impl::getAvailableLanguages( ) +{ + osl::MutexGuard aGuard( GetLangGuessMutex() ); + + EnsureInitialized(); + + Sequence< css::lang::Locale > aRes; + vector<Guess> gs = m_aGuesser.GetAllManagedLanguages(); + aRes.realloc(gs.size()); + + css::lang::Locale *pRes = aRes.getArray(); + + for(size_t i = 0; i < gs.size() ; i++ ){ + css::lang::Locale current_aRes; + current_aRes.Language = OUString::createFromAscii( gs[i].GetLanguage().c_str() ); + current_aRes.Country = OUString::createFromAscii( gs[i].GetCountry().c_str() ); + pRes[i] = current_aRes; + } + + return aRes; +} + +uno::Sequence< Locale > SAL_CALL LangGuess_Impl::getEnabledLanguages( ) +{ + osl::MutexGuard aGuard( GetLangGuessMutex() ); + + EnsureInitialized(); + + Sequence< css::lang::Locale > aRes; + vector<Guess> gs = m_aGuesser.GetAvailableLanguages(); + aRes.realloc(gs.size()); + + css::lang::Locale *pRes = aRes.getArray(); + + for(size_t i = 0; i < gs.size() ; i++ ){ + css::lang::Locale current_aRes; + current_aRes.Language = OUString::createFromAscii( gs[i].GetLanguage().c_str() ); + current_aRes.Country = OUString::createFromAscii( gs[i].GetCountry().c_str() ); + pRes[i] = current_aRes; + } + + return aRes; +} + +uno::Sequence< Locale > SAL_CALL LangGuess_Impl::getDisabledLanguages( ) +{ + osl::MutexGuard aGuard( GetLangGuessMutex() ); + + EnsureInitialized(); + + Sequence< css::lang::Locale > aRes; + vector<Guess> gs = m_aGuesser.GetUnavailableLanguages(); + aRes.realloc(gs.size()); + + css::lang::Locale *pRes = aRes.getArray(); + + for(size_t i = 0; i < gs.size() ; i++ ){ + css::lang::Locale current_aRes; + current_aRes.Language = OUString::createFromAscii( gs[i].GetLanguage().c_str() ); + current_aRes.Country = OUString::createFromAscii( gs[i].GetCountry().c_str() ); + pRes[i] = current_aRes; + } + + return aRes; +} + +void SAL_CALL LangGuess_Impl::disableLanguages( + const uno::Sequence< Locale >& rLanguages ) +{ + osl::MutexGuard aGuard( GetLangGuessMutex() ); + + EnsureInitialized(); + + for (const Locale& rLanguage : rLanguages) + { + string language; + + OString l = OUStringToOString( rLanguage.Language, RTL_TEXTENCODING_ASCII_US ); + OString c = OUStringToOString( rLanguage.Country, RTL_TEXTENCODING_ASCII_US ); + + language += l.getStr(); + language += "-"; + language += c.getStr(); + m_aGuesser.DisableLanguage(language); + } +} + +void SAL_CALL LangGuess_Impl::enableLanguages( + const uno::Sequence< Locale >& rLanguages ) +{ + osl::MutexGuard aGuard( GetLangGuessMutex() ); + + EnsureInitialized(); + + for (const Locale& rLanguage : rLanguages) + { + string language; + + OString l = OUStringToOString( rLanguage.Language, RTL_TEXTENCODING_ASCII_US ); + OString c = OUStringToOString( rLanguage.Country, RTL_TEXTENCODING_ASCII_US ); + + language += l.getStr(); + language += "-"; + language += c.getStr(); + m_aGuesser.EnableLanguage(language); + } +} + +OUString SAL_CALL LangGuess_Impl::getImplementationName( ) +{ + return IMPLNAME; +} + +sal_Bool SAL_CALL LangGuess_Impl::supportsService( const OUString& ServiceName ) +{ + return cppu::supportsService(this, ServiceName); +} + +Sequence<OUString> SAL_CALL LangGuess_Impl::getSupportedServiceNames( ) +{ + return { SERVICENAME }; +} + +/** + * Function to create a new component instance; is needed by factory helper implementation. + * @param xMgr service manager to if the components needs other component instances + */ +static Reference< XInterface > LangGuess_Impl_create( + Reference< XComponentContext > const & ) +{ + return static_cast< ::cppu::OWeakObject * >( new LangGuess_Impl ); +} + +//#### EXPORTED ### functions to allow for registration and creation of the UNO component +static const struct ::cppu::ImplementationEntry s_component_entries [] = +{ + { + LangGuess_Impl_create, getImplementationName_LangGuess_Impl, + getSupportedServiceNames_LangGuess_Impl, + ::cppu::createSingleComponentFactory, + nullptr, 0 + }, + { nullptr, nullptr, nullptr, nullptr, nullptr, 0 } +}; + +extern "C" +{ + +SAL_DLLPUBLIC_EXPORT void * guesslang_component_getFactory( + char const * implName, void * xMgr, + void * xRegistry ) +{ + return ::cppu::component_getFactoryHelper( + implName, xMgr, xRegistry, s_component_entries ); +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lingucomponent/source/languageguessing/simpleguesser.cxx b/lingucomponent/source/languageguessing/simpleguesser.cxx new file mode 100644 index 000000000..76b3b65c3 --- /dev/null +++ b/lingucomponent/source/languageguessing/simpleguesser.cxx @@ -0,0 +1,227 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + + /** + * + * + * + * + * TODO + * - Add exception throwing when h == NULL + * - Not init h when implicit constructor is launched + */ + +#include <string.h> +#include <sstream> +#include <iostream> + +#ifdef SYSTEM_LIBEXTTEXTCAT +#include <libexttextcat/textcat.h> +#include <libexttextcat/common.h> +#include <libexttextcat/constants.h> +#include <libexttextcat/fingerprint.h> +#include <libexttextcat/utf8misc.h> +#else +#include <textcat.h> +#include <common.h> +#include <constants.h> +#include <fingerprint.h> +#include <utf8misc.h> +#endif + +#include <sal/types.h> + +#include<rtl/character.hxx> +#include "simpleguesser.hxx" + +using namespace std; + +static int startsAsciiCaseInsensitive(const std::string &s1, const std::string &s2){ + size_t i; + int ret = 0; + + size_t min = s1.length(); + if (min > s2.length()) + min = s2.length(); + + for(i = 0; i < min && s2[i] && s1[i] && !ret; i++){ + ret = rtl::toAsciiUpperCase(static_cast<unsigned char>(s1[i])) + - rtl::toAsciiUpperCase(static_cast<unsigned char>(s2[i])); + if(s1[i] == '.' || s2[i] == '.') {ret = 0;} //. is a neutral character + } + return ret; + } + +namespace { + +/** + * This following structure is from textcat.c + */ +typedef struct textcat_t{ + + void **fprint; + char *fprint_disable; + uint4 size; + uint4 maxsize; + + char output[MAXOUTPUTSIZE]; + +} textcat_t; +// end of the 3 structs + +} + +SimpleGuesser::SimpleGuesser() +{ + h = nullptr; +} + +SimpleGuesser& SimpleGuesser::operator=(const SimpleGuesser& sg){ + // Check for self-assignment! + if (this == &sg) // Same object? + return *this; // Yes, so skip assignment, and just return *this. + + if(h){textcat_Done(h);} + h = sg.h; + return *this; +} + +SimpleGuesser::~SimpleGuesser() +{ + if(h){textcat_Done(h);} +} + +/*! + \fn SimpleGuesser::GuessLanguage(char* text) + */ +vector<Guess> SimpleGuesser::GuessLanguage(const char* text) +{ + vector<Guess> guesses; + + if (!h) + return guesses; + + int len = strlen(text); + + if (len > MAX_STRING_LENGTH_TO_ANALYSE) + len = MAX_STRING_LENGTH_TO_ANALYSE; + + const char *guess_list = textcat_Classify(h, text, len); + + if (strcmp(guess_list, TEXTCAT_RESULT_SHORT_STR) == 0) + return guesses; + + int current_pointer = 0; + + for(int i = 0; guess_list[current_pointer] != '\0'; i++) + { + while (guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0') + current_pointer++; + if(guess_list[current_pointer] != '\0') + { + Guess g(guess_list + current_pointer); + + guesses.push_back(g); + + current_pointer++; + } + } + + return guesses; +} + +Guess SimpleGuesser::GuessPrimaryLanguage(const char* text) +{ + vector<Guess> ret = GuessLanguage(text); + return ret.empty() ? Guess() : ret[0]; +} +/** + * Is used to know which language is available, unavailable or both + * when mask = 0xF0, return only Available + * when mask = 0x0F, return only Unavailable + * when mask = 0xFF, return both Available and Unavailable + */ +vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask) +{ + textcat_t *tables = static_cast<textcat_t*>(h); + + vector<Guess> lang; + if(!h){return lang;} + + for (size_t i=0; i<tables->size; ++i) + { + if (tables->fprint_disable[i] & mask) + { + string langStr = "["; + langStr += fp_Name(tables->fprint[i]); + Guess g(langStr.c_str()); + lang.push_back(g); + } + } + + return lang; +} + +vector<Guess> SimpleGuesser::GetAvailableLanguages() +{ + return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) ); +} + +vector<Guess> SimpleGuesser::GetUnavailableLanguages() +{ + return GetManagedLanguages( sal::static_int_cast< char >( 0x0F )); +} + +vector<Guess> SimpleGuesser::GetAllManagedLanguages() +{ + return GetManagedLanguages( sal::static_int_cast< char >( 0xFF )); +} + +void SimpleGuesser::XableLanguage(const string& lang, char mask) +{ + textcat_t *tables = static_cast<textcat_t*>(h); + + if(!h){return;} + + for (size_t i=0; i<tables->size; i++) + { + string language(fp_Name(tables->fprint[i])); + if (startsAsciiCaseInsensitive(language,lang) == 0) + tables->fprint_disable[i] = mask; + } +} + +void SimpleGuesser::EnableLanguage(const string& lang) +{ + XableLanguage(lang, sal::static_int_cast< char >( 0xF0 )); +} + +void SimpleGuesser::DisableLanguage(const string& lang) +{ + XableLanguage(lang, sal::static_int_cast< char >( 0x0F )); +} + +void SimpleGuesser::SetDBPath(const char* path, const char* prefix) +{ + if (h) + textcat_Done(h); + h = special_textcat_Init(path, prefix); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/lingucomponent/source/languageguessing/simpleguesser.hxx b/lingucomponent/source/languageguessing/simpleguesser.hxx new file mode 100644 index 000000000..34abf26d4 --- /dev/null +++ b/lingucomponent/source/languageguessing/simpleguesser.hxx @@ -0,0 +1,112 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ +#ifndef INCLUDED_LINGUCOMPONENT_SOURCE_LANGUAGEGUESSING_SIMPLEGUESSER_HXX +#define INCLUDED_LINGUCOMPONENT_SOURCE_LANGUAGEGUESSING_SIMPLEGUESSER_HXX + +#include <string.h> +#include <string> +#include <cstdlib> +#include <vector> +#include "guess.hxx" + +#define MAX_STRING_LENGTH_TO_ANALYSE 200 + +using namespace std; + +class SimpleGuesser final { +public: + /**inits the object with conf file "./conf.txt"*/ + SimpleGuesser(); + + /** + * @param SimpleGuesser& sg the other guesser + */ + SimpleGuesser& operator=(const SimpleGuesser& sg); + + /** + * destroy the object + */ + ~SimpleGuesser(); + + /** + * Analyze a text and return the most probable languages of the text + * @param char* text is the text to analyze + * @return the list of guess + */ + vector<Guess> GuessLanguage(const char* text); + + /** + * Analyze a text and return the most probable language of the text + * @param char* text is the text to analyze + * @return the guess (containing language) + */ + Guess GuessPrimaryLanguage(const char* text); + + /** + * List all available languages (possibly to be in guesses) + * @return the list of languages + */ + vector<Guess> GetAvailableLanguages(); + + /** + * List all languages (possibly in guesses or not) + * @return the list of languages + */ + vector<Guess> GetAllManagedLanguages(); + + /** + * List all Unavailable languages (disable for any reason) + * @return the list of languages + */ + vector<Guess> GetUnavailableLanguages(); + + /** + * Mark a language enabled + * @param string lang the language to enable (build like language-COUNTRY-encoding) + */ + void EnableLanguage(const string& lang); + + /** + * Mark a language disabled + * @param string lang the language to disable (build like language-COUNTRY-encoding) + */ + void DisableLanguage(const string& lang); + + /** + * Load a new DB of fingerprints + * @param const char* thePathOfConfFile self explaining + * @param const char* prefix is the path where the directory which contains fingerprint files is stored + */ + void SetDBPath(const char* thePathOfConfFile, const char* prefix); + +private: + + //Where typical fingerprints (n-gram tables) are stored + void* h; + + //Is used to select languages into the fingerprints DB, the mask is used to indicate if we want enabled disabled or both + vector<Guess> GetManagedLanguages(const char mask); + + //Like getManagedLanguages, this function enable or disable a language and it depends of the mask + void XableLanguage(const string& lang, char mask); +}; + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |