summaryrefslogtreecommitdiffstats
path: root/lingucomponent/source/languageguessing
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:06:44 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:06:44 +0000
commited5640d8b587fbcfed7dd7967f3de04b37a76f26 (patch)
tree7a5f7c6c9d02226d7471cb3cc8fbbf631b415303 /lingucomponent/source/languageguessing
parentInitial commit. (diff)
downloadlibreoffice-upstream/4%7.4.7.tar.xz
libreoffice-upstream/4%7.4.7.zip
Adding upstream version 4:7.4.7.upstream/4%7.4.7upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--lingucomponent/source/languageguessing/guess.cxx100
-rw-r--r--lingucomponent/source/languageguessing/guess.hxx54
-rw-r--r--lingucomponent/source/languageguessing/guesslang.component26
-rw-r--r--lingucomponent/source/languageguessing/guesslang.cxx321
-rw-r--r--lingucomponent/source/languageguessing/simpleguesser.cxx221
-rw-r--r--lingucomponent/source/languageguessing/simpleguesser.hxx108
6 files changed, 830 insertions, 0 deletions
diff --git a/lingucomponent/source/languageguessing/guess.cxx b/lingucomponent/source/languageguessing/guess.cxx
new file mode 100644
index 000000000..a7cbeccab
--- /dev/null
+++ b/lingucomponent/source/languageguessing/guess.cxx
@@ -0,0 +1,100 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <sal/config.h>
+
+#include <cassert>
+#include <string.h>
+
+#ifdef SYSTEM_LIBEXTTEXTCAT
+#include <libexttextcat/textcat.h>
+#else
+#include <textcat.h>
+#endif
+
+#include "guess.hxx"
+
+/* Old textcat.h versions defined misspelled constants. */
+#ifndef TEXTCAT_RESULT_UNKNOWN_STR
+#define TEXTCAT_RESULT_UNKNOWN_STR _TEXTCAT_RESULT_UNKOWN
+#endif
+
+#ifndef TEXTCAT_RESULT_SHORT_STR
+#define TEXTCAT_RESULT_SHORT_STR _TEXTCAT_RESULT_SHORT
+#endif
+
+/**
+ * Builds an empty guess: language and country are both initialised from the
+ * DEFAULT_LANGUAGE / DEFAULT_COUNTRY placeholders declared in guess.hxx.
+ */
+Guess::Guess()
+ : language_str(DEFAULT_LANGUAGE)
+ , country_str(DEFAULT_COUNTRY)
+{
+}
+
+/**
+ * Builds a guess from the first bracketed entry of a textcat result string.
+ *
+ * An entry looks like "[language-country-encoding]" or "[language-encoding]". Only the
+ * language and, when three segments are present, the country are stored; the encoding is
+ * ignored.
+ *
+ * @param guess_str NUL-terminated string that is expected to start with "[". A null or empty
+ *        string, or one whose text after the first character is exactly the textcat
+ *        "unknown" / "short" marker, leaves both fields at their defaults.
+ */
+Guess::Guess(const char * guess_str)
+    : language_str(DEFAULT_LANGUAGE)
+    , country_str(DEFAULT_COUNTRY)
+{
+    // Everything below looks at guess_str + 1; for a null or empty string that address is
+    // already out of bounds, so bail out before touching it:
+    if (guess_str == nullptr || guess_str[0] == '\0')
+        return;
+
+    //if the guess is not like "UNKNOWN" or "SHORT", go into the brackets
+    if(strcmp(guess_str + 1, TEXTCAT_RESULT_UNKNOWN_STR) == 0
+       || strcmp(guess_str + 1, TEXTCAT_RESULT_SHORT_STR) == 0)
+        return;
+
+    // From how this ctor is called from SimpleGuesser::GuessLanguage and
+    // SimpleGuesser::GetManagedLanguages in
+    // lingucomponent/source/languageguessing/simpleguesser.cxx, guess_str must start with "[":
+    assert(guess_str[0] == GUESS_SEPARATOR_OPEN);
+    auto const start = guess_str + 1;
+    // Only look at the prefix of guess_str, delimited by the next "]" or "[" or end-of-string;
+    // split it into at most three segments separated by "-" (where excess occurrences of "-"
+    // would become part of the third segment), like "en-US-utf8"; the first segment denotes the
+    // language; if there are three segments, the second denotes the country and the third the
+    // encoding; otherwise, the second segment, if any (e.g., in "haw-utf8"), denotes the
+    // encoding:
+    char const * dash1 = nullptr;
+    char const * dash2 = nullptr;
+    auto p = start;
+    for (;; ++p) {
+        auto const c = *p;
+        if (c == '\0' || c == GUESS_SEPARATOR_OPEN || c == GUESS_SEPARATOR_CLOSE) {
+            break;
+        }
+        if (c == GUESS_SEPARATOR_SEP) {
+            if (dash1 == nullptr) {
+                dash1 = p;
+            } else {
+                dash2 = p;
+                // The encoding is ignored, so we can stop as soon as we found the second "-":
+                break;
+            }
+        }
+    }
+    // The language ends at the first "-" if there is one, otherwise where the scan stopped:
+    auto const langLen = (dash1 == nullptr ? p : dash1) - start;
+    if (langLen != 0) { // if not we use the default value
+        language_str.assign(start, langLen);
+    }
+    // A country is only present when a second "-" closed the middle segment:
+    if (dash2 != nullptr) {
+        country_str.assign(dash1 + 1, dash2 - (dash1 + 1));
+    }
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lingucomponent/source/languageguessing/guess.hxx b/lingucomponent/source/languageguessing/guess.hxx
new file mode 100644
index 000000000..627033d3a
--- /dev/null
+++ b/lingucomponent/source/languageguessing/guess.hxx
@@ -0,0 +1,54 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+#ifndef INCLUDED_LINGUCOMPONENT_SOURCE_LANGUAGEGUESSING_GUESS_HXX
+#define INCLUDED_LINGUCOMPONENT_SOURCE_LANGUAGEGUESSING_GUESS_HXX
+
+#define GUESS_SEPARATOR_OPEN '['
+#define GUESS_SEPARATOR_CLOSE ']'
+#define GUESS_SEPARATOR_SEP '-'
+#define DEFAULT_LANGUAGE ""
+#define DEFAULT_COUNTRY ""
+#define DEFAULT_ENCODING ""
+
+#include <string>
+
+/**
+ * One language guess: a language code plus an optional country code, both kept
+ * as plain strings. The encoding part of a textcat entry is not stored.
+ */
+class Guess final {
+ public:
+
+ /**
+ * Creates a guess whose language and country are the DEFAULT_LANGUAGE /
+ * DEFAULT_COUNTRY placeholders (empty strings).
+ */
+ Guess();
+
+ /**
+ * Initialises the guess from a bracketed string such as "[en-UK-utf8]";
+ * only the language and country segments are kept.
+ * NOTE(review): this single-argument constructor is not explicit, so a
+ * const char * converts to Guess implicitly.
+ */
+ Guess(const char * guess_str);
+
+ /** Language segment of the guess; the default (empty) when none was parsed. */
+ const std::string& GetLanguage() const { return language_str;}
+ /** Country segment of the guess; the default (empty) when none was parsed. */
+ const std::string& GetCountry() const { return country_str;}
+
+ private:
+ std::string language_str;
+ std::string country_str;
+};
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lingucomponent/source/languageguessing/guesslang.component b/lingucomponent/source/languageguessing/guesslang.component
new file mode 100644
index 000000000..75f6e7ce2
--- /dev/null
+++ b/lingucomponent/source/languageguessing/guesslang.component
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ -->
+
+<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@"
+ xmlns="http://openoffice.org/2010/uno-components">
+ <implementation name="com.sun.star.lingu2.LanguageGuessing"
+ constructor="lingucomponent_LangGuess_get_implementation">
+ <service name="com.sun.star.linguistic2.LanguageGuessing"/>
+ </implementation>
+</component>
diff --git a/lingucomponent/source/languageguessing/guesslang.cxx b/lingucomponent/source/languageguessing/guesslang.cxx
new file mode 100644
index 000000000..d6d5803a5
--- /dev/null
+++ b/lingucomponent/source/languageguessing/guesslang.cxx
@@ -0,0 +1,321 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include <iostream>
+#include <mutex>
+#include <string_view>
+
+#include <osl/file.hxx>
+#include <tools/debug.hxx>
+
+#include <sal/config.h>
+#include <cppuhelper/factory.hxx>
+#include <cppuhelper/implbase.hxx>
+#include <cppuhelper/supportsservice.hxx>
+
+#include "simpleguesser.hxx"
+#include "guess.hxx"
+
+#include <com/sun/star/lang/IllegalArgumentException.hpp>
+#include <com/sun/star/lang/XServiceInfo.hpp>
+#include <com/sun/star/linguistic2/XLanguageGuessing.hpp>
+#include <unotools/pathoptions.hxx>
+#include <osl/thread.h>
+
+#include <sal/macros.h>
+
+#ifdef SYSTEM_LIBEXTTEXTCAT
+#include <libexttextcat/textcat.h>
+#else
+#include <textcat.h>
+#endif
+
+using namespace ::std;
+using namespace ::osl;
+using namespace ::cppu;
+using namespace ::com::sun::star;
+using namespace ::com::sun::star::uno;
+using namespace ::com::sun::star::lang;
+using namespace ::com::sun::star::linguistic2;
+
+/**
+ * Returns the one mutex used by every LangGuess_Impl method in this file.
+ * It is a function-local static, so it is created on first use. It is a plain
+ * std::mutex, i.e. it must not be locked again by a thread that already holds it.
+ */
+static std::mutex & GetLangGuessMutex()
+{
+ static std::mutex aMutex;
+ return aMutex;
+}
+
+namespace {
+
+/**
+ * UNO implementation of XLanguageGuessing / XServiceInfo that forwards to a
+ * SimpleGuesser. The guesser is set up lazily by EnsureInitialized() on the
+ * first call of any XLanguageGuessing method.
+ */
+class LangGuess_Impl :
+ public ::cppu::WeakImplHelper<
+ XLanguageGuessing,
+ XServiceInfo >
+{
+ // wrapper around the textcat handle that does the actual guessing
+ SimpleGuesser m_aGuesser;
+ // true once EnsureInitialized() has run (it is set before the work starts)
+ bool m_bInitialized;
+
+ virtual ~LangGuess_Impl() override {}
+ // loads the fingerprint database on first use; callers hold GetLangGuessMutex()
+ void EnsureInitialized();
+
+public:
+ LangGuess_Impl();
+ // not copyable
+ LangGuess_Impl(const LangGuess_Impl&) = delete;
+ LangGuess_Impl& operator=(const LangGuess_Impl&) = delete;
+
+ // XServiceInfo implementation
+ virtual OUString SAL_CALL getImplementationName( ) override;
+ virtual sal_Bool SAL_CALL supportsService( const OUString& ServiceName ) override;
+ virtual Sequence< OUString > SAL_CALL getSupportedServiceNames( ) override;
+
+ // XLanguageGuessing implementation
+ virtual css::lang::Locale SAL_CALL guessPrimaryLanguage( const OUString& aText, ::sal_Int32 nStartPos, ::sal_Int32 nLen ) override;
+ virtual void SAL_CALL disableLanguages( const css::uno::Sequence< css::lang::Locale >& aLanguages ) override;
+ virtual void SAL_CALL enableLanguages( const css::uno::Sequence< css::lang::Locale >& aLanguages ) override;
+ virtual css::uno::Sequence< css::lang::Locale > SAL_CALL getAvailableLanguages( ) override;
+ virtual css::uno::Sequence< css::lang::Locale > SAL_CALL getEnabledLanguages( ) override;
+ virtual css::uno::Sequence< css::lang::Locale > SAL_CALL getDisabledLanguages( ) override;
+
+ // implementation specific
+ // Points m_aGuesser at the fingerprint directory given by the argument; the
+ // definition names the parameter filePath and appends "fpdb.conf" to it.
+ /// @throws RuntimeException
+ void SetFingerPrintsDB( std::u16string_view fileName );
+};
+
+}
+
+// Starts uninitialised; the fingerprint database is only loaded by
+// EnsureInitialized() when the first XLanguageGuessing method is called.
+LangGuess_Impl::LangGuess_Impl() :
+ m_bInitialized( false )
+{
+}
+
+/**
+ * Loads the fingerprint database from the installation's fingerprint path the
+ * first time it is called; later calls return immediately.
+ *
+ * Every caller in this file invokes it while holding GetLangGuessMutex(), so
+ * nothing in here may lock that mutex again.
+ */
+void LangGuess_Impl::EnsureInitialized()
+{
+    if (m_bInitialized)
+        return;
+
+    // set this to true at the very start so that the work below is attempted
+    // exactly once, whatever happens further down
+    m_bInitialized = true;
+
+    // set default fingerprint path to where those get installed
+    OUString aPhysPath;
+    OUString aURL( SvtPathOptions().GetFingerprintPath() );
+    osl::FileBase::getSystemPathFromFileURL( aURL, aPhysPath );
+    // SetFingerPrintsDB appends the conf file name directly, so the directory
+    // needs a trailing separator
+#ifdef _WIN32
+    aPhysPath += "\\";
+#else
+    aPhysPath += "/";
+#endif
+
+    SetFingerPrintsDB( aPhysPath );
+
+#if !defined(EXTTEXTCAT_VERSION_MAJOR)
+
+    // disable currently not functional languages...
+    struct LangCountry
+    {
+        const char *pLang;
+        const char *pCountry;
+    };
+    const LangCountry aDisable[] =
+    {
+        // not functional in modified libtextcat, but fixed in >= libexttextcat 3.1.0
+        // which is the first with EXTTEXTCAT_VERSION_MAJOR defined
+        {"sco", ""}, {"zh", "CN"}, {"zh", "TW"}, {"ja", ""}, {"ko", ""},
+        {"ka", ""}, {"hi", ""}, {"mr", ""}, {"ne", ""}, {"sa", ""},
+        {"ta", ""}, {"th", ""}, {"qu", ""}, {"yi", ""}
+    };
+    // The caller already owns GetLangGuessMutex(), a non-recursive std::mutex. The public
+    // disableLanguages() / getDisabledLanguages() lock it again, which is undefined behaviour
+    // (in practice a self-deadlock), so drive the guesser directly and build the same
+    // "language-country" key disableLanguages() would have built:
+    for (const LangCountry& rEntry : aDisable)
+    {
+        std::string language( rEntry.pLang );
+        language += "-";
+        language += rEntry.pCountry;
+        m_aGuesser.DisableLanguage(language);
+    }
+    DBG_ASSERT( SAL_N_ELEMENTS(aDisable) == m_aGuesser.GetUnavailableLanguages().size(), "size mismatch" );
+#endif
+}
+
+/**
+ * Guesses the language of a slice of rText.
+ *
+ * @param rText     text to analyse
+ * @param nStartPos index of the first UTF-16 code unit of the slice
+ * @param nLen      number of code units in the slice
+ * @return locale whose Language / Country come from the best guess; both are
+ *         empty when the guesser produced no guess
+ * @throws lang::IllegalArgumentException if the slice is not inside rText
+ */
+Locale SAL_CALL LangGuess_Impl::guessPrimaryLanguage(
+        const OUString& rText,
+        ::sal_Int32 nStartPos,
+        ::sal_Int32 nLen )
+{
+    std::scoped_lock aGuard( GetLangGuessMutex() );
+
+    EnsureInitialized();
+
+    // Written without "nStartPos + nLen": that sum can overflow sal_Int32 for large
+    // arguments and then wrongly pass the range check.
+    const sal_Int32 nTextLen = rText.getLength();
+    if (nStartPos < 0 || nLen < 0 || nStartPos > nTextLen || nLen > nTextLen - nStartPos)
+        throw lang::IllegalArgumentException();
+
+    // the guesser works on UTF-8 bytes
+    OString o( OUStringToOString( rText.subView(nStartPos, nLen), RTL_TEXTENCODING_UTF8 ) );
+    Guess g = m_aGuesser.GuessPrimaryLanguage(o.getStr());
+    lang::Locale aRes;
+    aRes.Language = OUString::createFromAscii( g.GetLanguage().c_str() );
+    aRes.Country = OUString::createFromAscii( g.GetCountry().c_str() );
+    return aRes;
+}
+
+#define DEFAULT_CONF_FILE_NAME "fpdb.conf"
+
+/**
+ * Points the guesser at a fingerprint directory.
+ *
+ * @param filePath directory holding the fingerprints; DEFAULT_CONF_FILE_NAME is
+ *        appended as-is, so the caller supplies the trailing path separator
+ */
+void LangGuess_Impl::SetFingerPrintsDB(
+        std::u16string_view filePath )
+{
+    //! text encoding for file name / path needs to be in the same encoding the OS uses
+    const OString aDirectory = OUStringToOString( filePath, osl_getThreadTextEncoding() );
+    const OString aConfFile = aDirectory + DEFAULT_CONF_FILE_NAME;
+
+    // first argument: the conf file, second: the prefix the fingerprints live under
+    m_aGuesser.SetDBPath( aConfFile.getStr(), aDirectory.getStr() );
+}
+
+/**
+ * Lists every language the guesser manages, enabled or not.
+ *
+ * @return one Locale (Language and Country filled in) per managed fingerprint
+ */
+uno::Sequence< Locale > SAL_CALL LangGuess_Impl::getAvailableLanguages( )
+{
+    std::scoped_lock aGuard( GetLangGuessMutex() );
+
+    EnsureInitialized();
+
+    const std::vector<Guess> aGuesses = m_aGuesser.GetAllManagedLanguages();
+
+    Sequence< css::lang::Locale > aLocales;
+    aLocales.realloc(aGuesses.size());
+
+    // the freshly reallocated elements are default locales; fill them in order
+    css::lang::Locale *pOut = aLocales.getArray();
+    for (const Guess& rGuess : aGuesses)
+    {
+        pOut->Language = OUString::createFromAscii( rGuess.GetLanguage().c_str() );
+        pOut->Country = OUString::createFromAscii( rGuess.GetCountry().c_str() );
+        ++pOut;
+    }
+
+    return aLocales;
+}
+
+/**
+ * Lists the languages that are currently enabled in the guesser.
+ *
+ * @return one Locale (Language and Country filled in) per enabled fingerprint
+ */
+uno::Sequence< Locale > SAL_CALL LangGuess_Impl::getEnabledLanguages( )
+{
+    std::scoped_lock aGuard( GetLangGuessMutex() );
+
+    EnsureInitialized();
+
+    const std::vector<Guess> aGuesses = m_aGuesser.GetAvailableLanguages();
+
+    Sequence< css::lang::Locale > aLocales;
+    aLocales.realloc(aGuesses.size());
+
+    // the freshly reallocated elements are default locales; fill them in order
+    css::lang::Locale *pOut = aLocales.getArray();
+    for (const Guess& rGuess : aGuesses)
+    {
+        pOut->Language = OUString::createFromAscii( rGuess.GetLanguage().c_str() );
+        pOut->Country = OUString::createFromAscii( rGuess.GetCountry().c_str() );
+        ++pOut;
+    }
+
+    return aLocales;
+}
+
+/**
+ * Lists the languages that are currently disabled in the guesser.
+ *
+ * @return one Locale (Language and Country filled in) per disabled fingerprint
+ */
+uno::Sequence< Locale > SAL_CALL LangGuess_Impl::getDisabledLanguages( )
+{
+    std::scoped_lock aGuard( GetLangGuessMutex() );
+
+    EnsureInitialized();
+
+    const std::vector<Guess> aGuesses = m_aGuesser.GetUnavailableLanguages();
+
+    Sequence< css::lang::Locale > aLocales;
+    aLocales.realloc(aGuesses.size());
+
+    // the freshly reallocated elements are default locales; fill them in order
+    css::lang::Locale *pOut = aLocales.getArray();
+    for (const Guess& rGuess : aGuesses)
+    {
+        pOut->Language = OUString::createFromAscii( rGuess.GetLanguage().c_str() );
+        pOut->Country = OUString::createFromAscii( rGuess.GetCountry().c_str() );
+        ++pOut;
+    }
+
+    return aLocales;
+}
+
+/**
+ * Disables every locale in rLanguages in the guesser.
+ *
+ * @param rLanguages locales to disable; each is handed to the guesser as the
+ *        ASCII key "<Language>-<Country>"
+ */
+void SAL_CALL LangGuess_Impl::disableLanguages(
+        const uno::Sequence< Locale >& rLanguages )
+{
+    std::scoped_lock aGuard( GetLangGuessMutex() );
+
+    EnsureInitialized();
+
+    for (const Locale& rLocale : rLanguages)
+    {
+        const OString aLang = OUStringToOString( rLocale.Language, RTL_TEXTENCODING_ASCII_US );
+        const OString aCountry = OUStringToOString( rLocale.Country, RTL_TEXTENCODING_ASCII_US );
+
+        // key format: language, a dash, then the (possibly empty) country
+        std::string aKey( aLang.getStr() );
+        aKey += "-";
+        aKey += aCountry.getStr();
+
+        m_aGuesser.DisableLanguage( aKey );
+    }
+}
+
+/**
+ * Enables every locale in rLanguages in the guesser.
+ *
+ * @param rLanguages locales to enable; each is handed to the guesser as the
+ *        ASCII key "<Language>-<Country>"
+ */
+void SAL_CALL LangGuess_Impl::enableLanguages(
+        const uno::Sequence< Locale >& rLanguages )
+{
+    std::scoped_lock aGuard( GetLangGuessMutex() );
+
+    EnsureInitialized();
+
+    for (const Locale& rLocale : rLanguages)
+    {
+        const OString aLang = OUStringToOString( rLocale.Language, RTL_TEXTENCODING_ASCII_US );
+        const OString aCountry = OUStringToOString( rLocale.Country, RTL_TEXTENCODING_ASCII_US );
+
+        // key format: language, a dash, then the (possibly empty) country
+        std::string aKey( aLang.getStr() );
+        aKey += "-";
+        aKey += aCountry.getStr();
+
+        m_aGuesser.EnableLanguage( aKey );
+    }
+}
+
+// XServiceInfo: fixed implementation name; the same string is used as the
+// implementation name in guesslang.component.
+OUString SAL_CALL LangGuess_Impl::getImplementationName( )
+{
+ return "com.sun.star.lingu2.LanguageGuessing";
+}
+
+// XServiceInfo: delegates the lookup of ServiceName to the cppu helper.
+sal_Bool SAL_CALL LangGuess_Impl::supportsService( const OUString& ServiceName )
+{
+ return cppu::supportsService(this, ServiceName);
+}
+
+// XServiceInfo: the single service this implementation provides; the same
+// name is listed as the service in guesslang.component.
+Sequence<OUString> SAL_CALL LangGuess_Impl::getSupportedServiceNames( )
+{
+ return { "com.sun.star.linguistic2.LanguageGuessing" };
+}
+
+// Exported constructor function named in guesslang.component. Both arguments
+// are ignored; it returns a newly created LangGuess_Impl passed through
+// cppu::acquire.
+extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
+lingucomponent_LangGuess_get_implementation(
+ css::uno::XComponentContext* , css::uno::Sequence<css::uno::Any> const&)
+{
+ return cppu::acquire(new LangGuess_Impl());
+}
+
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lingucomponent/source/languageguessing/simpleguesser.cxx b/lingucomponent/source/languageguessing/simpleguesser.cxx
new file mode 100644
index 000000000..7210b1f45
--- /dev/null
+++ b/lingucomponent/source/languageguessing/simpleguesser.cxx
@@ -0,0 +1,221 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+ /**
+ *
+ *
+ *
+ *
+ * TODO
+ * - Add exception throwing when h == NULL
+ * - Not init h when implicit constructor is launched
+ */
+
+#include <string.h>
+
+#ifdef SYSTEM_LIBEXTTEXTCAT
+#include <libexttextcat/textcat.h>
+#include <libexttextcat/common.h>
+#include <libexttextcat/constants.h>
+#include <libexttextcat/fingerprint.h>
+#else
+#include <textcat.h>
+#include <common.h>
+#include <constants.h>
+#include <fingerprint.h>
+#endif
+
+#include <sal/types.h>
+
+#include<rtl/character.hxx>
+#include "simpleguesser.hxx"
+
+/**
+ * Compares the common-length prefix of two strings, ignoring ASCII case.
+ *
+ * Only the first min(s1.length(), s2.length()) characters are looked at, an
+ * embedded NUL in either string stops the comparison, and a '.' in either
+ * string matches any character at that position.
+ *
+ * @return 0 when the compared prefix matches, otherwise the difference of the
+ *         upper-cased characters at the first mismatch
+ */
+static int startsAsciiCaseInsensitive(const std::string &s1, const std::string &s2){
+    const size_t nCommon = s1.length() < s2.length() ? s1.length() : s2.length();
+
+    for (size_t n = 0; n < nCommon; ++n) {
+        const char c1 = s1[n];
+        const char c2 = s2[n];
+
+        if (c1 == '\0' || c2 == '\0')
+            break;
+
+        //. is a neutral character
+        if (c1 == '.' || c2 == '.')
+            continue;
+
+        const int nDiff = rtl::toAsciiUpperCase(static_cast<unsigned char>(c1))
+                          - rtl::toAsciiUpperCase(static_cast<unsigned char>(c2));
+        if (nDiff != 0)
+            return nDiff;
+    }
+    return 0;
+}
+
+namespace {
+
+/**
+ * Local copy of the handle structure from textcat.c, so that this file can
+ * look inside the opaque handle returned by the library.
+ * NOTE(review): the layout presumably has to stay identical to the library's
+ * own definition - confirm against the textcat version in use before editing.
+ */
+typedef struct textcat_t{
+
+ // per-language fingerprint handles; each is passed to fp_Name() in this file
+ void **fprint;
+ // one flag byte per fingerprint, tested against / overwritten with the enable masks
+ char *fprint_disable;
+ // number of entries iterated over in fprint / fprint_disable
+ uint4 size;
+ // not used in this file; presumably the allocated capacity - TODO confirm
+ uint4 maxsize;
+
+ // not used in this file; presumably the buffer holding the classify result - TODO confirm
+ char output[MAXOUTPUTSIZE];
+
+} textcat_t;
+// end of the structure copied from textcat.c
+
+}
+
+// Starts without a textcat handle; nothing is loaded until SetDBPath() is called.
+SimpleGuesser::SimpleGuesser()
+    : h(nullptr)
+{
+}
+
+/**
+ * Releases this object's own handle (if any) and then takes over the raw
+ * handle pointer of sg. Self-assignment is a no-op.
+ *
+ * NOTE(review): the handle is copied, not duplicated, so afterwards both
+ * objects hold the same pointer and both destructors call textcat_Done on it.
+ * That looks like a double release if both objects are destroyed - confirm
+ * that no caller actually assigns guessers.
+ */
+SimpleGuesser& SimpleGuesser::operator=(const SimpleGuesser& sg){
+ // Check for self-assignment!
+ if (this == &sg) // Same object?
+ return *this; // Yes, so skip assignment, and just return *this.
+
+ if(h){textcat_Done(h);}
+ h = sg.h;
+ return *this;
+}
+
+// Hands the textcat handle back to the library, if one was ever loaded.
+SimpleGuesser::~SimpleGuesser()
+{
+ if(h){textcat_Done(h);}
+}
+
+/*!
+    \fn SimpleGuesser::GuessLanguage(const char* text)
+
+    Classifies at most the first MAX_STRING_LENGTH_TO_ANALYSE bytes of text and
+    turns every bracketed entry of the textcat result into a Guess, in the
+    order textcat reported them.
+
+    \param text NUL-terminated text to analyse
+    \return the guesses; empty when no database is loaded, when text is null,
+            when textcat reports the text as too short, or when the result
+            contains no bracketed entry
+ */
+std::vector<Guess> SimpleGuesser::GuessLanguage(const char* text)
+{
+    std::vector<Guess> guesses;
+
+    if (!h || !text)
+        return guesses;
+
+    // Keep the length as size_t: squeezing strlen() into an int turns a very long input
+    // into a negative value, which would skip the clamp below and then be passed on as a
+    // huge size.
+    size_t len = strlen(text);
+
+    if (len > MAX_STRING_LENGTH_TO_ANALYSE)
+        len = MAX_STRING_LENGTH_TO_ANALYSE;
+
+    const char *guess_list = textcat_Classify(h, text, len);
+
+    if (strcmp(guess_list, TEXTCAT_RESULT_SHORT_STR) == 0)
+        return guesses;
+
+    // Each "[" starts one entry; Guess itself only reads up to the matching delimiter, so
+    // it is enough to hand it the position of every opening bracket.
+    for (const char *p = guess_list; *p != '\0'; ++p)
+    {
+        if (*p == GUESS_SEPARATOR_OPEN)
+            guesses.push_back(Guess(p));
+    }
+
+    return guesses;
+}
+
+// Returns the first guess GuessLanguage() produced for text, or a
+// default-constructed Guess when there is none.
+Guess SimpleGuesser::GuessPrimaryLanguage(const char* text)
+{
+    const std::vector<Guess> aCandidates = GuessLanguage(text);
+    if (aCandidates.empty())
+        return Guess();
+    return aCandidates.front();
+}
+/**
+ * Lists the managed languages whose state byte shares a bit with mask:
+ * mask = 0xF0 selects only the available (enabled) languages,
+ * mask = 0x0F selects only the unavailable (disabled) languages,
+ * mask = 0xFF selects both.
+ * Returns an empty list when no database is loaded.
+ */
+std::vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
+{
+    std::vector<Guess> aSelected;
+    if (h == nullptr)
+        return aSelected;
+
+    const textcat_t *pTables = static_cast<textcat_t*>(h);
+
+    for (size_t n = 0; n < pTables->size; ++n)
+    {
+        if (!(pTables->fprint_disable[n] & mask))
+            continue;
+
+        // Guess expects the opening bracket in front of the fingerprint name
+        const std::string aEntry = std::string("[") + fp_Name(pTables->fprint[n]);
+        aSelected.push_back(Guess(aEntry.c_str()));
+    }
+
+    return aSelected;
+}
+
+// Languages currently enabled: selects the entries matching mask 0xF0.
+std::vector<Guess> SimpleGuesser::GetAvailableLanguages()
+{
+    const char nEnabledMask = sal::static_int_cast< char >( 0xF0 );
+    return GetManagedLanguages( nEnabledMask );
+}
+
+// Languages currently disabled: selects the entries matching mask 0x0F.
+std::vector<Guess> SimpleGuesser::GetUnavailableLanguages()
+{
+    const char nDisabledMask = sal::static_int_cast< char >( 0x0F );
+    return GetManagedLanguages( nDisabledMask );
+}
+
+// Enabled and disabled languages alike: selects the entries matching mask 0xFF.
+std::vector<Guess> SimpleGuesser::GetAllManagedLanguages()
+{
+    const char nAnyMask = sal::static_int_cast< char >( 0xFF );
+    return GetManagedLanguages( nAnyMask );
+}
+
+/**
+ * Overwrites the state byte of every fingerprint whose name matches lang
+ * (prefix comparison via startsAsciiCaseInsensitive) with mask. Does nothing
+ * when no database is loaded.
+ */
+void SimpleGuesser::XableLanguage(const std::string& lang, char mask)
+{
+    if (h == nullptr)
+        return;
+
+    textcat_t *pTables = static_cast<textcat_t*>(h);
+
+    for (size_t n = 0; n < pTables->size; ++n)
+    {
+        const std::string aName(fp_Name(pTables->fprint[n]));
+        if (startsAsciiCaseInsensitive(aName, lang) != 0)
+            continue;
+
+        pTables->fprint_disable[n] = mask;
+    }
+}
+
+// Marks every fingerprint matching lang as enabled (state byte 0xF0).
+void SimpleGuesser::EnableLanguage(const std::string& lang)
+{
+    const char nEnabledMask = sal::static_int_cast< char >( 0xF0 );
+    XableLanguage(lang, nEnabledMask);
+}
+
+// Marks every fingerprint matching lang as disabled (state byte 0x0F).
+void SimpleGuesser::DisableLanguage(const std::string& lang)
+{
+    const char nDisabledMask = sal::static_int_cast< char >( 0x0F );
+    XableLanguage(lang, nDisabledMask);
+}
+
+/**
+ * Replaces the current fingerprint database: an existing handle is released
+ * first, then a new one is requested from special_textcat_Init.
+ *
+ * @param path   the conf file handed to special_textcat_Init
+ * @param prefix the prefix handed to special_textcat_Init
+ *
+ * The returned handle is stored unchecked; the other methods treat a null
+ * handle as "no database loaded".
+ */
+void SimpleGuesser::SetDBPath(const char* path, const char* prefix)
+{
+ if (h)
+ textcat_Done(h);
+ h = special_textcat_Init(path, prefix);
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lingucomponent/source/languageguessing/simpleguesser.hxx b/lingucomponent/source/languageguessing/simpleguesser.hxx
new file mode 100644
index 000000000..aec544285
--- /dev/null
+++ b/lingucomponent/source/languageguessing/simpleguesser.hxx
@@ -0,0 +1,108 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+#ifndef INCLUDED_LINGUCOMPONENT_SOURCE_LANGUAGEGUESSING_SIMPLEGUESSER_HXX
+#define INCLUDED_LINGUCOMPONENT_SOURCE_LANGUAGEGUESSING_SIMPLEGUESSER_HXX
+
+#include <string>
+#include <vector>
+#include "guess.hxx"
+
+#define MAX_STRING_LENGTH_TO_ANALYSE 200
+
+/**
+ * Thin wrapper around a textcat handle: loads a fingerprint database, guesses
+ * the language of a text and enables / disables individual languages.
+ *
+ * NOTE(review): no copy constructor is declared, so the implicit one copies
+ * the raw handle just like operator= does; confirm guessers are never copied.
+ */
+class SimpleGuesser final
+{
+public:
+ /** Creates a guesser with no fingerprint database loaded; call SetDBPath() before guessing. */
+ SimpleGuesser();
+
+ /**
+ * Releases the own handle and takes over the handle pointer of the other guesser
+ * @param SimpleGuesser& sg the other guesser
+ */
+ SimpleGuesser& operator=(const SimpleGuesser& sg);
+
+ /**
+ * Destroys the object and releases the handle, if any
+ */
+ ~SimpleGuesser();
+
+ /**
+ * Analyze a text and return the most probable languages of the text
+ * (only the first MAX_STRING_LENGTH_TO_ANALYSE bytes are looked at)
+ * @param char* text is the text to analyze
+ * @return the list of guesses; empty when no database is loaded
+ */
+ std::vector<Guess> GuessLanguage(const char* text);
+
+ /**
+ * Analyze a text and return the most probable language of the text
+ * @param char* text is the text to analyze
+ * @return the first guess, or a default Guess when there is none
+ */
+ Guess GuessPrimaryLanguage(const char* text);
+
+ /**
+ * List all available (enabled) languages, i.e. those that may show up in guesses
+ * @return the list of languages
+ */
+ std::vector<Guess> GetAvailableLanguages();
+
+ /**
+ * List all managed languages, whether enabled or disabled
+ * @return the list of languages
+ */
+ std::vector<Guess> GetAllManagedLanguages();
+
+ /**
+ * List all unavailable languages (disabled for any reason)
+ * @return the list of languages
+ */
+ std::vector<Guess> GetUnavailableLanguages();
+
+ /**
+ * Mark a language enabled
+ * @param string lang the language to enable (built like language-COUNTRY-encoding);
+ * it is matched as a case-insensitive prefix of the fingerprint names
+ */
+ void EnableLanguage(const std::string& lang);
+
+ /**
+ * Mark a language disabled
+ * @param string lang the language to disable (built like language-COUNTRY-encoding);
+ * it is matched as a case-insensitive prefix of the fingerprint names
+ */
+ void DisableLanguage(const std::string& lang);
+
+ /**
+ * Load a new DB of fingerprints, releasing the one loaded before
+ * @param const char* thePathOfConfFile path of the configuration file
+ * @param const char* prefix is the path of the directory which contains the fingerprint files
+ */
+ void SetDBPath(const char* thePathOfConfFile, const char* prefix);
+
+private:
+ // Opaque textcat handle holding the typical fingerprints (n-gram tables); null until SetDBPath() succeeds
+ void* h;
+
+ // Selects languages from the fingerprints DB; the mask says whether enabled, disabled or both are wanted
+ std::vector<Guess> GetManagedLanguages(const char mask);
+
+ // Counterpart of GetManagedLanguages: enables or disables a language, depending on the mask
+ void XableLanguage(const std::string& lang, char mask);
+};
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */