summaryrefslogtreecommitdiffstats
path: root/lingucomponent/source/lingutil
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 05:54:39 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 05:54:39 +0000
commit267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree358c9467650e1d0a1d7227a21dac2e3d08b622b2 /lingucomponent/source/lingutil
parentInitial commit. (diff)
downloadlibreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz
libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip
Adding upstream version 4:24.2.0.upstream/4%24.2.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lingucomponent/source/lingutil')
-rw-r--r--lingucomponent/source/lingutil/lingutil.cxx314
-rw-r--r--lingucomponent/source/lingutil/lingutil.hxx55
2 files changed, 369 insertions, 0 deletions
diff --git a/lingucomponent/source/lingutil/lingutil.cxx b/lingucomponent/source/lingutil/lingutil.cxx
new file mode 100644
index 0000000000..c737698417
--- /dev/null
+++ b/lingucomponent/source/lingutil/lingutil.cxx
@@ -0,0 +1,314 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#if defined(_WIN32)
+#if !defined WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#endif
+
+#include <osl/thread.h>
+#include <osl/file.hxx>
+#include <osl/process.h>
+#include <tools/debug.hxx>
+#include <tools/urlobj.hxx>
+#include <i18nlangtag/languagetag.hxx>
+#include <i18nlangtag/mslangid.hxx>
+#include <unotools/bootstrap.hxx>
+#include <unotools/lingucfg.hxx>
+#include <unotools/pathoptions.hxx>
+#include <rtl/bootstrap.hxx>
+#include <rtl/ustring.hxx>
+#include <rtl/string.hxx>
+#include <rtl/tencinfo.h>
+#include <linguistic/misc.hxx>
+
+#include <set>
+#include <vector>
+#include <string.h>
+
+#include "lingutil.hxx"
+
+#include <sal/macros.h>
+
+using namespace ::com::sun::star;
+
+#if defined(_WIN32)
+OString Win_AddLongPathPrefix( const OString &rPathName )
+{
+    // Returns rPathName unchanged when it already starts with the long path
+    // prefix, otherwise returns the prefix followed by rPathName.
+    constexpr OString WIN32_LONG_PATH_PREFIX = "\\\\?\\"_ostr;
+    if (rPathName.match(WIN32_LONG_PATH_PREFIX))
+        return rPathName;
+    return WIN32_LONG_PATH_PREFIX + rPathName;
+}
+#endif //defined(_WIN32)
+
+#if defined SYSTEM_DICTS || defined IOS
+// Find old style dictionaries in the directory aSystemDir and append one
+// entry per not yet seen language to aRes.
+//
+// aSystemDir     directory to scan (handed to osl::Directory); nothing is
+//                done if it cannot be opened
+// aFormatName    format name stored in every entry created here
+// aSystemSuffix  a file is only considered if its URL ends with this
+// aSystemPrefix  a file is only considered if its name (the part after the
+//                last '/') starts with this; may be empty
+// aDicLangInUse  languages that already have a dictionary; a file whose
+//                language is already in the set is skipped, otherwise the
+//                language is added to the set
+// aRes           receives the entries for the dictionaries found
+static void GetOldStyleDicsInDir(
+    OUString const & aSystemDir, OUString const & aFormatName,
+    std::u16string_view aSystemSuffix, std::u16string_view aSystemPrefix,
+    std::set< OUString >& aDicLangInUse,
+    std::vector< SvtLinguConfigDictionaryEntry >& aRes )
+{
+    osl::Directory aSystemDicts(aSystemDir);
+    if (aSystemDicts.open() != osl::FileBase::E_None)
+        return;
+
+    // Work with signed lengths: OUString indices are sal_Int32, and mixing
+    // them with the unsigned string_view sizes hides negative results.
+    const sal_Int32 nPrefixLen = static_cast<sal_Int32>(aSystemPrefix.size());
+    const sal_Int32 nSuffixLen = static_cast<sal_Int32>(aSystemSuffix.size());
+
+    osl::DirectoryItem aItem;
+    osl::FileStatus aFileStatus(osl_FileStatus_Mask_FileURL);
+    while (aSystemDicts.getNextItem(aItem) == osl::FileBase::E_None)
+    {
+        // Without a fresh status aFileStatus would still describe the
+        // previous item, so skip entries whose status cannot be read.
+        if (aItem.getFileStatus(aFileStatus) != osl::FileBase::E_None)
+            continue;
+
+        const OUString sPath = aFileStatus.getFileURL();
+        if (!sPath.endsWith(aSystemSuffix))
+            continue;
+
+        const sal_Int32 nStartIndex = sPath.lastIndexOf('/') + 1;
+        if (!sPath.match(aSystemPrefix, nStartIndex))
+            continue;
+
+        // Length of the part between prefix and suffix. It is negative when
+        // prefix and suffix overlap in the file name (e.g. "th_v2.dat" with
+        // prefix "th_" and suffix "_v2.dat") and zero when nothing is left;
+        // neither can name a language, and a negative count must not reach
+        // OUString::copy.
+        const sal_Int32 nChunkLen
+            = sPath.getLength() - nStartIndex - nPrefixLen - nSuffixLen;
+        if (nChunkLen <= 0)
+            continue;
+
+        OUString sChunk = sPath.copy(nStartIndex + nPrefixLen, nChunkLen);
+
+        // We prefer (now) to use language tags.
+        // Avoid feeding in the older LANG_REGION scheme to the BCP47
+        // ctor as that triggers use of liblangtag and initializes its
+        // database which we do not want during startup. Convert
+        // instead.
+        sChunk = sChunk.replace( '_', '-');
+
+        // There's a known exception to the rule, the dreaded
+        // hu_HU_u8.dic of the myspell-hu package, see
+        // http://packages.debian.org/search?arch=any&searchon=contents&keywords=hu_HU_u8.dic
+        // This was ignored because unknown in the old implementation,
+        // truncate to the known locale and either insert because hu_HU
+        // wasn't encountered yet, or skip because it was. It doesn't
+        // really matter because the proper new-style hu_HU dictionary
+        // will take precedence anyway if installed with a Hungarian
+        // languagepack. Again, this is only to not pull in all
+        // liblangtag and stuff during startup, the result would be
+        // !isValidBcp47() and the dictionary ignored.
+        if (sChunk == "hu-HU-u8")
+            sChunk = "hu-HU";
+
+        LanguageTag aLangTag(sChunk, true);
+        if (!aLangTag.isValidBcp47())
+            continue;
+
+        // Thus we first get the language of the dictionary
+        const OUString& aLocaleName(aLangTag.getBcp47());
+
+        // Only the first dictionary found for a language is used.
+        if (!aDicLangInUse.insert(aLocaleName).second)
+            continue;
+
+        // add the dictionary to the resulting vector
+        SvtLinguConfigDictionaryEntry aDicEntry;
+        aDicEntry.aLocations = { sPath };
+        aDicEntry.aFormatName = aFormatName;
+        if (aLocaleName == u"ar")
+        {
+            // A plain "ar" dictionary is additionally registered for the
+            // regional Arabic locales listed here.
+            aDicEntry.aLocaleNames = {
+                aLocaleName,
+                u"ar-AE"_ustr, u"ar-BH"_ustr, u"ar-DJ"_ustr, u"ar-DZ"_ustr, u"ar-EG"_ustr,
+                u"ar-ER"_ustr, u"ar-IL"_ustr, u"ar-IQ"_ustr, u"ar-JO"_ustr, u"ar-KM"_ustr,
+                u"ar-KW"_ustr, u"ar-LB"_ustr, u"ar-LY"_ustr, u"ar-MA"_ustr, u"ar-MR"_ustr,
+                u"ar-OM"_ustr, u"ar-PS"_ustr, u"ar-QA"_ustr, u"ar-SA"_ustr, u"ar-SD"_ustr,
+                u"ar-SO"_ustr, u"ar-SY"_ustr, u"ar-TD"_ustr, u"ar-TN"_ustr, u"ar-YE"_ustr
+            };
+        }
+        else
+        {
+            aDicEntry.aLocaleNames = { aLocaleName };
+        }
+        aRes.push_back( aDicEntry );
+    }
+}
+#endif
+
+// Build the list of old style dictionaries (not installed as extensions)
+// of the given type.
+//
+// pDicType  "DICT" (spelling), "HYPH" (hyphenation) or "THES" (thesaurus).
+//           A null pointer or any other value yields an empty result.
+//
+// Dictionaries are only searched for when built with SYSTEM_DICTS or for
+// IOS; otherwise the result is always empty. Except on IOS the directories
+// listed in the DICPATH environment variable are scanned first, then the
+// system directory for the type. Only the first dictionary found for a
+// language is used, so DICPATH entries win over the system directory.
+std::vector< SvtLinguConfigDictionaryEntry > GetOldStyleDics( const char *pDicType )
+{
+    std::vector< SvtLinguConfigDictionaryEntry > aRes;
+
+    if (!pDicType)
+        return aRes;
+
+    OUString aFormatName;
+    OUString aDicExtension;
+#if defined SYSTEM_DICTS || defined IOS
+    OUString aSystemDir;
+    OUString aSystemPrefix;
+    OUString aSystemSuffix;
+#endif
+    if (strcmp( pDicType, "DICT" ) == 0)
+    {
+        aFormatName = "DICT_SPELL";
+        aDicExtension = ".dic";
+#ifdef SYSTEM_DICTS
+        aSystemDir = DICT_SYSTEM_DIR;
+        aSystemSuffix = aDicExtension;
+#elif defined IOS
+        aSystemDir = "$BRAND_BASE_DIR/share/spell";
+        rtl::Bootstrap::expandMacros(aSystemDir);
+        aSystemSuffix = ".dic";
+#endif
+    }
+    else if (strcmp( pDicType, "HYPH" ) == 0)
+    {
+        aFormatName = "DICT_HYPH";
+        aDicExtension = ".dic";
+#ifdef SYSTEM_DICTS
+        aSystemDir = HYPH_SYSTEM_DIR;
+        aSystemPrefix = "hyph_";
+        aSystemSuffix = aDicExtension;
+#endif
+    }
+    else if (strcmp( pDicType, "THES" ) == 0)
+    {
+        aFormatName = "DICT_THES";
+        aDicExtension = ".dat";
+#ifdef SYSTEM_DICTS
+        aSystemDir = THES_SYSTEM_DIR;
+        aSystemPrefix = "th_";
+        aSystemSuffix = "_v2.dat";
+#elif defined IOS
+        aSystemDir = "$BRAND_BASE_DIR/share/thes";
+        rtl::Bootstrap::expandMacros(aSystemDir);
+        aSystemPrefix = "th_";
+        aSystemSuffix = "_v2.dat";
+#endif
+    }
+
+    // unknown dictionary type
+    if (aFormatName.isEmpty() || aDicExtension.isEmpty())
+        return aRes;
+
+#if defined SYSTEM_DICTS || defined IOS
+    // set of languages to remember the language where it is already
+    // decided to make use of the dictionary.
+    std::set< OUString > aDicLangInUse;
+
+#ifndef IOS
+    // follow the hunspell tool's example and check DICPATH for preferred dictionaries
+    rtl_uString * pSearchPath = nullptr;
+    osl_getEnvironment(OUString("DICPATH").pData, &pSearchPath);
+
+    if (pSearchPath)
+    {
+        OUString aSearchPath(pSearchPath);
+        rtl_uString_release(pSearchPath);
+
+        // The working directory is the same for every DICPATH entry, so ask
+        // for it once. Each entry is made absolute against it; if it is not
+        // available no entry can be used and DICPATH is skipped as a whole.
+        OUString aCWD;
+        if (utl::Bootstrap::getProcessWorkingDir(aCWD))
+        {
+            sal_Int32 nIndex = 0;
+            do
+            {
+                // entries are separated by ':'; getToken advances nIndex and
+                // sets it to -1 after the last entry
+                const OUString aSystem( aSearchPath.getToken(0, ':', nIndex) );
+                OUString aRelative;
+                OUString aAbsolute;
+
+                if (osl::FileBase::getFileURLFromSystemPath(aSystem, aRelative)
+                    != osl::FileBase::E_None)
+                    continue;
+                if (osl::FileBase::getAbsoluteFileURL(aCWD, aRelative, aAbsolute)
+                    != osl::FileBase::E_None)
+                    continue;
+
+                // GetOldStyleDicsInDir will make sure the dictionary is the right
+                // type based on its prefix, that way hyphen, mythes and regular
+                // dictionaries can live in one directory
+                GetOldStyleDicsInDir(aAbsolute, aFormatName, aSystemSuffix,
+                                     aSystemPrefix, aDicLangInUse, aRes);
+            }
+            while (nIndex != -1);
+        }
+    }
+#endif
+
+    // load system directories last so that DICPATH prevails
+    GetOldStyleDicsInDir(aSystemDir, aFormatName, aSystemSuffix, aSystemPrefix,
+                         aDicLangInUse, aRes);
+#endif
+
+    return aRes;
+}
+
+// Append to rNewStyleDics every entry of rOldStyleDics whose first locale
+// name is not provided by any of the new style dictionaries.
+//
+// rNewStyleDics  new style dictionaries; old style entries for languages
+//                they do not cover are appended to it
+// rOldStyleDics  old style dictionaries to merge in
+//
+// The set of covered languages is collected once from rNewStyleDics as
+// passed in; it is not extended while old style entries are appended.
+void MergeNewStyleDicsAndOldStyleDics(
+    std::vector< SvtLinguConfigDictionaryEntry > &rNewStyleDics,
+    const std::vector< SvtLinguConfigDictionaryEntry > &rOldStyleDics )
+{
+    // get list of languages supported by new style dictionaries
+    std::set< OUString > aNewStyleLanguages;
+    for (auto const& newStyleDic : rNewStyleDics)
+    {
+        for (auto const& rLocaleName : newStyleDic.aLocaleNames)
+            aNewStyleLanguages.insert( rLocaleName );
+    }
+
+    // now check all old style dictionaries if they will add a not yet
+    // added language. If so add them to the resulting vector
+    for (auto const& oldStyleDic : rOldStyleDics)
+    {
+        if (oldStyleDic.aLocaleNames.getLength() <= 0)
+        {
+            OSL_FAIL( "old style dictionary with no language found!" );
+            continue;
+        }
+
+        // An old style entry may list several locales (GetOldStyleDicsInDir
+        // does so for "ar"); the first one decides whether it is merged.
+        const OUString& rFirstLocale = oldStyleDic.aLocaleNames[0];
+        if (linguistic::LinguIsUnspecified( rFirstLocale ))
+        {
+            OSL_FAIL( "old style dictionary with invalid language found!" );
+            continue;
+        }
+
+        // language not yet added?
+        if (aNewStyleLanguages.find( rFirstLocale ) == aNewStyleLanguages.end())
+            rNewStyleDics.push_back(oldStyleDic);
+    }
+}
+
+rtl_TextEncoding getTextEncodingFromCharset(const char* pCharset)
+{
+    // RTL_TEXTENCODING_DONTKNOW signals that no encoding could be determined.
+    if (!pCharset)
+        return RTL_TEXTENCODING_DONTKNOW;
+
+    // Try the MIME charset names first ...
+    const rtl_TextEncoding eFromMime = rtl_getTextEncodingFromMimeCharset(pCharset);
+    if (eFromMime != RTL_TEXTENCODING_DONTKNOW)
+        return eFromMime;
+
+    // ... then the Unix charset names ...
+    const rtl_TextEncoding eFromUnix = rtl_getTextEncodingFromUnixCharset(pCharset);
+    if (eFromUnix != RTL_TEXTENCODING_DONTKNOW)
+        return eFromUnix;
+
+    // ... and finally the one name handled here directly.
+    if (strcmp("ISCII-DEVANAGARI", pCharset) == 0)
+        return RTL_TEXTENCODING_ISCII_DEVANAGARI;
+
+    return RTL_TEXTENCODING_DONTKNOW;
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/lingucomponent/source/lingutil/lingutil.hxx b/lingucomponent/source/lingutil/lingutil.hxx
new file mode 100644
index 0000000000..687c414827
--- /dev/null
+++ b/lingucomponent/source/lingutil/lingutil.hxx
@@ -0,0 +1,55 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed
+ * with this work for additional information regarding copyright
+ * ownership. The ASF licenses this file to you under the Apache
+ * License, Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain a copy of
+ * the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#ifndef INCLUDED_LINGUCOMPONENT_SOURCE_LINGUTIL_LINGUTIL_HXX
+#define INCLUDED_LINGUCOMPONENT_SOURCE_LINGUTIL_LINGUTIL_HXX
+
+#include <rtl/string.hxx>
+
+#include <vector>
+
+#define OU2ENC(rtlOUString, rtlEncoding) \
+ OString((rtlOUString).getStr(), (rtlOUString).getLength(), \
+ rtlEncoding, RTL_UNICODETOTEXT_FLAGS_UNDEFINED_QUESTIONMARK) // builds an OString from rtlOUString in rtlEncoding; flag name suggests unmappable characters become '?' (confirm in rtl/textcvt.h)
+
+struct SvtLinguConfigDictionaryEntry;
+
+#if defined(_WIN32)
+
+// to be used to get a path name with the long path prefix
+// under Windows for Hunspell, Hyphen and MyThes libraries
+OString Win_AddLongPathPrefix( const OString &rPathName );
+#endif
+
+
+// temporary functions, to be removed when new style dictionaries
+// using configuration entries are fully implemented and provided
+std::vector< SvtLinguConfigDictionaryEntry > GetOldStyleDics( const char * pDicType );
+void MergeNewStyleDicsAndOldStyleDics( std::vector< SvtLinguConfigDictionaryEntry > &rNewStyleDics, const std::vector< SvtLinguConfigDictionaryEntry > &rOldStyleDics );
+
+//Find an encoding from a charset string, using
+//rtl_getTextEncodingFromMimeCharset and falling back to
+//rtl_getTextEncodingFromUnixCharset with the addition of
+//ISCII-DEVANAGARI. Returns RTL_TEXTENCODING_DONTKNOW if pCharset
+//is null or no encoding matches.
+rtl_TextEncoding getTextEncodingFromCharset(const char* pCharset);
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */