diff options
Diffstat (limited to 'i18npool/source/transliteration/transliteration_body.cxx')
-rw-r--r-- | i18npool/source/transliteration/transliteration_body.cxx | 296 |
1 files changed, 296 insertions, 0 deletions
diff --git a/i18npool/source/transliteration/transliteration_body.cxx b/i18npool/source/transliteration/transliteration_body.cxx new file mode 100644 index 000000000..f12b152dc --- /dev/null +++ b/i18npool/source/transliteration/transliteration_body.cxx @@ -0,0 +1,296 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ +// Silence spurious Werror=maybe-uninitialized in transliterateImpl emitted at least by GCC 11.2.0 +#if defined __GNUC__ && !defined __clang__ +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + +#include <rtl/ref.hxx> +#include <i18nutil/casefolding.hxx> +#include <i18nutil/unicode.hxx> +#include <com/sun/star/i18n/MultipleCharsOutputException.hpp> +#include <com/sun/star/i18n/TransliterationType.hpp> +#include <comphelper/processfactory.hxx> +#include <comphelper/sequence.hxx> +#include <o3tl/temporary.hxx> + +#include <characterclassificationImpl.hxx> + +#include <transliteration_body.hxx> +#include <memory> +#include <numeric> + +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::i18n; +using namespace ::com::sun::star::lang; + +namespace i18npool { + +Transliteration_body::Transliteration_body() +{ + nMappingType = MappingType::NONE; + transliterationName = "Transliteration_body"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body"; +} + +sal_Int16 SAL_CALL Transliteration_body::getType() +{ + return TransliterationType::ONE_TO_ONE; +} + +sal_Bool SAL_CALL Transliteration_body::equals( + const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/, + const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/) +{ + throw RuntimeException(); +} + +Sequence< OUString > SAL_CALL +Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 ) +{ + return { str1, str2 }; +} + +static MappingType lcl_getMappingTypeForToggleCase( MappingType nMappingType, sal_Unicode cChar ) +{ + MappingType nRes = nMappingType; + + // take care of TOGGLE_CASE transliteration: + // nMappingType should not be a combination of flags, thuse we decide now + // which one to use. + if (nMappingType == (MappingType::LowerToUpper | MappingType::UpperToLower)) + { + const sal_Int16 nType = unicode::getUnicodeType( cChar ); + if (nType & 0x02 /* lower case*/) + nRes = MappingType::LowerToUpper; + else + { + // should also work properly for non-upper characters like white spaces, numbers, ... + nRes = MappingType::UpperToLower; + } + } + + return nRes; +} + +OUString +Transliteration_body::transliterateImpl( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >* pOffset) +{ + const sal_Unicode *in = inStr.getStr() + startPos; + + // We could assume that most calls result in identical string lengths, + // thus using a preallocated OUStringBuffer could be an easy way + // to assemble the return string without too much hassle. However, + // for single characters the OUStringBuffer::append() method is quite + // expensive compared to a simple array operation, so it pays here + // to copy the final result instead. + + // Allocate the max possible buffer. Try to use stack instead of heap, + // which would have to be reallocated most times anyways. + constexpr sal_Int32 nLocalBuf = 2048; + sal_Unicode* out; + std::unique_ptr<sal_Unicode[]> pHeapBuf; + if (nCount <= nLocalBuf) + out = static_cast<sal_Unicode*>(alloca(nCount * NMAPPINGMAX * sizeof(sal_Unicode))); + else + { + pHeapBuf.reset(new sal_Unicode[ nCount * NMAPPINGMAX ]); + out = pHeapBuf.get(); + } + + sal_Int32 j = 0; + // Two different blocks to eliminate the if(useOffset) condition inside the loop. + // Yes, on massive use even such small things do count. + if ( pOffset ) + { + std::vector<sal_Int32> aVec; + aVec.reserve(std::max<sal_Int32>(nLocalBuf, nCount) * NMAPPINGMAX); + + for (sal_Int32 i = 0; i < nCount; i++) + { + // take care of TOGGLE_CASE transliteration: + MappingType nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); + + const i18nutil::Mapping &map = i18nutil::casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); + std::fill_n(std::back_inserter(aVec), map.nmap, i + startPos); + std::copy_n(map.map, map.nmap, out + j); + j += map.nmap; + } + + *pOffset = comphelper::containerToSequence(aVec); + } + else + { + for ( sal_Int32 i = 0; i < nCount; i++) + { + // take care of TOGGLE_CASE transliteration: + MappingType nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); + + const i18nutil::Mapping &map = i18nutil::casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); + std::copy_n(map.map, map.nmap, out + j); + j += map.nmap; + } + } + + return OUString(out, j); +} + +OUString SAL_CALL +Transliteration_body::transliterateChar2String( sal_Unicode inChar ) +{ + const i18nutil::Mapping &map = i18nutil::casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); + rtl_uString* pStr = rtl_uString_alloc(map.nmap); + sal_Unicode* out = pStr->buffer; + sal_Int32 i; + + for (i = 0; i < map.nmap; i++) + out[i] = map.map[i]; + out[i] = 0; + + return OUString( pStr, SAL_NO_ACQUIRE ); +} + +sal_Unicode SAL_CALL +Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) +{ + const i18nutil::Mapping &map = i18nutil::casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); + if (map.nmap > 1) + throw MultipleCharsOutputException(); + return map.map[0]; +} + +OUString +Transliteration_body::foldingImpl( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >* pOffset) +{ + return transliterateImpl(inStr, startPos, nCount, pOffset); +} + +Transliteration_casemapping::Transliteration_casemapping() +{ + nMappingType = MappingType::NONE; + transliterationName = "casemapping(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping"; +} + +Transliteration_u2l::Transliteration_u2l() +{ + nMappingType = MappingType::UpperToLower; + transliterationName = "upper_to_lower(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l"; +} + +Transliteration_l2u::Transliteration_l2u() +{ + nMappingType = MappingType::LowerToUpper; + transliterationName = "lower_to_upper(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u"; +} + +Transliteration_togglecase::Transliteration_togglecase() +{ + // usually nMappingType must NOT be a combination of different flags here, + // but we take care of that problem in Transliteration_body::transliterate above + // before that value is used. There we will decide which of both is to be used on + // a per character basis. + nMappingType = MappingType::LowerToUpper | MappingType::UpperToLower; + transliterationName = "toggle(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase"; +} + +Transliteration_titlecase::Transliteration_titlecase() +{ + nMappingType = MappingType::ToTitle; + transliterationName = "title(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase"; +} + +/// @throws RuntimeException +static OUString transliterate_titlecase_Impl( + std::u16string_view inStr, sal_Int32 startPos, sal_Int32 nCount, + const Locale &rLocale, + Sequence< sal_Int32 >* pOffset ) +{ + const OUString aText( inStr.substr( startPos, nCount ) ); + + OUString aRes; + if (!aText.isEmpty()) + { + Reference< XComponentContext > xContext = ::comphelper::getProcessComponentContext(); + rtl::Reference< CharacterClassificationImpl > xCharClassImpl( new CharacterClassificationImpl( xContext ) ); + + // because xCharClassImpl.toTitle does not handle ligatures or Beta but will raise + // an exception we need to handle the first chara manually... + + // we don't want to change surrogates by accident, thuse we use proper code point iteration + sal_uInt32 cFirstChar = aText.iterateCodePoints( &o3tl::temporary(sal_Int32(0)) ); + OUString aResolvedLigature( &cFirstChar, 1 ); + // toUpper can be used to properly resolve ligatures and characters like Beta + aResolvedLigature = xCharClassImpl->toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); + // since toTitle will leave all-uppercase text unchanged we first need to + // use toLower to bring possible 2nd and following chars in lowercase + aResolvedLigature = xCharClassImpl->toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); + sal_Int32 nResolvedLen = aResolvedLigature.getLength(); + + // now we can properly use toTitle to get the expected result for the resolved string. + // The rest of the text should just become lowercase. + aRes = xCharClassImpl->toTitle( aResolvedLigature, 0, nResolvedLen, rLocale ) + + xCharClassImpl->toLower( aText, 1, aText.getLength() - 1, rLocale ); + if (pOffset) + { + pOffset->realloc( aRes.getLength() ); + + auto [begin, end] = asNonConstRange(*pOffset); + sal_Int32* pOffsetInt = std::fill_n(begin, nResolvedLen, 0); + std::iota(pOffsetInt, end, 1); + } + } + return aRes; +} + +// this function expects to be called on a word-by-word basis, +// namely that startPos points to the first char of the word +OUString Transliteration_titlecase::transliterateImpl( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >* pOffset ) +{ + return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, pOffset ); +} + +Transliteration_sentencecase::Transliteration_sentencecase() +{ + nMappingType = MappingType::ToTitle; // though only to be applied to the first word... + transliterationName = "sentence(generic)"; + implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase"; +} + +// this function expects to be called on a sentence-by-sentence basis, +// namely that startPos points to the first word (NOT first char!) in the sentence +OUString Transliteration_sentencecase::transliterateImpl( + const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, + Sequence< sal_Int32 >* pOffset ) +{ + return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, pOffset ); +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |