diff options
Diffstat (limited to 'i18npool/source/characterclassification')
4 files changed, 1683 insertions, 0 deletions
diff --git a/i18npool/source/characterclassification/cclass_unicode.cxx b/i18npool/source/characterclassification/cclass_unicode.cxx new file mode 100644 index 000000000..f07e9f812 --- /dev/null +++ b/i18npool/source/characterclassification/cclass_unicode.cxx @@ -0,0 +1,307 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . 
+ */ + +#include <cclass_unicode.hxx> +#include <com/sun/star/i18n/KCharacterType.hpp> +#include <com/sun/star/i18n/WordType.hpp> +#include <com/sun/star/lang/WrappedTargetRuntimeException.hpp> +#include <unicode/uchar.h> +#include <cppuhelper/exc_hlp.hxx> +#include <cppuhelper/supportsservice.hxx> +#include <breakiteratorImpl.hxx> +#include <transliteration_body.hxx> +#include <rtl/ref.hxx> +#include <utility> + +using namespace ::com::sun::star; +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::i18n; +using namespace ::com::sun::star::lang; + +namespace i18npool { + +// class cclass_Unicode +// ----------------------------------------------------; + +cclass_Unicode::cclass_Unicode( uno::Reference < XComponentContext > xContext ) : + transToUpper( new Transliteration_casemapping() ), + transToLower( new Transliteration_casemapping() ), + transToTitle( new Transliteration_casemapping() ), + m_xContext(std::move( xContext )), + nStartTypes( 0 ), + nContTypes( 0 ), + cGroupSep( ',' ), + cDecimalSep( '.' 
), + cDecimalSepAlt( 0 ) +{ + transToUpper->setMappingType(MappingType::ToUpper); + transToLower->setMappingType(MappingType::ToLower); + transToTitle->setMappingType(MappingType::ToTitle); +} + +cclass_Unicode::~cclass_Unicode() { + destroyParserTable(); +} + + +OUString SAL_CALL +cclass_Unicode::toUpper( const OUString& Text, sal_Int32 nPos, sal_Int32 nCount, const Locale& rLocale ) { + sal_Int32 len = Text.getLength(); + if (nPos >= len) + return OUString(); + if (nCount + nPos > len) + nCount = len - nPos; + + transToUpper->setLocale(rLocale); + return transToUpper->transliterateString2String(Text, nPos, nCount); +} + +OUString SAL_CALL +cclass_Unicode::toLower( const OUString& Text, sal_Int32 nPos, sal_Int32 nCount, const Locale& rLocale ) { + sal_Int32 len = Text.getLength(); + if (nPos >= len) + return OUString(); + if (nCount + nPos > len) + nCount = len - nPos; + + transToLower->setLocale(rLocale); + return transToLower->transliterateString2String(Text, nPos, nCount); +} + +OUString SAL_CALL +cclass_Unicode::toTitle( const OUString& Text, sal_Int32 nPos, sal_Int32 nCount, const Locale& rLocale ) { + try + { + sal_Int32 len = Text.getLength(); + if (nPos >= len) + return OUString(); + if (nCount + nPos > len) + nCount = len - nPos; + + transToTitle->setLocale(rLocale); + rtl_uString* pStr = rtl_uString_alloc(nCount); + sal_Unicode* out = pStr->buffer; + rtl::Reference< BreakIteratorImpl > xBrk(new BreakIteratorImpl(m_xContext)); + Boundary bdy = xBrk->getWordBoundary(Text, nPos, rLocale, + WordType::ANYWORD_IGNOREWHITESPACES, true); + for (sal_Int32 i = nPos; i < nCount + nPos; i++, out++) { + if (i >= bdy.endPos) + bdy = xBrk->nextWord(Text, bdy.endPos, rLocale, + WordType::ANYWORD_IGNOREWHITESPACES); + *out = (i == bdy.startPos) ? 
+ transToTitle->transliterateChar2Char(Text[i]) : Text[i]; + } + *out = 0; + return OUString( pStr, SAL_NO_ACQUIRE ); + } + catch (const RuntimeException&) + { + throw; + } + catch (const Exception& e) + { + uno::Any a(cppu::getCaughtException()); + throw lang::WrappedTargetRuntimeException( + "wrapped " + a.getValueTypeName() + ": " + e.Message, + uno::Reference<uno::XInterface>(), a); + } +} + +sal_Int16 SAL_CALL +cclass_Unicode::getType( const OUString& Text, sal_Int32 nPos ) { + if ( nPos < 0 || Text.getLength() <= nPos ) return 0; + return static_cast<sal_Int16>(u_charType(Text.iterateCodePoints(&nPos, 0))); +} + +sal_Int16 SAL_CALL +cclass_Unicode::getCharacterDirection( const OUString& Text, sal_Int32 nPos ) { + if ( nPos < 0 || Text.getLength() <= nPos ) return 0; + return static_cast<sal_Int16>(u_charDirection(Text.iterateCodePoints(&nPos, 0))); +} + + +sal_Int16 SAL_CALL +cclass_Unicode::getScript( const OUString& Text, sal_Int32 nPos ) { + if ( nPos < 0 || Text.getLength() <= nPos ) return 0; + // ICU Unicode script type UBlockCode starts from 1 for Basic Latin, + // while OO.o enum UnicideScript starts from 0. + // To map ICU UBlockCode to OO.o UnicodeScript, it needs to shift 1. 
+ return static_cast<sal_Int16>(ublock_getCode(Text.iterateCodePoints(&nPos, 0)))-1; +} + + +sal_Int32 +cclass_Unicode::getCharType( const OUString& Text, sal_Int32* nPos, sal_Int32 increment) { + using namespace ::com::sun::star::i18n::KCharacterType; + + sal_uInt32 ch = Text.iterateCodePoints(nPos, increment); + switch ( u_charType(ch) ) { + // Upper + case U_UPPERCASE_LETTER : + return UPPER|LETTER|PRINTABLE|BASE_FORM; + + // Lower + case U_LOWERCASE_LETTER : + return LOWER|LETTER|PRINTABLE|BASE_FORM; + + // Title + case U_TITLECASE_LETTER : + return TITLE_CASE|LETTER|PRINTABLE|BASE_FORM; + + // Letter + case U_MODIFIER_LETTER : + case U_OTHER_LETTER : + return LETTER|PRINTABLE|BASE_FORM; + + // Digit + case U_DECIMAL_DIGIT_NUMBER: + case U_LETTER_NUMBER: + case U_OTHER_NUMBER: + return DIGIT|PRINTABLE|BASE_FORM; + + // Base + case U_NON_SPACING_MARK: + case U_ENCLOSING_MARK: + case U_COMBINING_SPACING_MARK: + return BASE_FORM|PRINTABLE; + + // Print + case U_SPACE_SEPARATOR: + + case U_DASH_PUNCTUATION: + case U_INITIAL_PUNCTUATION: + case U_FINAL_PUNCTUATION: + case U_CONNECTOR_PUNCTUATION: + case U_OTHER_PUNCTUATION: + + case U_MATH_SYMBOL: + case U_CURRENCY_SYMBOL: + case U_MODIFIER_SYMBOL: + case U_OTHER_SYMBOL: + return PRINTABLE; + + // Control + case U_CONTROL_CHAR: + case U_FORMAT_CHAR: + return CONTROL; + + case U_LINE_SEPARATOR: + case U_PARAGRAPH_SEPARATOR: + return CONTROL|PRINTABLE; + + // for all others + default: + return U_GENERAL_OTHER_TYPES; + } +} + +sal_Int32 SAL_CALL +cclass_Unicode::getCharacterType( const OUString& Text, sal_Int32 nPos, const Locale& /*rLocale*/ ) { + if ( nPos < 0 || Text.getLength() <= nPos ) return 0; + return getCharType(Text, &nPos, 0); + +} + +sal_Int32 SAL_CALL +cclass_Unicode::getStringType( const OUString& Text, sal_Int32 nPos, sal_Int32 nCount, const Locale& /*rLocale*/ ) { + if ( nPos < 0 || Text.getLength() <= nPos ) return 0; + + sal_Int32 result = 0; + + while (nCount > 0 && nPos < Text.getLength()) + { + 
sal_Int32 nOrigPos = nPos; + result |= getCharType(Text, &nPos, 1); + sal_Int32 nUtf16Units = nPos - nOrigPos; + nCount -= nUtf16Units; + } + + return result; +} + +ParseResult SAL_CALL cclass_Unicode::parseAnyToken( + const OUString& Text, + sal_Int32 nPos, + const Locale& rLocale, + sal_Int32 startCharTokenType, + const OUString& userDefinedCharactersStart, + sal_Int32 contCharTokenType, + const OUString& userDefinedCharactersCont ) +{ + ParseResult r; + if ( Text.getLength() <= nPos ) + return r; + + setupParserTable( rLocale, + startCharTokenType, userDefinedCharactersStart, + contCharTokenType, userDefinedCharactersCont ); + parseText( r, Text, nPos ); + + return r; +} + + +ParseResult SAL_CALL cclass_Unicode::parsePredefinedToken( + sal_Int32 nTokenType, + const OUString& Text, + sal_Int32 nPos, + const Locale& rLocale, + sal_Int32 startCharTokenType, + const OUString& userDefinedCharactersStart, + sal_Int32 contCharTokenType, + const OUString& userDefinedCharactersCont ) +{ + ParseResult r; + if ( Text.getLength() <= nPos ) + return r; + + setupParserTable( rLocale, + startCharTokenType, userDefinedCharactersStart, + contCharTokenType, userDefinedCharactersCont ); + parseText( r, Text, nPos, nTokenType ); + + return r; +} + +OUString SAL_CALL cclass_Unicode::getImplementationName() +{ + return "com.sun.star.i18n.CharacterClassification_Unicode"; +} + +sal_Bool SAL_CALL cclass_Unicode::supportsService(const OUString& rServiceName) +{ + return cppu::supportsService(this, rServiceName); +} + +Sequence< OUString > SAL_CALL cclass_Unicode::getSupportedServiceNames() +{ + return { "com.sun.star.i18n.CharacterClassification_Unicode" }; +} + +} + +extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * +com_sun_star_i18n_CharacterClassification_Unicode_get_implementation( + css::uno::XComponentContext *context, + css::uno::Sequence<css::uno::Any> const &) +{ + return cppu::acquire(new i18npool::cclass_Unicode(context)); +} + +/* vim:set shiftwidth=4 softtabstop=4 
expandtab: */ diff --git a/i18npool/source/characterclassification/cclass_unicode_parser.cxx b/i18npool/source/characterclassification/cclass_unicode_parser.cxx new file mode 100644 index 000000000..313e42a0f --- /dev/null +++ b/i18npool/source/characterclassification/cclass_unicode_parser.cxx @@ -0,0 +1,1066 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . 
 */


#include <cclass_unicode.hxx>
#include <unicode/uchar.h>
#include <rtl/character.hxx>
#include <rtl/math.hxx>
#include <rtl/ustring.hxx>
#include <com/sun/star/i18n/KParseTokens.hpp>
#include <com/sun/star/i18n/KParseType.hpp>
#include <com/sun/star/i18n/LocaleData2.hpp>
#include <com/sun/star/i18n/NativeNumberMode.hpp>
#include <com/sun/star/i18n/NativeNumberSupplier.hpp>

#include <string.h>
#include <string_view>

using namespace ::com::sun::star::uno;
using namespace ::com::sun::star::i18n;
using namespace ::com::sun::star::lang;

// Flag set shared by the ASCII digits in the default table and by the
// Unicode number categories handled in getFlagsExtended().
#define TOKEN_DIGIT_FLAGS (ParserFlags::CHAR_VALUE | ParserFlags::VALUE | ParserFlags::VALUE_EXP | ParserFlags::VALUE_EXP_VALUE | ParserFlags::VALUE_DIGIT)

namespace i18npool {

// Default identifier/name specification is [A-Za-z_][A-Za-z0-9_]*

// Number of entries in the two per-character tables below; code points
// below this value are looked up directly by index.
const sal_uInt8 cclass_Unicode::nDefCnt = 128;
// Default parser flags per character, indexed by code point. This table is
// copied into the working table pTable by initParserTable() and then
// adjusted there; one row per code point, so row order must not change.
const ParserFlags cclass_Unicode::pDefaultParserTable[ nDefCnt ] =
{
// (...) == Calc formula compiler specific, commented out and modified

    /* \0 */    ParserFlags::EXCLUDED,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
    /*  9 \t */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,     // (ParserFlags::ILLEGAL)
                ParserFlags::ILLEGAL,
    /* 11 \v */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,     // (ParserFlags::ILLEGAL)
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
                ParserFlags::ILLEGAL,
    /*  32   */ ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  33 ! */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  34 " */ ParserFlags::CHAR_STRING | ParserFlags::STRING_SEP,
    /*  35 # */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::WORD_SEP)
    /*  36 $ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
    /*  37 % */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::VALUE)
    /*  38 & */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  39 ' */ ParserFlags::NAME_SEP,
    /*  40 ( */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  41 ) */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  42 * */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  43 + */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP | ParserFlags::VALUE_EXP | ParserFlags::VALUE_SIGN,
    /*  44 , */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
    /*  45 - */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP | ParserFlags::VALUE_EXP | ParserFlags::VALUE_SIGN,
    /*  46 . */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::WORD | ParserFlags::CHAR_VALUE | ParserFlags::VALUE)
    /*  47 / */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    //for ( i = 48; i < 58; i++ )
    /*  48 0 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  49 1 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  50 2 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  51 3 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  52 4 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  53 5 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  54 6 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  55 7 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  56 8 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  57 9 */ TOKEN_DIGIT_FLAGS | ParserFlags::WORD,
    /*  58 : */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::WORD)
    /*  59 ; */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  60 < */ ParserFlags::CHAR_BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  61 = */ ParserFlags::CHAR | ParserFlags::BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  62 > */ ParserFlags::CHAR_BOOL | ParserFlags::BOOL | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  63 ? */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::CHAR_WORD | ParserFlags::WORD)
    /*  64 @ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    //for ( i = 65; i < 91; i++ )
    /*  65 A */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  66 B */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  67 C */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  68 D */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  69 E */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  70 F */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  71 G */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  72 H */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  73 I */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  74 J */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  75 K */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  76 L */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  77 M */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  78 N */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  79 O */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  80 P */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  81 Q */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  82 R */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  83 S */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  84 T */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  85 U */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  86 V */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  87 W */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  88 X */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  89 Y */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  90 Z */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  91 [ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    /*  92 \ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    /*  93 ] */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    /*  94 ^ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,
    /*  95 _ */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  96 ` */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    //for ( i = 97; i < 123; i++ )
    /*  97 a */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  98 b */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /*  99 c */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 100 d */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 101 e */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 102 f */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 103 g */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 104 h */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 105 i */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 106 j */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 107 k */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 108 l */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 109 m */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 110 n */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 111 o */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 112 p */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 113 q */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 114 r */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 115 s */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 116 t */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 117 u */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 118 v */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 119 w */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 120 x */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 121 y */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 122 z */ ParserFlags::CHAR_WORD | ParserFlags::WORD,
    /* 123 { */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    /* 124 | */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    /* 125 } */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    /* 126 ~ */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP,  // (ParserFlags::ILLEGAL // UNUSED)
    /* 127   */ ParserFlags::CHAR | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP   // (ParserFlags::ILLEGAL // UNUSED)
};


// KParseTokens::ASC_* classification per character, indexed by code point;
// getParseTokensType() returns these entries for code points below nDefCnt.
const sal_Int32 cclass_Unicode::pParseTokensType[ nDefCnt ] =
{
    /* \0 */    KParseTokens::ASC_OTHER,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
    /*  9 \t */ KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
    /* 11 \v */ KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
                KParseTokens::ASC_CONTROL,
    /*  32   */ KParseTokens::ASC_OTHER,
    /*  33 ! */ KParseTokens::ASC_OTHER,
    /*  34 " */ KParseTokens::ASC_OTHER,
    /*  35 # */ KParseTokens::ASC_OTHER,
    /*  36 $ */ KParseTokens::ASC_DOLLAR,
    /*  37 % */ KParseTokens::ASC_OTHER,
    /*  38 & */ KParseTokens::ASC_OTHER,
    /*  39 ' */ KParseTokens::ASC_OTHER,
    /*  40 ( */ KParseTokens::ASC_OTHER,
    /*  41 ) */ KParseTokens::ASC_OTHER,
    /*  42 * */ KParseTokens::ASC_OTHER,
    /*  43 + */ KParseTokens::ASC_OTHER,
    /*  44 , */ KParseTokens::ASC_OTHER,
    /*  45 - */ KParseTokens::ASC_OTHER,
    /*  46 . */ KParseTokens::ASC_DOT,
    /*  47 / */ KParseTokens::ASC_OTHER,
    //for ( i = 48; i < 58; i++ )
    /*  48 0 */ KParseTokens::ASC_DIGIT,
    /*  49 1 */ KParseTokens::ASC_DIGIT,
    /*  50 2 */ KParseTokens::ASC_DIGIT,
    /*  51 3 */ KParseTokens::ASC_DIGIT,
    /*  52 4 */ KParseTokens::ASC_DIGIT,
    /*  53 5 */ KParseTokens::ASC_DIGIT,
    /*  54 6 */ KParseTokens::ASC_DIGIT,
    /*  55 7 */ KParseTokens::ASC_DIGIT,
    /*  56 8 */ KParseTokens::ASC_DIGIT,
    /*  57 9 */ KParseTokens::ASC_DIGIT,
    /*  58 : */ KParseTokens::ASC_COLON,
    /*  59 ; */ KParseTokens::ASC_OTHER,
    /*  60 < */ KParseTokens::ASC_OTHER,
    /*  61 = */ KParseTokens::ASC_OTHER,
    /*  62 > */ KParseTokens::ASC_OTHER,
    /*  63 ? */ KParseTokens::ASC_OTHER,
    /*  64 @ */ KParseTokens::ASC_OTHER,
    //for ( i = 65; i < 91; i++ )
    /*  65 A */ KParseTokens::ASC_UPALPHA,
    /*  66 B */ KParseTokens::ASC_UPALPHA,
    /*  67 C */ KParseTokens::ASC_UPALPHA,
    /*  68 D */ KParseTokens::ASC_UPALPHA,
    /*  69 E */ KParseTokens::ASC_UPALPHA,
    /*  70 F */ KParseTokens::ASC_UPALPHA,
    /*  71 G */ KParseTokens::ASC_UPALPHA,
    /*  72 H */ KParseTokens::ASC_UPALPHA,
    /*  73 I */ KParseTokens::ASC_UPALPHA,
    /*  74 J */ KParseTokens::ASC_UPALPHA,
    /*  75 K */ KParseTokens::ASC_UPALPHA,
    /*  76 L */ KParseTokens::ASC_UPALPHA,
    /*  77 M */ KParseTokens::ASC_UPALPHA,
    /*  78 N */ KParseTokens::ASC_UPALPHA,
    /*  79 O */ KParseTokens::ASC_UPALPHA,
    /*  80 P */ KParseTokens::ASC_UPALPHA,
    /*  81 Q */ KParseTokens::ASC_UPALPHA,
    /*  82 R */ KParseTokens::ASC_UPALPHA,
    /*  83 S */ KParseTokens::ASC_UPALPHA,
    /*  84 T */ KParseTokens::ASC_UPALPHA,
    /*  85 U */ KParseTokens::ASC_UPALPHA,
    /*  86 V */ KParseTokens::ASC_UPALPHA,
    /*  87 W */ KParseTokens::ASC_UPALPHA,
    /*  88 X */ KParseTokens::ASC_UPALPHA,
    /*  89 Y */ KParseTokens::ASC_UPALPHA,
    /*  90 Z */ KParseTokens::ASC_UPALPHA,
    /*  91 [ */ KParseTokens::ASC_OTHER,
    /*  92 \ */ KParseTokens::ASC_OTHER,
    /*  93 ] */ KParseTokens::ASC_OTHER,
    /*  94 ^ */ KParseTokens::ASC_OTHER,
    /*  95 _ */ KParseTokens::ASC_UNDERSCORE,
    /*  96 ` */ KParseTokens::ASC_OTHER,
    //for ( i = 97; i < 123; i++ )
    /*  97 a */ KParseTokens::ASC_LOALPHA,
    /*  98 b */ KParseTokens::ASC_LOALPHA,
    /*  99 c */ KParseTokens::ASC_LOALPHA,
    /* 100 d */ KParseTokens::ASC_LOALPHA,
    /* 101 e */ KParseTokens::ASC_LOALPHA,
    /* 102 f */ KParseTokens::ASC_LOALPHA,
    /* 103 g */ KParseTokens::ASC_LOALPHA,
    /* 104 h */ KParseTokens::ASC_LOALPHA,
    /* 105 i */ KParseTokens::ASC_LOALPHA,
    /* 106 j */ KParseTokens::ASC_LOALPHA,
    /* 107 k */ KParseTokens::ASC_LOALPHA,
    /* 108 l */ KParseTokens::ASC_LOALPHA,
    /* 109 m */ KParseTokens::ASC_LOALPHA,
    /* 110 n */ KParseTokens::ASC_LOALPHA,
    /* 111 o */
KParseTokens::ASC_LOALPHA, + /* 112 p */ KParseTokens::ASC_LOALPHA, + /* 113 q */ KParseTokens::ASC_LOALPHA, + /* 114 r */ KParseTokens::ASC_LOALPHA, + /* 115 s */ KParseTokens::ASC_LOALPHA, + /* 116 t */ KParseTokens::ASC_LOALPHA, + /* 117 u */ KParseTokens::ASC_LOALPHA, + /* 118 v */ KParseTokens::ASC_LOALPHA, + /* 119 w */ KParseTokens::ASC_LOALPHA, + /* 120 x */ KParseTokens::ASC_LOALPHA, + /* 121 y */ KParseTokens::ASC_LOALPHA, + /* 122 z */ KParseTokens::ASC_LOALPHA, + /* 123 { */ KParseTokens::ASC_OTHER, + /* 124 | */ KParseTokens::ASC_OTHER, + /* 125 } */ KParseTokens::ASC_OTHER, + /* 126 ~ */ KParseTokens::ASC_OTHER, + /* 127 */ KParseTokens::ASC_OTHER +}; + + +// static +const sal_Unicode* cclass_Unicode::StrChr( const sal_Unicode* pStr, sal_uInt32 c ) +{ + if ( !pStr ) + return nullptr; + sal_Unicode cs[2]; + auto const n = rtl::splitSurrogates(c, cs); + while ( *pStr ) + { + if ( *pStr == cs[0] && (n == 1 || pStr[1] == cs[1]) ) + return pStr; + pStr++; + } + return nullptr; +} + + +sal_Int32 cclass_Unicode::getParseTokensType(sal_uInt32 const c, bool const isFirst) +{ + if ( c < nDefCnt ) + return pParseTokensType[ sal_uInt8(c) ]; + else + { + + //! all KParseTokens::UNI_... must be matched + switch (u_charType(c)) + { + case U_UPPERCASE_LETTER : + return KParseTokens::UNI_UPALPHA; + case U_LOWERCASE_LETTER : + return KParseTokens::UNI_LOALPHA; + case U_TITLECASE_LETTER : + return KParseTokens::UNI_TITLE_ALPHA; + case U_MODIFIER_LETTER : + return KParseTokens::UNI_MODIFIER_LETTER; + case U_OTHER_LETTER : + // Non_Spacing_Mark could not be as leading character + if (isFirst) break; + [[fallthrough]]; // treat it as Other_Letter. 
+ case U_NON_SPACING_MARK : + return KParseTokens::UNI_OTHER_LETTER; + case U_DECIMAL_DIGIT_NUMBER : + return KParseTokens::UNI_DIGIT; + case U_LETTER_NUMBER : + return KParseTokens::UNI_LETTER_NUMBER; + case U_OTHER_NUMBER : + return KParseTokens::UNI_OTHER_NUMBER; + } + + return KParseTokens::UNI_OTHER; + } +} + +void cclass_Unicode::setupInternational( const Locale& rLocale ) +{ + bool bChanged = (aParserLocale.Language != rLocale.Language + || aParserLocale.Country != rLocale.Country + || aParserLocale.Variant != rLocale.Variant); + if ( bChanged ) + { + aParserLocale.Language = rLocale.Language; + aParserLocale.Country = rLocale.Country; + aParserLocale.Variant = rLocale.Variant; + } + if ( !mxLocaleData.is() ) + { + mxLocaleData.set( LocaleData2::create(m_xContext) ); + } +} + + +void cclass_Unicode::setupParserTable( const Locale& rLocale, sal_Int32 startCharTokenType, + const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType, + const OUString& userDefinedCharactersCont ) +{ + bool bIntlEqual = (rLocale.Language == aParserLocale.Language && + rLocale.Country == aParserLocale.Country && + rLocale.Variant == aParserLocale.Variant); + if ( !pTable || !bIntlEqual || + startCharTokenType != nStartTypes || + contCharTokenType != nContTypes || + userDefinedCharactersStart != aStartChars || + userDefinedCharactersCont != aContChars ) + initParserTable( rLocale, startCharTokenType, userDefinedCharactersStart, + contCharTokenType, userDefinedCharactersCont ); +} + + +void cclass_Unicode::initParserTable( const Locale& rLocale, sal_Int32 startCharTokenType, + const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType, + const OUString& userDefinedCharactersCont ) +{ + // (Re)Init + setupInternational( rLocale ); + // Memory of pTable is reused. 
+ if ( !pTable ) + pTable.reset(new ParserFlags[nDefCnt]); + memcpy( pTable.get(), pDefaultParserTable, sizeof(ParserFlags) * nDefCnt ); + // Start and cont tables only need reallocation if different length. + if ( pStart && userDefinedCharactersStart.getLength() != aStartChars.getLength() ) + { + pStart.reset(); + } + if ( pCont && userDefinedCharactersCont.getLength() != aContChars.getLength() ) + { + pCont.reset(); + } + nStartTypes = startCharTokenType; + nContTypes = contCharTokenType; + aStartChars = userDefinedCharactersStart; + aContChars = userDefinedCharactersCont; + + // specials + if( mxLocaleData.is() ) + { + LocaleDataItem2 aItem = + mxLocaleData->getLocaleItem2( aParserLocale ); +//!TODO: theoretically separators may be a string, adjustment would have to be +//! done here and in parsing and in ::rtl::math::stringToDouble() + cGroupSep = aItem.thousandSeparator[0]; + cDecimalSep = aItem.decimalSeparator[0]; + cDecimalSepAlt = aItem.decimalSeparatorAlternative.toChar(); + } + + if (nContTypes & KParseTokens::GROUP_SEPARATOR_IN_NUMBER) + { + if ( cGroupSep < nDefCnt ) + pTable[cGroupSep] |= ParserFlags::VALUE; + } + else + { + cGroupSep = 0; + } + if ( cDecimalSep < nDefCnt ) + pTable[cDecimalSep] |= ParserFlags::CHAR_VALUE | ParserFlags::VALUE; + if ( cDecimalSepAlt && cDecimalSepAlt < nDefCnt ) + pTable[cDecimalSepAlt] |= ParserFlags::CHAR_VALUE | ParserFlags::VALUE; + + // Modify characters according to KParseTokens definitions. 
+ { + using namespace KParseTokens; + sal_uInt8 i; + + if ( !(nStartTypes & ASC_UPALPHA) ) + for ( i = 65; i < 91; i++ ) + pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character + if ( !(nContTypes & ASC_UPALPHA) ) + for ( i = 65; i < 91; i++ ) + pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character + + if ( !(nStartTypes & ASC_LOALPHA) ) + for ( i = 97; i < 123; i++ ) + pTable[i] &= ~ParserFlags::CHAR_WORD; // not allowed as start character + if ( !(nContTypes & ASC_LOALPHA) ) + for ( i = 97; i < 123; i++ ) + pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character + + if ( nStartTypes & ASC_DIGIT ) + for ( i = 48; i < 58; i++ ) + pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character + if ( !(nContTypes & ASC_DIGIT) ) + for ( i = 48; i < 58; i++ ) + pTable[i] &= ~ParserFlags::WORD; // not allowed as cont character + + if ( !(nStartTypes & ASC_UNDERSCORE) ) + pTable[95] &= ~ParserFlags::CHAR_WORD; // not allowed as start character + if ( !(nContTypes & ASC_UNDERSCORE) ) + pTable[95] &= ~ParserFlags::WORD; // not allowed as cont character + + if ( nStartTypes & ASC_DOLLAR ) + pTable[36] |= ParserFlags::CHAR_WORD; // allowed as start character + if ( nContTypes & ASC_DOLLAR ) + pTable[36] |= ParserFlags::WORD; // allowed as cont character + + if ( nStartTypes & ASC_DOT ) + pTable[46] |= ParserFlags::CHAR_WORD; // allowed as start character + if ( nContTypes & ASC_DOT ) + pTable[46] |= ParserFlags::WORD; // allowed as cont character + + if ( nStartTypes & ASC_COLON ) + pTable[58] |= ParserFlags::CHAR_WORD; // allowed as start character + if ( nContTypes & ASC_COLON ) + pTable[58] |= ParserFlags::WORD; // allowed as cont character + + if ( nStartTypes & ASC_CONTROL ) + for ( i = 1; i < 32; i++ ) + pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character + if ( nContTypes & ASC_CONTROL ) + for ( i = 1; i < 32; i++ ) + pTable[i] |= ParserFlags::WORD; // allowed as cont character + + if ( nStartTypes & 
ASC_ANY_BUT_CONTROL ) + for ( i = 32; i < nDefCnt; i++ ) + pTable[i] |= ParserFlags::CHAR_WORD; // allowed as start character + if ( nContTypes & ASC_ANY_BUT_CONTROL ) + for ( i = 32; i < nDefCnt; i++ ) + pTable[i] |= ParserFlags::WORD; // allowed as cont character + + } + + // Merge in (positively override with) user defined characters. + // StartChars + sal_Int32 nLen = aStartChars.getLength(); + if ( nLen ) + { + if ( !pStart ) + pStart.reset(new ParserFlags[ nLen ]); + const sal_Unicode* p = aStartChars.getStr(); + for ( sal_Int32 j=0; j<nLen; j++, p++ ) + { + pStart[j] = ParserFlags::CHAR_WORD; + if ( *p < nDefCnt ) + pTable[*p] |= ParserFlags::CHAR_WORD; + } + } + // ContChars + nLen = aContChars.getLength(); + if ( nLen ) + { + if ( !pCont ) + pCont.reset(new ParserFlags[ nLen ]); + const sal_Unicode* p = aContChars.getStr(); + for ( sal_Int32 j=0; j<nLen; j++ ) + { + pCont[j] = ParserFlags::WORD; + if ( *p < nDefCnt ) + pTable[*p] |= ParserFlags::WORD; + } + } +} + + +void cclass_Unicode::destroyParserTable() +{ + pCont.reset(); + pStart.reset(); + pTable.reset(); +} + + +ParserFlags cclass_Unicode::getFlags(sal_uInt32 const c, const cclass_Unicode::ScanState eState) +{ + ParserFlags nMask; + if ( c < nDefCnt ) + nMask = pTable[ sal_uInt8(c) ]; + else + nMask = getFlagsExtended(c, eState); + switch ( eState ) + { + case ssGetChar : + case ssRewindFromValue : + case ssIgnoreLeadingInRewind : + case ssGetWordFirstChar : + if ( !(nMask & ParserFlags::CHAR_WORD) ) + { + nMask |= getStartCharsFlags( c ); + if ( nMask & ParserFlags::CHAR_WORD ) + nMask &= ~ParserFlags::EXCLUDED; + } + break; + case ssGetValue : + case ssGetWord : + if ( !(nMask & ParserFlags::WORD) ) + { + nMask |= getContCharsFlags( c ); + if ( nMask & ParserFlags::WORD ) + nMask &= ~ParserFlags::EXCLUDED; + } + break; + default: + ; // other cases aren't needed, no compiler warning + } + return nMask; +} + + +ParserFlags cclass_Unicode::getFlagsExtended(sal_uInt32 const c, const 
cclass_Unicode::ScanState eState) const +{ + if ( c == cGroupSep ) + return ParserFlags::VALUE; + else if ( c == cDecimalSep ) + return ParserFlags::CHAR_VALUE | ParserFlags::VALUE; + else if ( cDecimalSepAlt && c == cDecimalSepAlt ) + return ParserFlags::CHAR_VALUE | ParserFlags::VALUE; + bool bStart = (eState == ssGetChar || eState == ssGetWordFirstChar || + eState == ssRewindFromValue || eState == ssIgnoreLeadingInRewind); + sal_Int32 nTypes = (bStart ? nStartTypes : nContTypes); + + //! all KParseTokens::UNI_... must be matched + switch (u_charType(c)) + { + case U_UPPERCASE_LETTER : + return (nTypes & KParseTokens::UNI_UPALPHA) ? + (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) : + ParserFlags::ILLEGAL; + case U_LOWERCASE_LETTER : + return (nTypes & KParseTokens::UNI_LOALPHA) ? + (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) : + ParserFlags::ILLEGAL; + case U_TITLECASE_LETTER : + return (nTypes & KParseTokens::UNI_TITLE_ALPHA) ? + (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) : + ParserFlags::ILLEGAL; + case U_MODIFIER_LETTER : + return (nTypes & KParseTokens::UNI_MODIFIER_LETTER) ? + (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) : + ParserFlags::ILLEGAL; + case U_NON_SPACING_MARK : + case U_COMBINING_SPACING_MARK : + // Non_Spacing_Mark can't be a leading character, + // nor can a spacing combining mark. + if (bStart) + return ParserFlags::ILLEGAL; + [[fallthrough]]; // treat it as Other_Letter. + case U_OTHER_LETTER : + return (nTypes & KParseTokens::UNI_OTHER_LETTER) ? + (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) : + ParserFlags::ILLEGAL; + case U_DECIMAL_DIGIT_NUMBER : + return ((nTypes & KParseTokens::UNI_DIGIT) ? + (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) : + ParserFlags::ILLEGAL) | TOKEN_DIGIT_FLAGS; + case U_LETTER_NUMBER : + return ((nTypes & KParseTokens::UNI_LETTER_NUMBER) ? + (bStart ? 
ParserFlags::CHAR_WORD : ParserFlags::WORD) : + ParserFlags::ILLEGAL) | TOKEN_DIGIT_FLAGS; + case U_OTHER_NUMBER : + return ((nTypes & KParseTokens::UNI_OTHER_NUMBER) ? + (bStart ? ParserFlags::CHAR_WORD : ParserFlags::WORD) : + ParserFlags::ILLEGAL) | TOKEN_DIGIT_FLAGS; + case U_SPACE_SEPARATOR : + return ((nTypes & KParseTokens::IGNORE_LEADING_WS) ? + ParserFlags::CHAR_DONTCARE : (bStart ? ParserFlags::CHAR_WORD : (ParserFlags::CHAR_DONTCARE | ParserFlags::WORD_SEP | ParserFlags::VALUE_SEP) )); + case U_OTHER_PUNCTUATION: + // fdo#61754 Lets see (if we not at the start) if this is midletter + // punctuation and allow it in a word if it is similarly to + // U_NON_SPACING_MARK, for example U+00B7 MIDDLE DOT. + // tdf#123575 for U+30FB KATAKANA MIDDLE DOT property is not + // U_WB_MIDLETTER but U_WB_KATAKANA instead, explicitly test that + // and U+FF65 HALFWIDTH KATAKANA MIDDLE DOT. + if (bStart || (U_WB_MIDLETTER != u_getIntPropertyValue(c, UCHAR_WORD_BREAK) + && c != 0x30FB && c != 0xFF65)) + return ParserFlags::ILLEGAL; + else + { + //allowing it to continue the word + return (nTypes & KParseTokens::UNI_OTHER_LETTER) ? + ParserFlags::WORD : ParserFlags::ILLEGAL; + } + break; + } + + return ParserFlags::ILLEGAL; +} + + +ParserFlags cclass_Unicode::getStartCharsFlags( sal_uInt32 c ) +{ + if ( pStart ) + { + const sal_Unicode* pStr = aStartChars.getStr(); + const sal_Unicode* p = StrChr( pStr, c ); + if ( p ) + return pStart[ p - pStr ]; + } + return ParserFlags::ILLEGAL; +} + + +ParserFlags cclass_Unicode::getContCharsFlags( sal_Unicode c ) +{ + if ( pCont ) + { + const sal_Unicode* pStr = aContChars.getStr(); + const sal_Unicode* p = StrChr( pStr, c ); + if ( p ) + return pCont[ p - pStr ]; + } + return ParserFlags::ILLEGAL; +} + + +void cclass_Unicode::parseText( ParseResult& r, const OUString& rText, sal_Int32 nPos, sal_Int32 nTokenType ) +{ + assert(r.LeadingWhiteSpace == 0); + ScanState eState = ssGetChar; + + //! 
All the variables below (plus ParseResult) have to be reset on ssRewindFromValue! + OUStringBuffer aSymbol; + bool isFirst(true); + sal_Int32 index(nPos); // index of next code point after current + sal_Int32 postSymbolIndex(index); // index of code point following last quote + sal_uInt32 current((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0); + sal_uInt32 cLast = 0; + sal_Int32 nCodePoints(0); + int nDecSeps = 0; + bool bQuote = false; + bool bMightBeWord = true; + bool bMightBeWordLast = true; + bool bDecSepAltUsed = false; + //! All the variables above (plus ParseResult) have to be reset on ssRewindFromValue! + sal_Int32 nextCharIndex(nPos); // == index of nextChar + + while ((current != 0) && (eState != ssStop)) + { + ++nCodePoints; + ParserFlags nMask = getFlags(current, eState); + if ( nMask & ParserFlags::EXCLUDED ) + eState = ssBounce; + if ( bMightBeWord ) + { // only relevant for ssGetValue fall back + if ( eState == ssGetChar || eState == ssRewindFromValue || + eState == ssIgnoreLeadingInRewind ) + bMightBeWord = bool(nMask & ParserFlags::CHAR_WORD); + else + bMightBeWord = bool(nMask & ParserFlags::WORD); + } + sal_Int32 nParseTokensType = getParseTokensType(current, isFirst); + isFirst = false; + sal_Int32 const nextIndex(nextCharIndex); // == index of char following current + nextCharIndex = index; // == index of nextChar + sal_uInt32 nextChar((index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0); + switch (eState) + { + case ssGetChar : + case ssRewindFromValue : + case ssIgnoreLeadingInRewind : + { + if ( (nMask & ParserFlags::CHAR_VALUE) && eState != ssRewindFromValue + && eState != ssIgnoreLeadingInRewind ) + { //! 
must be first, may fall back to ssGetWord via bMightBeWord + eState = ssGetValue; + if ( nMask & ParserFlags::VALUE_DIGIT ) + { + if (128 <= current) + r.TokenType = KParseType::UNI_NUMBER; + else + r.TokenType = KParseType::ASC_NUMBER; + } + else if (current == cDecimalSep || (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt))) + { + if (nextChar) + ++nDecSeps; + else + eState = ssRewindFromValue; + // retry for ONE_SINGLE_CHAR or others + } + } + else if ( nMask & ParserFlags::CHAR_WORD ) + { + eState = ssGetWord; + r.TokenType = KParseType::IDENTNAME; + } + else if ( nMask & ParserFlags::NAME_SEP ) + { + eState = ssGetWordFirstChar; + bQuote = true; + postSymbolIndex = nextCharIndex; + nParseTokensType = 0; // will be taken of first real character + r.TokenType = KParseType::SINGLE_QUOTE_NAME; + } + else if ( nMask & ParserFlags::CHAR_STRING ) + { + eState = ssGetString; + postSymbolIndex = nextCharIndex; + nParseTokensType = 0; // will be taken of first real character + r.TokenType = KParseType::DOUBLE_QUOTE_STRING; + } + else if ( nMask & ParserFlags::CHAR_DONTCARE ) + { + if ( nStartTypes & KParseTokens::IGNORE_LEADING_WS ) + { + if (eState == ssRewindFromValue) + eState = ssIgnoreLeadingInRewind; + r.LeadingWhiteSpace = nextCharIndex - nPos; + nCodePoints--; // exclude leading whitespace + postSymbolIndex = nextCharIndex; + nParseTokensType = 0; // wait until real character + bMightBeWord = true; + } + else + eState = ssBounce; + } + else if ( nMask & ParserFlags::CHAR_BOOL ) + { + eState = ssGetBool; + r.TokenType = KParseType::BOOLEAN; + } + else if ( nMask & ParserFlags::CHAR ) + { //! 
must be last + eState = ssStop; + r.TokenType = KParseType::ONE_SINGLE_CHAR; + } + else + eState = ssBounce; // not known + } + break; + case ssGetValue : + { + if ( nMask & ParserFlags::VALUE_DIGIT ) + { + if (128 <= current) + r.TokenType = KParseType::UNI_NUMBER; + else if ( r.TokenType != KParseType::UNI_NUMBER ) + r.TokenType = KParseType::ASC_NUMBER; + } + if ( nMask & ParserFlags::VALUE ) + { + if (current == cGroupSep) + { + if (getFlags(nextChar, eState) & ParserFlags::VALUE_DIGIT) + nParseTokensType |= KParseTokens::GROUP_SEPARATOR_IN_NUMBER; + else + { + // Trailing group separator character is not a + // group separator. + eState = ssStopBack; + } + } + else if ((current == cDecimalSep || + (bDecSepAltUsed = (cDecimalSepAlt && current == cDecimalSepAlt))) && + ++nDecSeps > 1) + { + if (nCodePoints == 2) + eState = ssRewindFromValue; + // consecutive separators + else + eState = ssStopBack; + } + // else keep it going + } + else if (current == 'E' || current == 'e') + { + ParserFlags nNext = getFlags(nextChar, eState); + if ( nNext & ParserFlags::VALUE_EXP ) + ; // keep it going + else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar)) + { // might be a numerical name (1.2efg) + eState = ssGetWord; + r.TokenType = KParseType::IDENTNAME; + } + else + eState = ssStopBack; + } + else if ( nMask & ParserFlags::VALUE_SIGN ) + { + if ( (cLast == 'E') || (cLast == 'e') ) + { + ParserFlags nNext = getFlags(nextChar, eState); + if ( nNext & ParserFlags::VALUE_EXP_VALUE ) + ; // keep it going + else if (bMightBeWord && ((nNext & ParserFlags::WORD) || !nextChar)) + { // might be a numerical name (1.2e+fg) + eState = ssGetWord; + r.TokenType = KParseType::IDENTNAME; + } + else + eState = ssStopBack; + } + else if ( bMightBeWord ) + { // might be a numerical name (1.2+fg) + eState = ssGetWord; + r.TokenType = KParseType::IDENTNAME; + } + else + eState = ssStopBack; + } + else if ( bMightBeWord && (nMask & ParserFlags::WORD) ) + { // might be a numerical 
name (1995.A1) + eState = ssGetWord; + r.TokenType = KParseType::IDENTNAME; + } + else + eState = ssStopBack; + } + break; + case ssGetWordFirstChar : + eState = ssGetWord; + [[fallthrough]]; + case ssGetWord : + { + if ( nMask & ParserFlags::WORD ) + ; // keep it going + else if ( nMask & ParserFlags::NAME_SEP ) + { + if ( bQuote ) + { + if ( cLast == '\\' ) + { // escaped + aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2)); + aSymbol.append(OUString(¤t, 1)); + } + else + { + eState = ssStop; + aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1)); + } + postSymbolIndex = nextCharIndex; + } + else + eState = ssStopBack; + } + else if ( bQuote ) + ; // keep it going + else + eState = ssStopBack; + } + break; + case ssGetString : + { + if ( nMask & ParserFlags::STRING_SEP ) + { + if ( cLast == '\\' ) + { // escaped + aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 2)); + aSymbol.append(OUString(¤t, 1)); + } + else if (current == nextChar && + !(nContTypes & KParseTokens::TWO_DOUBLE_QUOTES_BREAK_STRING) ) + { // "" => literal " escaped + aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex)); + nextCharIndex = index; + if (index < rText.getLength()) { ++nCodePoints; } + nextChar = (index < rText.getLength()) ? 
rText.iterateCodePoints(&index) : 0; + } + else + { + eState = ssStop; + aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1)); + } + postSymbolIndex = nextCharIndex; + } + } + break; + case ssGetBool : + { + if ( nMask & ParserFlags::BOOL ) + eState = ssStop; // maximum 2: <, >, <>, <=, >= + else + eState = ssStopBack; + } + break; + case ssStopBack : + case ssBounce : + case ssStop : + ; // nothing, no compiler warning + break; + } + if ( eState == ssRewindFromValue ) + { + r = ParseResult(); + index = nPos; + postSymbolIndex = nPos; + nextCharIndex = nPos; + aSymbol.setLength(0); + current = (index < rText.getLength()) ? rText.iterateCodePoints(&index) : 0; + nCodePoints = (nPos < rText.getLength()) ? 1 : 0; + isFirst = true; + cLast = 0; + nDecSeps = 0; + bQuote = false; + bMightBeWord = true; + bMightBeWordLast = true; + bDecSepAltUsed = false; + } + else + { + if ( !(r.TokenType & nTokenType) ) + { + if ( (r.TokenType & (KParseType::ASC_NUMBER | KParseType::UNI_NUMBER)) + && (nTokenType & KParseType::IDENTNAME) && bMightBeWord ) + ; // keep a number that might be a word + else if (r.LeadingWhiteSpace == (nextCharIndex - nPos)) + ; // keep ignored white space + else if ( !r.TokenType && eState == ssGetValue && (nMask & ParserFlags::VALUE_SEP) ) + ; // keep uncertain value + else + eState = ssBounce; + } + if ( eState == ssBounce ) + { + r.TokenType = 0; + eState = ssStopBack; + } + if ( eState == ssStopBack ) + { // put back + nextChar = rText.iterateCodePoints(&index, -1); + nextCharIndex = nextIndex; + --nCodePoints; + bMightBeWord = bMightBeWordLast; + eState = ssStop; + } + if ( eState != ssStop ) + { + if ( !r.StartFlags ) + r.StartFlags |= nParseTokensType; + else + r.ContFlags |= nParseTokensType; + } + bMightBeWordLast = bMightBeWord; + cLast = current; + current = nextChar; + } + } + // r.CharLen is the length in characters (not code units) of the parsed + // token not including any leading white space. 
+ r.CharLen = nCodePoints; + r.EndPos = nextCharIndex; + if ( r.TokenType & KParseType::ASC_NUMBER ) + { + r.Value = rtl_math_uStringToDouble(rText.getStr() + nPos + r.LeadingWhiteSpace, + rText.getStr() + r.EndPos, (bDecSepAltUsed ? cDecimalSepAlt : cDecimalSep), cGroupSep, nullptr, nullptr); + if ( bMightBeWord ) + r.TokenType |= KParseType::IDENTNAME; + } + else if ( r.TokenType & KParseType::UNI_NUMBER ) + { + if ( !xNatNumSup.is() ) + { + if ( m_xContext.is() ) + { + xNatNumSup = NativeNumberSupplier::create( m_xContext ); + } + } + OUString aTmp(rText.getStr() + nPos + r.LeadingWhiteSpace, + r.EndPos - nPos - r.LeadingWhiteSpace); + // transliterate to ASCII + aTmp = xNatNumSup->getNativeNumberString( aTmp, aParserLocale, + NativeNumberMode::NATNUM0 ); + r.Value = ::rtl::math::stringToDouble( aTmp, cDecimalSep, cGroupSep ); + if ( bMightBeWord ) + r.TokenType |= KParseType::IDENTNAME; + } + else if ( r.TokenType & (KParseType::SINGLE_QUOTE_NAME | KParseType::DOUBLE_QUOTE_STRING) ) + { + if (postSymbolIndex < nextCharIndex) + { //! open quote + aSymbol.append(rText.subView(postSymbolIndex, nextCharIndex - postSymbolIndex - 1)); + r.TokenType |= KParseType::MISSING_QUOTE; + } + r.DequotedNameOrString = aSymbol.makeStringAndClear(); + } +} + +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/characterclassification/characterclassificationImpl.cxx b/i18npool/source/characterclassification/characterclassificationImpl.cxx new file mode 100644 index 000000000..aff424d8b --- /dev/null +++ b/i18npool/source/characterclassification/characterclassificationImpl.cxx @@ -0,0 +1,219 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <cppuhelper/supportsservice.hxx> +#include <characterclassificationImpl.hxx> +#include <localedata.hxx> + +#include <com/sun/star/uno/XComponentContext.hpp> + +using namespace com::sun::star::uno; +using namespace ::com::sun::star::i18n; +using namespace com::sun::star::lang; + +namespace i18npool { + +CharacterClassificationImpl::CharacterClassificationImpl( + const Reference < XComponentContext >& rxContext ) : m_xContext( rxContext ) +{ + static constexpr OUStringLiteral sUnicode = u"Unicode"; + if (createLocaleSpecificCharacterClassification(sUnicode, Locale())) + xUCI = cachedItem->xCI; +} + +CharacterClassificationImpl::~CharacterClassificationImpl() { +} + + +OUString SAL_CALL +CharacterClassificationImpl::toUpper( const OUString& Text, sal_Int32 nPos, + sal_Int32 nCount, const Locale& rLocale ) +{ + return getLocaleSpecificCharacterClassification(rLocale)->toUpper(Text, nPos, nCount, rLocale); +} + +OUString SAL_CALL +CharacterClassificationImpl::toLower( const OUString& Text, sal_Int32 nPos, + sal_Int32 nCount, const Locale& rLocale ) +{ + return getLocaleSpecificCharacterClassification(rLocale)->toLower(Text, nPos, nCount, rLocale); +} + +OUString SAL_CALL +CharacterClassificationImpl::toTitle( const OUString& Text, sal_Int32 nPos, + sal_Int32 nCount, const Locale& rLocale ) +{ + return getLocaleSpecificCharacterClassification(rLocale)->toTitle(Text, nPos, nCount, rLocale); +} + +sal_Int16 
SAL_CALL +CharacterClassificationImpl::getType( const OUString& Text, sal_Int32 nPos ) +{ + if (xUCI.is()) + return xUCI->getType(Text, nPos); + throw RuntimeException(); +} + +sal_Int16 SAL_CALL +CharacterClassificationImpl::getCharacterDirection( const OUString& Text, sal_Int32 nPos ) +{ + if (xUCI.is()) + return xUCI->getCharacterDirection(Text, nPos); + throw RuntimeException(); +} + +sal_Int16 SAL_CALL +CharacterClassificationImpl::getScript( const OUString& Text, sal_Int32 nPos ) +{ + if (xUCI.is()) + return xUCI->getScript(Text, nPos); + throw RuntimeException(); +} + +sal_Int32 SAL_CALL +CharacterClassificationImpl::getCharacterType( const OUString& Text, sal_Int32 nPos, + const Locale& rLocale ) +{ + return getLocaleSpecificCharacterClassification(rLocale)->getCharacterType(Text, nPos, rLocale); +} + +sal_Int32 SAL_CALL +CharacterClassificationImpl::getStringType( const OUString& Text, sal_Int32 nPos, + sal_Int32 nCount, const Locale& rLocale ) +{ + return getLocaleSpecificCharacterClassification(rLocale)->getStringType(Text, nPos, nCount, rLocale); +} + +ParseResult SAL_CALL CharacterClassificationImpl::parseAnyToken( + const OUString& Text, sal_Int32 nPos, const Locale& rLocale, + sal_Int32 startCharTokenType, const OUString& userDefinedCharactersStart, + sal_Int32 contCharTokenType, const OUString& userDefinedCharactersCont ) +{ + return getLocaleSpecificCharacterClassification(rLocale)->parseAnyToken(Text, nPos, rLocale, + startCharTokenType,userDefinedCharactersStart, + contCharTokenType, userDefinedCharactersCont); +} + + +ParseResult SAL_CALL CharacterClassificationImpl::parsePredefinedToken( + sal_Int32 nTokenType, const OUString& Text, sal_Int32 nPos, + const Locale& rLocale, sal_Int32 startCharTokenType, + const OUString& userDefinedCharactersStart, sal_Int32 contCharTokenType, + const OUString& userDefinedCharactersCont ) +{ + return getLocaleSpecificCharacterClassification(rLocale)->parsePredefinedToken( + nTokenType, Text, nPos, rLocale, 
startCharTokenType, userDefinedCharactersStart, + contCharTokenType, userDefinedCharactersCont); +} + +bool CharacterClassificationImpl::createLocaleSpecificCharacterClassification(const OUString& serviceName, const Locale& rLocale) +{ + // to share service between same Language but different Country code, like zh_CN and zh_SG + for (size_t l = 0; l < lookupTable.size(); l++) { + cachedItem = lookupTable[l]; + if (serviceName == cachedItem->aName) { + lookupTable.emplace_back( rLocale, serviceName, cachedItem->xCI ); + cachedItem = lookupTable.back(); + return true; + } + } + + Reference < XInterface > xI = m_xContext->getServiceManager()->createInstanceWithContext( + "com.sun.star.i18n.CharacterClassification_" + serviceName, m_xContext); + + Reference < XCharacterClassification > xCI; + if ( xI.is() ) { + xCI.set( xI, UNO_QUERY ); + if (xCI.is()) { + lookupTable.emplace_back( rLocale, serviceName, xCI ); + cachedItem = lookupTable.back(); + return true; + } + } + return false; +} + +Reference < XCharacterClassification > const & +CharacterClassificationImpl::getLocaleSpecificCharacterClassification(const Locale& rLocale) +{ + // reuse instance if locale didn't change + if (cachedItem && cachedItem->equals(rLocale)) + return cachedItem->xCI; + else { + for (const auto & i : lookupTable) { + cachedItem = i; + if (cachedItem->equals(rLocale)) + return cachedItem->xCI; + } + + // Load service with name <base>_<lang>_<country> or + // <base>_<bcp47> and fallbacks. 
+ bool bLoaded = createLocaleSpecificCharacterClassification( + LocaleDataImpl::getFirstLocaleServiceName( rLocale), rLocale); + if (!bLoaded) + { + ::std::vector< OUString > aFallbacks( LocaleDataImpl::getFallbackLocaleServiceNames( rLocale)); + for (const auto& rFallback : aFallbacks) + { + bLoaded = createLocaleSpecificCharacterClassification(rFallback, rLocale); + if (bLoaded) + break; + } + } + if (bLoaded) + return cachedItem->xCI; + else if (xUCI.is()) + { + lookupTable.emplace_back( rLocale, "Unicode", xUCI ); + cachedItem = lookupTable.back(); + return cachedItem->xCI; + } + } + throw RuntimeException(); +} + +OUString SAL_CALL +CharacterClassificationImpl::getImplementationName() +{ + return "com.sun.star.i18n.CharacterClassification"; +} + +sal_Bool SAL_CALL +CharacterClassificationImpl::supportsService(const OUString& rServiceName) +{ + return cppu::supportsService(this, rServiceName); +} + +Sequence< OUString > SAL_CALL +CharacterClassificationImpl::getSupportedServiceNames() +{ + return { "com.sun.star.i18n.CharacterClassification" }; +} + +} + +extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * +com_sun_star_i18n_CharacterClassification_get_implementation( + css::uno::XComponentContext *context, + css::uno::Sequence<css::uno::Any> const &) +{ + return cppu::acquire(new i18npool::CharacterClassificationImpl(context)); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/characterclassification/unoscripttypedetector.cxx b/i18npool/source/characterclassification/unoscripttypedetector.cxx new file mode 100644 index 000000000..afcd2708c --- /dev/null +++ b/i18npool/source/characterclassification/unoscripttypedetector.cxx @@ -0,0 +1,91 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include <unoscripttypedetector.hxx> +#include <cppuhelper/supportsservice.hxx> +#include <i18nutil/scripttypedetector.hxx> + +namespace com::sun::star::uno { class XComponentContext; } + +sal_Int16 SAL_CALL +UnoScriptTypeDetector::getScriptDirection( const OUString& Text, sal_Int32 nPos, sal_Int16 defaultScriptDirection ) +{ + return ScriptTypeDetector::getScriptDirection(Text, nPos, defaultScriptDirection); +} + +// return value '-1' means either the direction on nPos is not same as scriptDirection or nPos is out of range. +sal_Int32 SAL_CALL +UnoScriptTypeDetector::beginOfScriptDirection( const OUString& Text, sal_Int32 nPos, sal_Int16 direction ) +{ + return ScriptTypeDetector::beginOfScriptDirection(Text, nPos, direction); +} + +sal_Int32 SAL_CALL +UnoScriptTypeDetector::endOfScriptDirection( const OUString& Text, sal_Int32 nPos, sal_Int16 direction ) +{ + return ScriptTypeDetector::endOfScriptDirection(Text, nPos, direction); +} + +sal_Int16 SAL_CALL +UnoScriptTypeDetector::getCTLScriptType( const OUString& Text, sal_Int32 nPos ) +{ + return ScriptTypeDetector::getCTLScriptType(Text, nPos); +} + +// Begin of Script Type is inclusive. 
+sal_Int32 SAL_CALL +UnoScriptTypeDetector::beginOfCTLScriptType( const OUString& Text, sal_Int32 nPos ) +{ + return ScriptTypeDetector::beginOfCTLScriptType(Text, nPos); +} + +// End of the Script Type is exclusive, the return value pointing to the begin of next script type +sal_Int32 SAL_CALL +UnoScriptTypeDetector::endOfCTLScriptType( const OUString& Text, sal_Int32 nPos ) +{ + return ScriptTypeDetector::endOfCTLScriptType(Text, nPos); +} + +OUString SAL_CALL +UnoScriptTypeDetector::getImplementationName() +{ + return "com.sun.star.i18n.ScriptTypeDetector"; +} + +sal_Bool SAL_CALL +UnoScriptTypeDetector::supportsService(const OUString& ServiceName) +{ + return cppu::supportsService(this, ServiceName); +} + +css::uno::Sequence< OUString > SAL_CALL +UnoScriptTypeDetector::getSupportedServiceNames() +{ + return { "com.sun.star.i18n.ScriptTypeDetector" }; +} + +extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * +com_sun_star_i18n_ScriptTypeDetector_get_implementation( + css::uno::XComponentContext *, + css::uno::Sequence<css::uno::Any> const &) +{ + return cppu::acquire(new UnoScriptTypeDetector); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |