diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 05:54:39 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-15 05:54:39 +0000 |
commit | 267c6f2ac71f92999e969232431ba04678e7437e (patch) | |
tree | 358c9467650e1d0a1d7227a21dac2e3d08b622b2 /i18npool/source/search | |
parent | Initial commit. (diff) | |
download | libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip |
Adding upstream version 4:24.2.0.upstream/4%24.2.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | i18npool/source/search/i18nsearch.component | 27 | ||||
-rw-r--r-- | i18npool/source/search/levdis.cxx | 373 | ||||
-rw-r--r-- | i18npool/source/search/levdis.hxx | 211 | ||||
-rw-r--r-- | i18npool/source/search/textsearch.cxx | 1567 | ||||
-rw-r--r-- | i18npool/source/search/textsearch.hxx | 158 |
5 files changed, 2336 insertions, 0 deletions
diff --git a/i18npool/source/search/i18nsearch.component b/i18npool/source/search/i18nsearch.component new file mode 100644 index 0000000000..fe7f48c265 --- /dev/null +++ b/i18npool/source/search/i18nsearch.component @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + --> + +<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@" + xmlns="http://openoffice.org/2010/uno-components"> + <implementation name="com.sun.star.util.TextSearch_i18n" + constructor="i18npool_TextSearch_get_implementation"> + <service name="com.sun.star.util.TextSearch"/> + <service name="com.sun.star.util.TextSearch2"/> + </implementation> +</component> diff --git a/i18npool/source/search/levdis.cxx b/i18npool/source/search/levdis.cxx new file mode 100644 index 0000000000..dd9f8fbf58 --- /dev/null +++ b/i18npool/source/search/levdis.cxx @@ -0,0 +1,373 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +/* + + Weighted Levenshtein Distance + including wildcards + '*' for any number (0 or more) of arbitrary characters + '?' for exactly one arbitrary character + escapable with backslash, "\*" or "\?" + + Return: + WLD if WLD <= nLimit, else nLimit+1 + + or, if bSplitCount: + WLD if WLD <= nLimit + -WLD if Replace and Insert and Delete <= nLimit + else nLimit+1 + + Recursive definition of WLD: + + WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) , + WLD( X(i) , Y(j-1) ) + q , + WLD( X(i-1), Y(j) ) + r ) + + X(i) := the first i characters of the word X + Y(j) := the first j characters of the word Y + p(i,j) := 0 if i-th character of X == j-th character of Y, + p else + + Boundary conditions: + WLD( X(0), Y(j) ) := j*q (Y created by j inserts) + WLD( X(i), Y(0) ) := i*r (Y created by i deletes) + WLD( X(0), Y(0) ) := 0 + + Instead of recursions a dynamic algorithm is used. + + See also: German computer magazine + c't 07/89 pages 192-208 and c't 03/94 pages 230-239 +*/ + +#include <algorithm> +#include <numeric> + +#include "levdis.hxx" + +#define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit +#define LEVDISDOUBLEBUF 2048 // no doubling atop this border + +static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr ) +{ + const sal_Unicode* pTempStr = pStr; + while( *pTempStr ) + pTempStr++; + return static_cast<sal_Int32>(pTempStr-pStr); +} + +// Distance from string to pattern +int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen ) +{ + int nSPMin = 0; // penalty point Minimum + int nRepS = 0; // for SplitCount + + // length difference between pattern and string + int nLenDiff = nPatternLen - nStars - nStringLen; + // more insertions or deletions necessary as the limit? Then leave + if ( (nLenDiff * nInsQ0 > nLimit) + || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) ) + return LEVDISBIG; + + // comparative String greater than instantaneous array + // -> adapt array size + if ( nStringLen >= nArrayLen ) + { + // increase size much more to avoid reallocation + if ( nStringLen < LEVDISDOUBLEBUF ) + nArrayLen = 2 * nStringLen; + else + nArrayLen = nStringLen + 1; + npDistance = aDisMem.NewMem( nArrayLen ); + } + + // Calculate start values of the second column (first pattern value). + // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0, + // therefore the minimum is 0 + if ( nPatternLen == 0 ) + { + // Count of deletions to reach pattern + for ( sal_Int32 i=0; i <= nStringLen; i++ ) + npDistance[i] = i * nDelR0; + } + else if ( cpPattern[0] == '*' && bpPatIsWild[0] ) + { + // instead of a '*' you can fit in anything + for ( sal_Int32 i=0; i <= nStringLen; i++ ) + npDistance[i] = 0; + } + else + { + sal_Unicode c; + int nP; + c = cpPattern[0]; + if ( c == '?' && bpPatIsWild[0] ) + nP = 0; // a '?' could be any character. + else + // Minimum of replacement and deletion+insertion weighting + nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 }); + npDistance[0] = nInsQ0; // start with simple insert + npDistance[1] = nInsQ0; + npDistance[2] = nInsQ0; + int nReplacePos = -1; // tristate flag + int nDelCnt = 0; + for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 ) + { + if ( cString[i-1] == c ) + nP = 0; // Replace from this position is 0 + // Deletions to match pattern + Replace + npDistance[i] = nDelCnt + nP; + if ( bSplitCount ) + { + if ( nReplacePos < 0 && nP ) + { // this position will be replaced + nRepS++; + nReplacePos = i; + } + else if ( nReplacePos > 0 && !nP ) + { + // same count of c + int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen ); + if ( !nBalance ) + { // one was replaced that was an insertion instead + nRepS--; + nReplacePos = 0; + } + } + } + } + nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] }); + } + + // calculate distance matrix + sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached + while ( (j < nPatternLen-1) + && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) ) + { + sal_Unicode c; + int nP, nQ, nR, nPij, d2; + + j++; + c = cpPattern[j]; + if ( bpPatIsWild[j] ) // '*' or '?' not escaped + nP = 0; // could be replaced without penalty + else + nP = nRepP0; + if ( c == '*' && bpPatIsWild[j] ) + { + nQ = 0; // insertion and deletion without penalty + nR = 0; + } + else + { + nQ = nInsQ0; // usual weighting + nR = nDelR0; + } + d2 = npDistance[0]; + // increase insert count to get from null string to pattern + npDistance[0] = npDistance[0] + nQ; + nSPMin = npDistance[0]; + int nReplacePos = -1; // tristate flag + // for each pattern column run through the string + for ( sal_Int32 i=1; i <= nStringLen; i++ ) + { + int d1 = d2; // WLD( X(i-1), Y(j-1) ) + d2 = npDistance[i]; // WLD( X(i) , Y(j-1) ) + if ( cString[i-1] == c ) + { + nPij = 0; // p(i,j) + if ( nReplacePos < 0 ) + { + // same count of c + int nBalance = levdisbalance( j, i-1, c, cString, nStringLen ); + if ( !nBalance ) + nReplacePos = 0; // no replacement + } + } + else + nPij = nP; + // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) , + // WLD( X(i) , Y(j-1) ) + q , + // WLD( X(i-1), Y(j) ) + r ) + npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR }); + if ( npDistance[i] < nSPMin ) + nSPMin = npDistance[i]; + if ( bSplitCount ) + { + if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij ) + { // this position will be replaced + nRepS++; + nReplacePos = i; + } + else if ( nReplacePos > 0 && !nPij ) + { + // character is equal in string and pattern + // + // If from this point: + // * pattern and string have the same count of this + // character + // * and character count is the same before this position + // then the replace was none. + // + // Scrambled letters are recognized here and the nRepS + // replacement is withdrawn, whereby the double limit kicks + // in. + + // Same count of c + int nBalance = levdisbalance( j, i-1, c, cString, nStringLen ); + if ( !nBalance ) + { // one was replaced that was an insertion instead + nRepS--; + nReplacePos = 0; + } + } + } + } + } + if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) ) + return npDistance[nStringLen]; + else + { + if ( bSplitCount ) + { + if ( nRepS && nLenDiff > 0 ) + nRepS -= nLenDiff; // Inserts were counted + if ( (nSPMin <= 2 * nLimit) + && (npDistance[nStringLen] <= 2 * nLimit) + && (nRepS * nRepP0 <= nLimit) ) + return -npDistance[nStringLen]; + return LEVDISBIG; + } + return LEVDISBIG; + } +} + +// Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount +// from user values nOtherX, nShorterY, nLongerZ, bRelaxed +void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed ) +{ + if ( nX < 0 ) nX = 0; // only positive values + if ( nY < 0 ) nY = 0; + if ( nZ < 0 ) nZ = 0; + if (0 == std::min({ nX, nY, nZ })) // at least one 0 + { + int nMid, nMax; + nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max + if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0 + nLimit = nMax; // either 0 or the only one >0 + else // one is 0 + nLimit = std::lcm( nMid, nMax ); + } + else // all three of them are not 0 + nLimit = std::lcm(std::lcm(nX, nY), nZ); + nRepP0 = ( nX ? nLimit / nX : nLimit + 1 ); + nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 ); + nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 ); + bSplitCount = bRelaxed; +} + +// The value in the middle +int WLevDistance::Mid3( int x, int y, int z ) +{ + int min = std::min({ x, y, z }); + if ( x == min ) + return std::min(y, z); + else if ( y == min ) + return std::min(x, z); + else // z == min + return std::min(x, y); +} + +// initialize data from CTOR +void WLevDistance::InitData( const sal_Unicode* cPattern ) +{ + cpPattern = aPatMem.GetcPtr(); + bpPatIsWild = aPatMem.GetbPtr(); + npDistance = aDisMem.GetPtr(); + nStars = 0; + const sal_Unicode* cp1 = cPattern; + sal_Unicode* cp2 = cpPattern; + bool* bp = bpPatIsWild; + // copy pattern, count asterisks, escaped Jokers + while ( *cp1 ) + { + if ( *cp1 == '\\' ) // maybe escaped + { + if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker? + { + cp1++; // skip '\\' + nPatternLen--; + } + *bp++ = false; + } + else if ( *cp1 == '*' || *cp1 == '?' ) // Joker + { + if ( *cp1 == '*' ) + nStars++; + *bp++ = true; + } + else + *bp++ = false; + *cp2++ = *cp1++; + } + *cp2 = '\0'; +} + +WLevDistance::WLevDistance( const sal_Unicode* cPattern, + int nOtherX, int nShorterY, int nLongerZ, + bool bRelaxed ) : + nPatternLen( Impl_WLD_StringLen(cPattern) ), + aPatMem( nPatternLen + 1 ), + nArrayLen( nPatternLen + 1 ), + aDisMem( nArrayLen ) +{ + InitData( cPattern ); + CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed ); +} + +// CopyCTor +WLevDistance::WLevDistance( const WLevDistance& rWLD ) : + nPatternLen( rWLD.nPatternLen ), + aPatMem( nPatternLen + 1 ), + nArrayLen( nPatternLen + 1 ), + aDisMem( nArrayLen ), + nLimit( rWLD.nLimit ), + nRepP0( rWLD.nRepP0 ), + nInsQ0( rWLD.nInsQ0 ), + nDelR0( rWLD.nDelR0 ), + nStars( rWLD.nStars ), + bSplitCount( rWLD.bSplitCount ) +{ + cpPattern = aPatMem.GetcPtr(); + bpPatIsWild = aPatMem.GetbPtr(); + npDistance = aDisMem.GetPtr(); + sal_Int32 i; + for ( i=0; i<nPatternLen; i++ ) + { + cpPattern[i] = rWLD.cpPattern[i]; + bpPatIsWild[i] = rWLD.bpPatIsWild[i]; + } + cpPattern[i] = '\0'; +} + +// DTor +WLevDistance::~WLevDistance() +{ +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/search/levdis.hxx b/i18npool/source/search/levdis.hxx new file mode 100644 index 0000000000..b378c03cff --- /dev/null +++ b/i18npool/source/search/levdis.hxx @@ -0,0 +1,211 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#pragma once + +#include <sal/types.h> +#include <memory> + +// Sensible default values for a user interface could be: +// LEVDISDEFAULT_XOTHER 2 +// Maximum X replacements to match query, found data may be different by X +// characters from query. +// LEVDISDEFAULT_YSHORTER 1 +// Maximum Y insertions to match query, found data may be Y characters +// shorter than query. +// LEVDISDEFAULT_ZLONGER 3 +// Maximum Z deletions to match query, found data may be Z characters +// longer than query. +// LEVDISDEFAULT_RELAXED TRUE +// Use relaxed SplitCount instead of mathematical WLD. +// +// Joker/wildcards ('?' and '*') of course do not count as +// replacement/insertion/deletion. At a '?' a replacement is not counted, for a +// '*' the found data may be any number of characters longer than the query. +// +// Strict mathematical WLD: EITHER maximum X replacements OR Y characters +// shorter OR Z characters longer. +// Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR +// Z characters longer. Any combination of actions is valid. +// +// The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed +// integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736. +// +// The corresponding internal default weigh values for these user interface +// values would be: +// LEVDISDEFAULTLIMIT 6 +// Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2 +// LEVDISDEFAULT_P0 3 +// Default nRepP0, weight of replacements. +// LEVDISDEFAULT_Q0 6 +// Default nInsQ0, weight of insertions. +// LEVDISDEFAULT_R0 2 +// Default nDelR0, weight of deletions. + +// The transformation of user input values to weighs is done using CalcLPQR(). +// One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ +// characters longer than pattern) then no character can be replaced any more. +// This can be circumvented by increasing nX or/and nZ, but of course with the +// side effect of being less strict then... or the other solution is to use +// relaxed SplitCount (see below), which is the default when using CalcLPQR(). +// +// Attention: shorter = WLD.Insert, longer = WLD.Delete +// +// View and counting is always from data string to pattern, a deletion means +// that something is deleted from data to match pattern. +// +// Deletions weigh less in this example because usually less is known than is +// searched for. Replacements get middle weight, for example because of +// misspellings. Insertions are expensive. +// +// Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3 +// Allowed are maximum 4 replacements, no insertion, no deletion. +// Matches the user interface values X = 3, Y = 0, Z = 0 +// +// bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value +// of WLD() then isn't necessarily the Levenshtein-Distance, but can be +// negative (-WLD) if the WLD is greater than nLimit but single values are +// within the limits. +// For the above default values that could mean: even if the found string is +// already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made +// to reach pattern. Additionally, character swaps count as one replacement. +// Mathematically completely incorrect, but meets user expectations ;-) +// +// Explanation: in the real WLD all actions are withdrawn from a common 100% +// pool, if one gets all there's nothing left for others. With bSplitCount +// replacements have their own pool. + + +/** "Safe" memory allocation in ctor */ +class WLevDisPatternMem +{ + std::unique_ptr<sal_Unicode[]> cp; + std::unique_ptr<bool[]> bp; +public: + explicit WLevDisPatternMem( sal_Int32 s ) + : cp(new sal_Unicode[s]) + , bp(new bool[s]) + { + } + + sal_Unicode* GetcPtr() const { return cp.get(); } + bool* GetbPtr() const { return bp.get(); } +}; + +class WLevDisDistanceMem +{ + std::unique_ptr<int[]> p; +public: + explicit WLevDisDistanceMem( size_t s ) + { + NewMem(s); + } + int* GetPtr() const { return p.get(); } + int* NewMem( size_t s ) + { + p.reset(new int[ s<3 ? 3 : s ]); + return p.get(); + } +}; + +/** Weighted Levenshtein Distance (WLD) + + For a more detailed explanation see documentation in + i18npool/source/search/levdis.hxx + */ +class WLevDistance +{ + sal_Int32 nPatternLen; ///< length of pattern + WLevDisPatternMem aPatMem; ///< manage allocation of pattern array + sal_Unicode* cpPattern; ///< pointer to pattern array + bool* bpPatIsWild; ///< pointer to bool array whether pattern is wildcard + sal_Int32 nArrayLen; ///< length of distance array + WLevDisDistanceMem aDisMem; ///< manage allocation of distance array + int* npDistance; ///< pointer to distance array + int nLimit; ///< WLD limit replacements/insertions/deletions + int nRepP0; ///< replacement weigh + int nInsQ0; ///< insertion weigh + int nDelR0; ///< deletion weigh + int nStars; ///< count of '*' wildcards in pattern + bool bSplitCount; ///< if TRUE, Rep/Ins/Del are counted separately + + void InitData( const sal_Unicode* cPattern ); + static int Mid3( int x, int y, int z ); ///< middle value of 3 values + +public: + + /** CTor with user input. Internally calls CalcLPQR(). + + After this, obtain the resulting limit using GetLimit(). + + @param bRelaxed the mathematically incorrect method is default (TRUE) + */ + WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY, + int nLongerZ, bool bRelaxed ); + + WLevDistance( const WLevDistance& rWLD ); + ~WLevDistance(); + + /** Calculate the Weighted Levenshtein Distance from string to pattern. */ + int WLD( const sal_Unicode* cString, sal_Int32 nStringLen ); + + /** Calculate the internal weighs corresponding to the user input values. + @returns nLimit for later comparison with WLD() + */ + void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ, + bool bRelaxed ); + + int GetLimit() const { return nLimit; } + + // Calculate current balance, keep this inline for performance reasons! + // c == cpPattern[jj] == cString[ii] + // First seek up to found place, if the balance is still equal there then + // also compare after the found place. + int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const + { + int nBalance = 0; + + if ( jj != ii ) + { + sal_Int32 k; + if ( jj > 0 ) + for ( k=0; k < jj; k++ ) + if ( cpPattern[k] == c ) + nBalance++; + if ( ii > 0 ) + for ( k=0; k < ii; k++ ) + if ( cString[k] == c ) + nBalance--; + if ( !nBalance ) + { + for ( k=jj+1; k < nPatternLen; k++ ) + if ( cpPattern[k] == c ) + nBalance++; + for ( k=ii+1; k < nStringLen; k++ ) + if ( cString[k] == c ) + nBalance--; + } + } + + return nBalance; + } +}; + + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx new file mode 100644 index 0000000000..a16c3e1cc4 --- /dev/null +++ b/i18npool/source/search/textsearch.cxx @@ -0,0 +1,1567 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#include "textsearch.hxx" +#include "levdis.hxx" +#include <com/sun/star/i18n/BreakIterator.hpp> +#include <com/sun/star/util/SearchAlgorithms2.hpp> +#include <com/sun/star/util/SearchFlags.hpp> +#include <com/sun/star/i18n/WordType.hpp> +#include <com/sun/star/i18n/ScriptType.hpp> +#include <com/sun/star/i18n/CharacterIteratorMode.hpp> +#include <com/sun/star/i18n/CharacterClassification.hpp> +#include <com/sun/star/i18n/KCharacterType.hpp> +#include <com/sun/star/i18n/Transliteration.hpp> +#include <cppuhelper/supportsservice.hxx> +#include <cppuhelper/weak.hxx> +#include <i18nutil/transliteration.hxx> +#include <rtl/ustrbuf.hxx> +#include <sal/log.hxx> + +#include <unicode/regex.h> + +using namespace ::com::sun::star::util; +using namespace ::com::sun::star::uno; +using namespace ::com::sun::star::lang; +using namespace ::com::sun::star::i18n; +using namespace ::com::sun::star; + +const TransliterationFlags COMPLEX_TRANS_MASK = + TransliterationFlags::ignoreBaFa_ja_JP | + TransliterationFlags::ignoreIterationMark_ja_JP | + TransliterationFlags::ignoreTiJi_ja_JP | + TransliterationFlags::ignoreHyuByu_ja_JP | + TransliterationFlags::ignoreSeZe_ja_JP | + TransliterationFlags::ignoreIandEfollowedByYa_ja_JP | + TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP | + TransliterationFlags::ignoreProlongedSoundMark_ja_JP; + +namespace +{ +TransliterationFlags maskComplexTrans( TransliterationFlags n ) +{ + // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect + // in complex transliteration. + return + n & (COMPLEX_TRANS_MASK | // all set ignore bits + TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit + TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value +} + +bool isComplexTrans( TransliterationFlags n ) +{ + return bool(n & COMPLEX_TRANS_MASK); +} + +TransliterationFlags maskSimpleTrans( TransliterationFlags n ) +{ + return n & ~COMPLEX_TRANS_MASK; +} + +bool isSimpleTrans( TransliterationFlags n ) +{ + return bool(maskSimpleTrans(n)); +} + +// Regex patterns are case sensitive. +TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n ) +{ + TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE; + TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK; + if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE) + v = TransliterationFlags::NONE; + return (m | v) & ~COMPLEX_TRANS_MASK; +} + +bool isSimpleRegexTrans( TransliterationFlags n ) +{ + return bool(maskSimpleRegexTrans(n)); +} +}; + +TextSearch::TextSearch(const Reference < XComponentContext > & rxContext) + : m_xContext( rxContext ) +{ + SearchOptions2 aOpt; + aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE; + aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; + aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; + //aOpt.Locale = ???; + setOptions( aOpt ); +} + +TextSearch::~TextSearch() +{ + pRegexMatcher.reset(); + pWLD.reset(); + pJumpTable.reset(); + pJumpTable2.reset(); +} + +void TextSearch::setOptions2( const SearchOptions2& rOptions ) +{ + std::unique_lock g(m_aMutex); + + aSrchPara = rOptions; + + pRegexMatcher.reset(); + pWLD.reset(); + pJumpTable.reset(); + pJumpTable2.reset(); + maWildcardReversePattern.clear(); + maWildcardReversePattern2.clear(); + TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags); + bSearchApostrophe = false; + bool bReplaceApostrophe = false; + if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP) + { + // RESrchPrepare will consider aSrchPara.transliterateFlags when + // picking the actual regex pattern + // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting + // case-insensitivity. Create transliteration instance, if any, without + // ignore-case so later in TextSearch::searchForward() the string to + // match is not case-altered, leave case-(in)sensitive to regex engine. + transliterateFlags &= ~TransliterationFlags::IGNORE_CASE; + } + else if ( aSrchPara.searchString.indexOf('\'') > - 1 ) + { + bSearchApostrophe = true; + bReplaceApostrophe = aSrchPara.searchString.indexOf(u'\u2019') > -1; + } + + // Create Transliteration class + if( isSimpleTrans( transliterateFlags) ) + { + if( !xTranslit.is() ) + xTranslit.set( Transliteration::create( m_xContext ) ); + xTranslit->loadModule( + static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)), + aSrchPara.Locale); + } + else if( xTranslit.is() ) + xTranslit = nullptr; + + // Create Transliteration for 2<->1, 2<->2 transliteration + if ( isComplexTrans( transliterateFlags) ) + { + if( !xTranslit2.is() ) + xTranslit2.set( Transliteration::create( m_xContext ) ); + // Load transliteration module + xTranslit2->loadModule( + static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)), + aSrchPara.Locale); + } + + if ( !xBreak.is() ) + xBreak = css::i18n::BreakIterator::create( m_xContext ); + + sSrchStr = aSrchPara.searchString; + + // Transliterate search string. + if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP) + { + if (isSimpleRegexTrans(transliterateFlags)) + { + if (maskSimpleRegexTrans(transliterateFlags) != + maskSimpleTrans(transliterateFlags)) + { + css::uno::Reference< XExtendedTransliteration > xTranslitPattern( + Transliteration::create( m_xContext )); + if (xTranslitPattern.is()) + { + xTranslitPattern->loadModule( + static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)), + aSrchPara.Locale); + sSrchStr = xTranslitPattern->transliterateString2String( + aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); + } + } + else + { + if (xTranslit.is()) + sSrchStr = xTranslit->transliterateString2String( + aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); + } + // xTranslit2 complex transliterated sSrchStr2 is not used in + // regex, see TextSearch::searchForward() and + // TextSearch::searchBackward() + } + } + else + { + if ( xTranslit.is() && isSimpleTrans(transliterateFlags) ) + sSrchStr = xTranslit->transliterateString2String( + aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); + + if ( xTranslit2.is() && isComplexTrans(transliterateFlags) ) + sSrchStr2 = xTranslit2->transliterateString2String( + aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); + } + + if ( bReplaceApostrophe ) + sSrchStr = sSrchStr.replace(u'\u2019', '\''); + + // Take the new SearchOptions2::AlgorithmType2 field and ignore + // SearchOptions::algorithmType + switch( aSrchPara.AlgorithmType2) + { + case SearchAlgorithms2::REGEXP: + fnForward = &TextSearch::RESrchFrwrd; + fnBackward = &TextSearch::RESrchBkwrd; + RESrchPrepare( aSrchPara); + break; + + case SearchAlgorithms2::APPROXIMATE: + fnForward = &TextSearch::ApproxSrchFrwrd; + fnBackward = &TextSearch::ApproxSrchBkwrd; + + pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, + aSrchPara.insertedChars, aSrchPara.deletedChars, + 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) ); + + nLimit = pWLD->GetLimit(); + break; + + case SearchAlgorithms2::WILDCARD: + mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter); + mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0); + fnForward = &TextSearch::WildcardSrchFrwrd; + fnBackward = &TextSearch::WildcardSrchBkwrd; + break; + + default: + SAL_WARN("i18npool","TextSearch::setOptions2 - default what?"); + [[fallthrough]]; + case SearchAlgorithms2::ABSOLUTE: + fnForward = &TextSearch::NSrchFrwrd; + fnBackward = &TextSearch::NSrchBkwrd; + break; + } +} + +void TextSearch::setOptions( const SearchOptions& rOptions ) +{ + sal_Int16 nAlgorithmType2; + switch (rOptions.algorithmType) + { + case SearchAlgorithms_REGEXP: + nAlgorithmType2 = SearchAlgorithms2::REGEXP; + break; + case SearchAlgorithms_APPROXIMATE: + nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE; + break; + default: + SAL_WARN("i18npool","TextSearch::setOptions - default what?"); + [[fallthrough]]; + case SearchAlgorithms_ABSOLUTE: + nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE; + break; + } + // It would be nice if an inherited struct had a ctor that takes an + // instance of the object the struct derived from... + SearchOptions2 aOptions2( + rOptions.algorithmType, + rOptions.searchFlag, + rOptions.searchString, + rOptions.replaceString, + rOptions.Locale, + rOptions.changedChars, + rOptions.deletedChars, + rOptions.insertedChars, + rOptions.transliterateFlags, + nAlgorithmType2, + 0 // no wildcard search, no escape character... + ); + setOptions2( aOptions2); +} + +static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) +{ + auto pOff = std::find_if(rOff.begin(), rOff.end(), + [nPos](const sal_Int32 nOff) { return nOff >= nPos; }); + return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff)); +} + +SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) +{ + std::unique_lock g(m_aMutex); + + SearchResult sres; + + OUString in_str(searchStr); + + // in non-regex mode, allow searching typographical apostrophe with the ASCII one + // to avoid regression after using automatic conversion to U+2019 during typing in Writer + bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1; + + bUsePrimarySrchStr = true; + + if ( xTranslit.is() ) + { + // apply normal transliteration (1<->1, 1<->0) + + sal_Int32 nInStartPos = startPos; + if (pRegexMatcher && startPos > 0) + { + // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet + // transliterate enough of the leading text to allow sensible look-behind assertions. + // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit. + // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax. + // When search regex doesn't start with an assertion, 3 is to allow startPos to be in + // the middle of a surrogate pair, preceded by another surrogate pair. + const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3; + nInStartPos -= std::min(nMaxLeadingLen, startPos); + } + sal_Int32 nInEndPos = endPos; + if (pRegexMatcher && endPos < searchStr.getLength()) + { + // tdf#65038: ditto for look-ahead assertions + const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3; + nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos); + } + + css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos); + in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset); + + if ( bReplaceApostrophe ) + in_str = in_str.replace(u'\u2019', '\''); + + // JP 20.6.2001: also the start and end positions must be corrected! + sal_Int32 newStartPos = + (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos ); + + sal_Int32 newEndPos = (endPos < searchStr.getLength()) + ? FindPosInSeq_Impl( offset, endPos ) + : in_str.getLength(); + + sres = (this->*fnForward)( in_str, newStartPos, newEndPos ); + + // Map offsets back to untransliterated string. + const sal_Int32 nOffsets = offset.getLength(); + if (nOffsets) + { + auto sres_startOffsetRange = asNonConstRange(sres.startOffset); + auto sres_endOffsetRange = asNonConstRange(sres.endOffset); + // For regex nGroups is the number of groups+1 with group 0 being + // the entire match. + const sal_Int32 nGroups = sres.startOffset.getLength(); + for ( sal_Int32 k = 0; k < nGroups; k++ ) + { + const sal_Int32 nStart = sres.startOffset[k]; + // Result offsets are negative (-1) if a group expression was + // not matched. + if (nStart >= 0) + sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1)); + // JP 20.6.2001: end is ever exclusive and then don't return + // the position of the next character - return the + // next position behind the last found character! + // "a b c" find "b" must return 2,3 and not 2,4!!! + const sal_Int32 nStop = sres.endOffset[k]; + if (nStop >= 0) + { + if (nStop > 0) + sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1; + else + sres_endOffsetRange[k] = offset[0]; + } + } + } + } + else + { + if ( bReplaceApostrophe ) + in_str = in_str.replace(u'\u2019', '\''); + + sres = (this->*fnForward)( in_str, startPos, endPos ); + } + + if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP) + { + SearchResult sres2; + + in_str = searchStr; + css::uno::Sequence <sal_Int32> offset( in_str.getLength()); + + in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); + + if( startPos ) + startPos = FindPosInSeq_Impl( offset, startPos ); + + if( endPos < searchStr.getLength() ) + endPos = FindPosInSeq_Impl( offset, endPos ); + else + endPos = in_str.getLength(); + + bUsePrimarySrchStr = false; + sres2 = (this->*fnForward)( in_str, startPos, endPos ); + auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset); + auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset); + + for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) + { + if (sres2.startOffset[k]) + sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1; + if (sres2.endOffset[k]) + sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1; + } + + // pick first and long one + if ( sres.subRegExpressions == 0) + return sres2; + if ( sres2.subRegExpressions == 1) + { + if ( sres.startOffset[0] > sres2.startOffset[0]) + return sres2; + else if ( sres.startOffset[0] == sres2.startOffset[0] && + sres.endOffset[0] < sres2.endOffset[0]) + return sres2; + } + } + + return sres; +} + +SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) +{ + std::unique_lock g(m_aMutex); + + SearchResult sres; + + OUString in_str(searchStr); + + // in non-regex mode, allow searching typographical apostrophe with the ASCII one + // to avoid regression after using automatic conversion to U+2019 during typing in Writer + bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1; + + bUsePrimarySrchStr = true; + + if ( xTranslit.is() ) + { + // apply only simple 1<->1 transliteration here + css::uno::Sequence<sal_Int32> offset(startPos - endPos); + in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset ); + + if ( bReplaceApostrophe ) + in_str = in_str.replace(u'\u2019', '\''); + + // JP 20.6.2001: also the start and end positions must be corrected! + sal_Int32 const newStartPos = (startPos < searchStr.getLength()) + ? FindPosInSeq_Impl( offset, startPos ) + : in_str.getLength(); + + sal_Int32 const newEndPos = + (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos ); + + // TODO: this would need nExtraOffset handling to avoid $ matching + // if (pRegexMatcher && startPos < searchStr.getLength()) + // but that appears to be impossible with ICU regex + + sres = (this->*fnBackward)( in_str, newStartPos, newEndPos ); + + // Map offsets back to untransliterated string. + const sal_Int32 nOffsets = offset.getLength(); + if (nOffsets) + { + auto sres_startOffsetRange = asNonConstRange(sres.startOffset); + auto sres_endOffsetRange = asNonConstRange(sres.endOffset); + // For regex nGroups is the number of groups+1 with group 0 being + // the entire match. + const sal_Int32 nGroups = sres.startOffset.getLength(); + for ( sal_Int32 k = 0; k < nGroups; k++ ) + { + const sal_Int32 nStart = sres.startOffset[k]; + // Result offsets are negative (-1) if a group expression was + // not matched. + if (nStart >= 0) + { + if (nStart > 0) + sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1; + else + sres_startOffsetRange[k] = offset[0]; + } + // JP 20.6.2001: end is ever exclusive and then don't return + // the position of the next character - return the + // next position behind the last found character! + // "a b c" find "b" must return 2,3 and not 2,4!!! + const sal_Int32 nStop = sres.endOffset[k]; + if (nStop >= 0) + sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1)); + } + } + } + else + { + if ( bReplaceApostrophe ) + in_str = in_str.replace(u'\u2019', '\''); + + sres = (this->*fnBackward)( in_str, startPos, endPos ); + } + + if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP ) + { + SearchResult sres2; + + in_str = searchStr; + css::uno::Sequence <sal_Int32> offset( in_str.getLength()); + + in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); + + if( startPos < searchStr.getLength() ) + startPos = FindPosInSeq_Impl( offset, startPos ); + else + startPos = in_str.getLength(); + + if( endPos ) + endPos = FindPosInSeq_Impl( offset, endPos ); + + bUsePrimarySrchStr = false; + sres2 = (this->*fnBackward)( in_str, startPos, endPos ); + auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset); + auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset); + + for( int k = 0; k < sres2.startOffset.getLength(); k++ ) + { + if (sres2.startOffset[k]) + sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1; + if (sres2.endOffset[k]) + sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1; + } + + // pick last and long one + if ( sres.subRegExpressions == 0 ) + return sres2; + if ( sres2.subRegExpressions == 1 ) + { + if ( sres.startOffset[0] < sres2.startOffset[0] ) + return sres2; + if ( sres.startOffset[0] == sres2.startOffset[0] && + sres.endOffset[0] > sres2.endOffset[0] ) + return sres2; + } + } + + return sres; +} + + +bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const +{ + bool bRet = true; + if( '\x7f' != rStr[nPos]) + { + if ( !xCharClass.is() ) + xCharClass = CharacterClassification::create( m_xContext ); + sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, + aSrchPara.Locale ); + if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | + KCharacterType::LETTER ) & nCType ) ) + bRet = false; + } + return bRet; +} + +// --------- helper methods for Boyer-Moore like text searching ---------- +// TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available + +void TextSearch::MakeForwardTab() +{ + // create the jumptable for the search text + + if( pJumpTable && bIsForwardTab ) + { + return; // the jumpTable is ok + } + bIsForwardTab = true; + + sal_Int32 n, nLen = sSrchStr.getLength(); + pJumpTable.reset( new TextSearchJumpTable ); + + for( n = 0; n < nLen - 1; ++n ) + { + sal_Unicode cCh = sSrchStr[n]; + sal_Int32 nDiff = nLen - n - 1; + TextSearchJumpTable::value_type aEntry( cCh, nDiff ); + + ::std::pair< TextSearchJumpTable::iterator, bool > aPair = + pJumpTable->insert( aEntry ); + if ( !aPair.second ) + (*(aPair.first)).second = nDiff; + } +} + +void TextSearch::MakeForwardTab2() +{ + // create the jumptable for the search text + if( pJumpTable2 && bIsForwardTab ) + { + return; // the jumpTable is ok + } + bIsForwardTab = true; + + sal_Int32 n, nLen = sSrchStr2.getLength(); + pJumpTable2.reset( new TextSearchJumpTable ); + + for( n = 0; n < nLen - 1; ++n ) + { + sal_Unicode cCh = sSrchStr2[n]; + sal_Int32 nDiff = nLen - n - 1; + + TextSearchJumpTable::value_type aEntry( cCh, nDiff ); + ::std::pair< TextSearchJumpTable::iterator, bool > aPair = + pJumpTable2->insert( aEntry ); + if ( !aPair.second ) + (*(aPair.first)).second = nDiff; + } +} + +void TextSearch::MakeBackwardTab() +{ + // create the jumptable for the search text + if( pJumpTable && !bIsForwardTab) + { + return; // the jumpTable is ok + } + bIsForwardTab = false; + + sal_Int32 n, nLen = sSrchStr.getLength(); + pJumpTable.reset( new TextSearchJumpTable ); + + for( n = nLen-1; n > 0; --n ) + { + sal_Unicode cCh = sSrchStr[n]; + TextSearchJumpTable::value_type aEntry( cCh, n ); + ::std::pair< TextSearchJumpTable::iterator, bool > aPair = + pJumpTable->insert( aEntry ); + if ( !aPair.second ) + (*(aPair.first)).second = n; + } +} + +void TextSearch::MakeBackwardTab2() +{ + // create the jumptable for the search text + if( pJumpTable2 && !bIsForwardTab ) + { + return; // the jumpTable is ok + } + bIsForwardTab = false; + + sal_Int32 n, nLen = sSrchStr2.getLength(); + pJumpTable2.reset( new TextSearchJumpTable ); + + for( n = nLen-1; n > 0; --n ) + { + sal_Unicode cCh = sSrchStr2[n]; + TextSearchJumpTable::value_type aEntry( cCh, n ); + ::std::pair< TextSearchJumpTable::iterator, bool > aPair = + pJumpTable2->insert( aEntry ); + if ( !aPair.second ) + (*(aPair.first)).second = n; + } +} + +sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const +{ + TextSearchJumpTable *pJump; + OUString sSearchKey; + + if ( bUsePrimarySrchStr ) { + pJump = pJumpTable.get(); + sSearchKey = sSrchStr; + } else { + pJump = pJumpTable2.get(); + sSearchKey = sSrchStr2; + } + + TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); + if ( iLook == pJump->end() ) + return sSearchKey.getLength(); + return (*iLook).second; +} + + +SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) +{ + SearchResult aRet; + aRet.subRegExpressions = 0; + + OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; + + sal_Int32 nSuchIdx = searchStr.getLength(); + sal_Int32 nEnd = endPos; + if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) + return aRet; + + + if( nEnd < sSearchKey.getLength() ) // position inside the search region ? + return aRet; + + nEnd -= sSearchKey.getLength(); + + if (bUsePrimarySrchStr) + MakeForwardTab(); // create the jumptable + else + MakeForwardTab2(); + + for (sal_Int32 nCmpIdx = startPos; // start position for the search + nCmpIdx <= nEnd; + nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1])) + { + nSuchIdx = sSearchKey.getLength() - 1; + while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx]) + { + if( nSuchIdx == 0 ) + { + if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) + { + sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); + bool bAtStart = !nCmpIdx; + bool bAtEnd = nFndEnd == endPos; + bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 ); + bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd ); + // * 1 -> only one word in the paragraph + // * 2 -> at begin of paragraph + // * 3 -> at end of paragraph + // * 4 -> inside the paragraph + if( !( ( bAtStart && bAtEnd ) || // 1 + ( bAtStart && bDelimBehind ) || // 2 + ( bAtEnd && bDelimBefore ) || // 3 + ( bDelimBefore && bDelimBehind ))) // 4 + break; + } + + aRet.subRegExpressions = 1; + aRet.startOffset = { nCmpIdx }; + aRet.endOffset = { nCmpIdx + sSearchKey.getLength() }; + + return aRet; + } + else + nSuchIdx--; + } + } + return aRet; +} + +SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) +{ + SearchResult aRet; + aRet.subRegExpressions = 0; + + OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; + + sal_Int32 nSuchIdx = searchStr.getLength(); + sal_Int32 nEnd = endPos; + if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx) + return aRet; + + if (bUsePrimarySrchStr) + MakeBackwardTab(); // create the jumptable + else + MakeBackwardTab2(); + + if( nEnd == nSuchIdx ) // end position for the search + nEnd = sSearchKey.getLength(); + else + nEnd += sSearchKey.getLength(); + + sal_Int32 nCmpIdx = startPos; // start position for the search + + while (nCmpIdx >= nEnd) + { + nSuchIdx = 0; + while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == + searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) + nSuchIdx++; + if( nSuchIdx >= sSearchKey.getLength() ) + { + if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) + { + sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); + bool bAtStart = !nFndStt; + bool bAtEnd = nCmpIdx == startPos; + bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx ); + bool bDelimBefore = bAtStart || // begin of paragraph + IsDelimiter( searchStr, nFndStt-1 ); + // * 1 -> only one word in the paragraph + // * 2 -> at begin of paragraph + // * 3 -> at end of paragraph + // * 4 -> inside the paragraph + if( ( bAtStart && bAtEnd ) || // 1 + ( bAtStart && bDelimBehind ) || // 2 + ( bAtEnd && bDelimBefore ) || // 3 + ( bDelimBefore && bDelimBehind )) // 4 + { + aRet.subRegExpressions = 1; + aRet.startOffset = { nCmpIdx }; + aRet.endOffset = { nCmpIdx - sSearchKey.getLength() }; + return aRet; + } + } + else + { + aRet.subRegExpressions = 1; + aRet.startOffset = { nCmpIdx }; + aRet.endOffset = { nCmpIdx - sSearchKey.getLength() }; + return aRet; + } + } + nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] ); + if( nCmpIdx < nSuchIdx ) + return aRet; + nCmpIdx -= nSuchIdx; + } + return aRet; +} + +void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions) +{ + TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags); + // select the transliterated pattern string + const OUString& rPatternStr = + (isSimpleTrans(transliterateFlags) ? sSrchStr + : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString)); + + sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability + // map css::util::SearchFlags to ICU uregex.h flags + // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE + // REG_NEWLINE is neither properly defined nor used anywhere => not implemented + // REG_NOSUB is not used anywhere => not implemented + // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute + // LEV_RELAXED is only used for SearchAlgorithm==Approximate + // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO + // probably because the transliteration flag IGNORE_CASE handles it as well. + if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0 + || (transliterateFlags & TransliterationFlags::IGNORE_CASE)) + nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; + UErrorCode nIcuErr = U_ZERO_ERROR; + // assumption: transliteration didn't mangle regexp control chars + icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength()); +#ifndef DISABLE_WORDBOUND_EMULATION + // for convenience specific syntax elements of the old regex engine are emulated + // - by replacing \< with "word-break followed by a look-ahead word-char" + static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant); + static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant); + static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); + aChevronMatcherB.reset( aIcuSearchPatStr); + aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); + aChevronMatcherB.reset(); + // - by replacing \> with "look-behind word-char followed by a word-break" + static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant); + static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant); + static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); + aChevronMatcherE.reset( aIcuSearchPatStr); + aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); + aChevronMatcherE.reset(); +#endif + pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) ); + if (nIcuErr) + { + SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr); + pRegexMatcher.reset(); + } + else + { + // Pathological patterns may result in exponential run time making the + // application appear to be frozen. Limit that. Documentation for this + // call says + // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00 + // "The units of the limit are steps of the match engine. + // Correspondence with actual processor time will depend on the speed + // of the processor and the details of the specific pattern, but will + // typically be on the order of milliseconds." + // Just what is a good value? 42 is always an answer ... the 23 enigma + // as well... which on the dev's machine is roughly 50 seconds with the + // pattern of fdo#70627. + /* TODO: make this a configuration settable value and possibly take + * complexity of expression into account and maybe even length of text + * to be matched; currently (2013-11-25) that is at most one 64k + * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */ + pRegexMatcher->setTimeLimit( 23*1000, nIcuErr); + } +} + + +static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher, + sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr) +{ + pRegexMatcher->region(nStartPos, nEndPos, rIcuErr); + pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's + pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for + // look-ahead/behind assertions + + if (!pRegexMatcher->find(rIcuErr)) + { + /* TODO: future versions could pass the UErrorCode or translations + * thereof to the caller, for example to inform the user of + * U_REGEX_TIME_OUT. The strange thing though is that an error is set + * only after the second call that returns immediately and not if + * timeout occurred on the first call?!? */ + SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr); + return false; + } + return true; +} + +SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ) +{ + SearchResult aRet; + aRet.subRegExpressions = 0; + if( !pRegexMatcher) + return aRet; + + if( endPos > searchStr.getLength()) + endPos = searchStr.getLength(); + + // use the ICU RegexMatcher to find the matches + UErrorCode nIcuErr = U_ZERO_ERROR; + const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()), + searchStr.getLength()); + pRegexMatcher->reset( aSearchTargetStr); + // search until there is a valid match + for(;;) + { + if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr)) + return aRet; + + // #i118887# ignore zero-length matches e.g. "a*" in "bc" + int nStartOfs = pRegexMatcher->start( nIcuErr); + int nEndOfs = pRegexMatcher->end( nIcuErr); + if( nStartOfs < nEndOfs) + break; + // If the zero-length match is behind the string, do not match it again + // and again until startPos reaches there. A match behind the string is + // a "$" anchor. + if (nStartOfs == endPos) + break; + // try at next position if there was a zero-length match + if( ++startPos >= endPos) + return aRet; + } + + // extract the result of the search + const int nGroupCount = pRegexMatcher->groupCount(); + aRet.subRegExpressions = nGroupCount + 1; + aRet.startOffset.realloc( aRet.subRegExpressions); + auto pstartOffset = aRet.startOffset.getArray(); + aRet.endOffset.realloc( aRet.subRegExpressions); + auto pendOffset = aRet.endOffset.getArray(); + pstartOffset[0] = pRegexMatcher->start( nIcuErr); + pendOffset[0] = pRegexMatcher->end( nIcuErr); + for( int i = 1; i <= nGroupCount; ++i) { + pstartOffset[i] = pRegexMatcher->start( i, nIcuErr); + pendOffset[i] = pRegexMatcher->end( i, nIcuErr); + } + + return aRet; +} + +SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ) +{ + // NOTE: for backwards search callers provide startPos/endPos inverted! + SearchResult aRet; + aRet.subRegExpressions = 0; + if( !pRegexMatcher) + return aRet; + + if( startPos > searchStr.getLength()) + startPos = searchStr.getLength(); + + // use the ICU RegexMatcher to find the matches + // TODO: use ICU's backward searching once it becomes available + // as its replacement using forward search is not as good as the real thing + UErrorCode nIcuErr = U_ZERO_ERROR; + const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()), + searchStr.getLength()); + pRegexMatcher->reset( aSearchTargetStr); + if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr)) + return aRet; + + // find the last match + int nLastPos = 0; + int nFoundEnd = 0; + int nGoodPos = 0, nGoodEnd = 0; + bool bFirst = true; + do { + nLastPos = pRegexMatcher->start( nIcuErr); + nFoundEnd = pRegexMatcher->end( nIcuErr); + if (nLastPos < nFoundEnd) + { + // remember last non-zero-length match + nGoodPos = nLastPos; + nGoodEnd = nFoundEnd; + } + if( nFoundEnd >= startPos) + break; + bFirst = false; + if( nFoundEnd == nLastPos) + ++nFoundEnd; + } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr)); + + // Ignore all zero-length matches except "$" anchor on first match. + if (nGoodPos == nGoodEnd) + { + if (bFirst && nLastPos == startPos) + nGoodPos = nLastPos; + else + return aRet; + } + + // find last match again to get its details + lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr); + + // fill in the details of the last match + const int nGroupCount = pRegexMatcher->groupCount(); + aRet.subRegExpressions = nGroupCount + 1; + aRet.startOffset.realloc( aRet.subRegExpressions); + auto pstartOffset = aRet.startOffset.getArray(); + aRet.endOffset.realloc( aRet.subRegExpressions); + auto pendOffset = aRet.endOffset.getArray(); + // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! + pstartOffset[0] = pRegexMatcher->end( nIcuErr); + pendOffset[0] = pRegexMatcher->start( nIcuErr); + for( int i = 1; i <= nGroupCount; ++i) { + pstartOffset[i] = pRegexMatcher->end( i, nIcuErr); + pendOffset[i] = pRegexMatcher->start( i, nIcuErr); + } + + return aRet; +} + + +// search for words phonetically +SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ) +{ + SearchResult aRet; + aRet.subRegExpressions = 0; + + if( !xBreak.is() ) + return aRet; + + sal_Int32 nStt, nEnd; + + Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos, + aSrchPara.Locale, + WordType::ANYWORD_IGNOREWHITESPACES, true ); + + do + { + if( aWBnd.startPos >= endPos ) + break; + nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; + nEnd = std::min(aWBnd.endPos, endPos); + + if( nStt < nEnd && + pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit ) + { + aRet.subRegExpressions = 1; + aRet.startOffset = { nStt }; + aRet.endOffset = { nEnd }; + break; + } + + nStt = nEnd - 1; + aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale, + WordType::ANYWORD_IGNOREWHITESPACES); + } while( aWBnd.startPos != aWBnd.endPos || + (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) ); + // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only + // whitespace) in searchStr, getWordBoundary() returned startPos,startPos + // and nextWord() does also => don't loop forever. + return aRet; +} + +SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ) +{ + SearchResult aRet; + aRet.subRegExpressions = 0; + + if( !xBreak.is() ) + return aRet; + + sal_Int32 nStt, nEnd; + + Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos, + aSrchPara.Locale, + WordType::ANYWORD_IGNOREWHITESPACES, true ); + + do + { + if( aWBnd.endPos <= endPos ) + break; + nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; + nEnd = std::min(aWBnd.endPos, startPos); + + if( nStt < nEnd && + pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit ) + { + aRet.subRegExpressions = 1; + aRet.startOffset = { nEnd }; + aRet.endOffset = { nStt }; + break; + } + if( !nStt ) + break; + + aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale, + WordType::ANYWORD_IGNOREWHITESPACES); + } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() ); + return aRet; +} + + +namespace { +void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset ) +{ + rRes.subRegExpressions = 1; + rRes.startOffset = { nStartOffset }; + rRes.endOffset = { nEndOffset }; +} +} + +SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos ) +{ + SearchResult aRes; + aRes.subRegExpressions = 0; // no match + sal_Int32 nStartOffset = nStartPos; + sal_Int32 nEndOffset = nEndPos; + + const sal_Int32 nStringLen = searchStr.getLength(); + + // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty + // string match with [0,0). + if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos || + (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos))) + return aRes; + + const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2); + const sal_Int32 nPatternLen = rPattern.getLength(); + + // Handle special cases empty pattern and/or string outside of the loop to + // not add performance penalties there and simplify. + if (nStartPos == nEndPos) + { + sal_Int32 i = 0; + while (i < nPatternLen && rPattern[i] == '*') + ++i; + if (i == nPatternLen) + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; + } + + // Empty pattern does not match any non-empty string. + if (!nPatternLen) + return aRes; + + bool bRewind = false; + sal_uInt32 cPattern = 0; + sal_Int32 nPattern = 0; + sal_Int32 nAfterFakePattern = nPattern; + if (mbWildcardAllowSubstring) + { + // Fake a leading '*' wildcard. + cPattern = '*'; + bRewind = true; + // Assume a non-'*' pattern character follows. If it is a '*' instead + // that will be handled in the loop by setting nPat. + sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern); + if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen) + rPattern.iterateCodePoints( &nAfterFakePattern); + } + + sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1; + sal_uInt32 cPatternAfterAsterisk = 0; + bool bEscaped = false, bEscapedAfterAsterisk = false; + + // The loop code tries to avoid costly calls to iterateCodePoints() when + // possible. + + do + { + if (bRewind) + { + // Reuse cPattern after '*', nPattern was correspondingly + // incremented to point behind cPattern. + bRewind = false; + } + else if (nPattern < nPatternLen) + { + // nPattern will be incremented by iterateCodePoints(). + cPattern = rPattern.iterateCodePoints( &nPattern); + if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen) + { + bEscaped = true; + cPattern = rPattern.iterateCodePoints( &nPattern); + } + } + else + { + // A trailing '*' is handled below. + if (mbWildcardAllowSubstring) + { + // If the pattern is consumed and substring match allowed we're good. + setWildcardMatch( aRes, nStartOffset, nString); + return aRes; + } + else if (nString < nEndPos && nLastAsterisk >= 0) + { + // If substring match is not allowed try a greedy '*' match. + nPattern = nLastAsterisk; + continue; // do + } + else + return aRes; + } + + if (cPattern == '*' && !bEscaped) + { + // '*' is one code unit, so not using iterateCodePoints() is ok. + while (nPattern < nPatternLen && rPattern[nPattern] == '*') + ++nPattern; + + if (nPattern >= nPatternLen) + { + // Last pattern is '*', remaining string matches. + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; + } + + nLastAsterisk = nPattern; // Remember last encountered '*'. + + // cPattern will be the next non-'*' character, nPattern + // incremented. + cPattern = rPattern.iterateCodePoints( &nPattern); + if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen) + { + bEscaped = true; + cPattern = rPattern.iterateCodePoints( &nPattern); + } + + cPatternAfterAsterisk = cPattern; + bEscapedAfterAsterisk = bEscaped; + nPat = nPattern; // Remember position of pattern behind '*', already incremented. + nStr = nString; // Remember the current string to be matched. + } + + if (nString >= nEndPos) + // Whatever follows in pattern, string will not match. + return aRes; + + // nString will be incremented by iterateCodePoints(). + sal_uInt32 cString = searchStr.iterateCodePoints( &nString); + + if ((cPattern != '?' || bEscaped) && cPattern != cString) + { + if (nPat == -1) + // Non-match already without any '*' pattern. + return aRes; + + bRewind = true; + nPattern = nPat; // Rewind pattern to character behind '*', already incremented. + cPattern = cPatternAfterAsterisk; + bEscaped = bEscapedAfterAsterisk; + searchStr.iterateCodePoints( &nStr); + nString = nStr; // Restore incremented remembered string position. + if (nPat == nAfterFakePattern) + { + // Next start offset will be the next character. + nStartOffset = nString; + } + } + else + { + // An unescaped '?' pattern matched any character, or characters + // matched. Reset only escaped state. + bEscaped = false; + } + } + while (nString < nEndPos); + + if (bRewind) + return aRes; + + // Eat trailing '*' pattern that matches anything, including nothing. + // '*' is one code unit, so not using iterateCodePoints() is ok. + while (nPattern < nPatternLen && rPattern[nPattern] == '*') + ++nPattern; + + if (nPattern == nPatternLen) + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; +} + +SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos ) +{ + SearchResult aRes; + aRes.subRegExpressions = 0; // no match + + sal_Int32 nStartOffset = nStartPos; + sal_Int32 nEndOffset = nEndPos; + + const sal_Int32 nStringLen = searchStr.getLength(); + + // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty + // string match with (0,0]. + if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos || + (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos))) + return aRes; + + const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2); + sal_Int32 nPatternLen = rPattern.getLength(); + + // Handle special cases empty pattern and/or string outside of the loop to + // not add performance penalties there and simplify. + if (nStartPos == nEndPos) + { + sal_Int32 i = 0; + while (i < nPatternLen && rPattern[i] == '*') + ++i; + if (i == nPatternLen) + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; + } + + // Empty pattern does not match any non-empty string. + if (!nPatternLen) + return aRes; + + // Reverse escaped patterns to ease the handling of escapes, keeping escape + // and following character as one sequence in backward direction. + if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) || + (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty())) + { + OUStringBuffer aPatternBuf( rPattern); + sal_Int32 nIndex = 0; + while (nIndex < nPatternLen) + { + const sal_Int32 nOld = nIndex; + const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex); + if (cu == mcWildcardEscapeChar) + { + if (nIndex < nPatternLen) + { + if (nIndex - nOld == 1) + { + // Simply move code units, we already memorized the one + // in 'cu'. + const sal_Int32 nOld2 = nIndex; + rPattern.iterateCodePoints( &nIndex); + for (sal_Int32 i=0; i < nIndex - nOld2; ++i) + aPatternBuf[nOld+i] = rPattern[nOld2+i]; + aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu); + } + else + { + // Copy the escape character code units first in the + // unlikely case that it would not be of BMP. + assert(nIndex - nOld == 2); // it's UTF-16, so... + sal_Unicode buf[2]; + buf[0] = rPattern[nOld]; + buf[1] = rPattern[nOld+1]; + const sal_Int32 nOld2 = nIndex; + rPattern.iterateCodePoints( &nIndex); + for (sal_Int32 i=0; i < nIndex - nOld2; ++i) + aPatternBuf[nOld+i] = rPattern[nOld2+i]; + aPatternBuf[nIndex-2] = buf[0]; + aPatternBuf[nIndex-1] = buf[1]; + } + } + else + { + // Trailing escape would become leading escape, do what? + // Eliminate. + aPatternBuf.remove( nOld, nIndex - nOld); + } + } + } + if (bUsePrimarySrchStr) + maWildcardReversePattern = aPatternBuf.makeStringAndClear(); + else + maWildcardReversePattern2 = aPatternBuf.makeStringAndClear(); + } + const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2); + nPatternLen = rReversePattern.getLength(); + + bool bRewind = false; + sal_uInt32 cPattern = 0; + sal_Int32 nPattern = nPatternLen; + sal_Int32 nAfterFakePattern = nPattern; + if (mbWildcardAllowSubstring) + { + // Fake a trailing '*' wildcard. + cPattern = '*'; + bRewind = true; + // Assume a non-'*' pattern character follows. If it is a '*' instead + // that will be handled in the loop by setting nPat. + sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1); + if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0) + rReversePattern.iterateCodePoints( &nAfterFakePattern, -1); + } + + sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1; + sal_uInt32 cPatternAfterAsterisk = 0; + bool bEscaped = false, bEscapedAfterAsterisk = false; + + // The loop code tries to avoid costly calls to iterateCodePoints() when + // possible. + + do + { + if (bRewind) + { + // Reuse cPattern after '*', nPattern was correspondingly + // decremented to point before cPattern. + bRewind = false; + } + else if (nPattern > 0) + { + // nPattern will be decremented by iterateCodePoints(). + cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); + if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0) + { + bEscaped = true; + cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); + } + } + else + { + // A trailing '*' is handled below. + if (mbWildcardAllowSubstring) + { + // If the pattern is consumed and substring match allowed we're good. + setWildcardMatch( aRes, nStartOffset, nString); + return aRes; + } + else if (nString > nEndPos && nLastAsterisk >= 0) + { + // If substring match is not allowed try a greedy '*' match. + nPattern = nLastAsterisk; + continue; // do + } + else + return aRes; + } + + if (cPattern == '*' && !bEscaped) + { + // '*' is one code unit, so not using iterateCodePoints() is ok. + while (nPattern > 0 && rReversePattern[nPattern-1] == '*') + --nPattern; + + if (nPattern <= 0) + { + // First pattern is '*', remaining string matches. + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; + } + + nLastAsterisk = nPattern; // Remember last encountered '*'. + + // cPattern will be the previous non-'*' character, nPattern + // decremented. + cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); + if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0) + { + bEscaped = true; + cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); + } + + cPatternAfterAsterisk = cPattern; + bEscapedAfterAsterisk = bEscaped; + nPat = nPattern; // Remember position of pattern before '*', already decremented. + nStr = nString; // Remember the current string to be matched. + } + + if (nString <= nEndPos) + // Whatever leads in pattern, string will not match. + return aRes; + + // nString will be decremented by iterateCodePoints(). + sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1); + + if ((cPattern != '?' || bEscaped) && cPattern != cString) + { + if (nPat == -1) + // Non-match already without any '*' pattern. + return aRes; + + bRewind = true; + nPattern = nPat; // Rewind pattern to character before '*', already decremented. + cPattern = cPatternAfterAsterisk; + bEscaped = bEscapedAfterAsterisk; + searchStr.iterateCodePoints( &nStr, -1); + nString = nStr; // Restore decremented remembered string position. + if (nPat == nAfterFakePattern) + { + // Next start offset will be this character (exclusive). + nStartOffset = nString; + } + } + else + { + // An unescaped '?' pattern matched any character, or characters + // matched. Reset only escaped state. + bEscaped = false; + } + } + while (nString > nEndPos); + + if (bRewind) + return aRes; + + // Eat leading '*' pattern that matches anything, including nothing. + // '*' is one code unit, so not using iterateCodePoints() is ok. + while (nPattern > 0 && rReversePattern[nPattern-1] == '*') + --nPattern; + + if (nPattern == 0) + setWildcardMatch( aRes, nStartOffset, nEndOffset); + return aRes; +} + + +OUString SAL_CALL +TextSearch::getImplementationName() +{ + return "com.sun.star.util.TextSearch_i18n"; +} + +sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName) +{ + return cppu::supportsService(this, rServiceName); +} + +Sequence< OUString > SAL_CALL +TextSearch::getSupportedServiceNames() +{ + return { "com.sun.star.util.TextSearch", "com.sun.star.util.TextSearch2" }; +} + +extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface* +i18npool_TextSearch_get_implementation( + css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&) +{ + return cppu::acquire(new TextSearch(context)); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/i18npool/source/search/textsearch.hxx b/i18npool/source/search/textsearch.hxx new file mode 100644 index 0000000000..10c1d59e23 --- /dev/null +++ b/i18npool/source/search/textsearch.hxx @@ -0,0 +1,158 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#pragma once + +#include <cppuhelper/implbase.hxx> +#include <com/sun/star/util/XTextSearch2.hpp> +#include <com/sun/star/lang/XServiceInfo.hpp> + +#include <map> +#include <memory> +#include <mutex> + +#include <unicode/regex.h> +#include <unicode/unistr.h> +#include <unicode/uversion.h> + +namespace com::sun::star::i18n { class XBreakIterator; } +namespace com::sun::star::i18n { class XCharacterClassification; } +namespace com::sun::star::i18n { class XExtendedTransliteration; } +namespace com::sun::star::uno { class XComponentContext; } + + +class WLevDistance; +typedef ::std::map< sal_Unicode, sal_Int32 > TextSearchJumpTable; + +class TextSearch: public cppu::WeakImplHelper +< + css::util::XTextSearch2, + css::lang::XServiceInfo +> +{ + std::mutex m_aMutex; + css::uno::Reference < css::uno::XComponentContext > m_xContext; + + css::util::SearchOptions2 aSrchPara; + OUString sSrchStr; + OUString sSrchStr2; + + mutable css::uno::Reference< css::i18n::XCharacterClassification > xCharClass; + + css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit; + css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit2; + + // define a function pointer for the different search methods + typedef css::util::SearchResult + (SAL_CALL TextSearch::*FnSrch)( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + + FnSrch fnForward; + FnSrch fnBackward; + + // to fix UX regression, U+0027 matches also U+2019 in non-regex search + bool bSearchApostrophe; + + // Members and methods for the normal (Boyer-Moore) search + std::unique_ptr<TextSearchJumpTable> pJumpTable; + std::unique_ptr<TextSearchJumpTable> pJumpTable2; + bool bIsForwardTab; + bool bUsePrimarySrchStr; + void MakeForwardTab(); + void MakeForwardTab2(); + void MakeBackwardTab(); + void MakeBackwardTab2(); + sal_Int32 GetDiff( const sal_Unicode ) const; + /// @throws css::uno::RuntimeException + css::util::SearchResult SAL_CALL + NSrchFrwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + /// @throws css::uno::RuntimeException + css::util::SearchResult SAL_CALL + NSrchBkwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + + // Members and methods for the regular expression search + std::unique_ptr<icu::RegexMatcher> pRegexMatcher; + /// @throws css::uno::RuntimeException + css::util::SearchResult SAL_CALL + RESrchFrwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + /// @throws css::uno::RuntimeException + css::util::SearchResult SAL_CALL + RESrchBkwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + void RESrchPrepare( const css::util::SearchOptions2&); + + // Members and methods for the "Weight Levenshtein-Distance" search + int nLimit; + std::unique_ptr<WLevDistance> pWLD; + css::uno::Reference < css::i18n::XBreakIterator > xBreak; + /// @throws css::uno::RuntimeException + css::util::SearchResult SAL_CALL + ApproxSrchFrwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + /// @throws css::uno::RuntimeException + css::util::SearchResult SAL_CALL + ApproxSrchBkwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + + // Members and methods for the wildcard search + OUString maWildcardReversePattern; + OUString maWildcardReversePattern2; + sal_uInt32 mcWildcardEscapeChar; + bool mbWildcardAllowSubstring; + /// @throws css::uno::RuntimeException + css::util::SearchResult SAL_CALL + WildcardSrchFrwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + /// @throws css::uno::RuntimeException + css::util::SearchResult SAL_CALL + WildcardSrchBkwrd( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ); + + bool IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const; + +public: + explicit TextSearch( + const css::uno::Reference < css::uno::XComponentContext >& rxContext ); + + virtual ~TextSearch() override; + + // XTextSearch + virtual void SAL_CALL + setOptions( const css::util::SearchOptions& options ) override; + virtual css::util::SearchResult SAL_CALL + searchForward( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ) override; + virtual css::util::SearchResult SAL_CALL + searchBackward( const OUString& searchStr, + sal_Int32 startPos, sal_Int32 endPos ) override; + + // XTextSearch2 + virtual void SAL_CALL + setOptions2( const css::util::SearchOptions2& options ) override; + + //XServiceInfo + virtual OUString SAL_CALL getImplementationName() override; + virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName) override; + virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override; +}; + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |