Adding upstream version 4:7.4.7.upstream/4%7.4.7 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 09:06:44 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 09:06:44 +0000
commit: ed5640d8b587fbcfed7dd7967f3de04b37a76f26 (patch)
tree: 7a5f7c6c9d02226d7471cb3cc8fbbf631b415303 /i18npool/source/search
parent: Initial commit. (diff)
download: libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.tar.xz
libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.zip
5 files changed, 2342 insertions, 0 deletions
diff --git a/i18npool/source/search/i18nsearch.component b/i18npool/source/search/i18nsearch.component
new file mode 100644
index 000000000..fe7f48c26
--- /dev/null
+++ b/i18npool/source/search/i18nsearch.component
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ -->
+
+<component loader="com.sun.star.loader.SharedLibrary" environment="@CPPU_ENV@"
+    xmlns="http://openoffice.org/2010/uno-components">
+  <implementation name="com.sun.star.util.TextSearch_i18n"
+    constructor="i18npool_TextSearch_get_implementation">
+    <service name="com.sun.star.util.TextSearch"/>
+    <service name="com.sun.star.util.TextSearch2"/>
+  </implementation>
+</component>
diff --git a/i18npool/source/search/levdis.cxx b/i18npool/source/search/levdis.cxx
new file mode 100644
index 000000000..dd9f8fbf5
--- /dev/null
+++ b/i18npool/source/search/levdis.cxx
@@ -0,0 +1,373 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+/*
+
+    Weighted Levenshtein Distance
+    including wildcards
+    '*' for any number (0 or more) of arbitrary characters
+    '?' for exactly one arbitrary character
+    escapable with  backslash, "\*" or "\?"
+
+    Return:
+        WLD if WLD <= nLimit, else nLimit+1
+
+    or, if bSplitCount:
+        WLD if WLD <= nLimit
+        -WLD if Replace and Insert and Delete <= nLimit
+        else nLimit+1
+
+    Recursive definition of WLD:
+
+    WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
+                             WLD( X(i)  , Y(j-1) ) + q      ,
+                             WLD( X(i-1), Y(j)   ) + r      )
+
+    X(i)   := the first i characters of the word X
+    Y(j)   := the first j characters of the word Y
+    p(i,j) := 0 if i-th character of X == j-th character of Y,
+              p else
+
+    Boundary conditions:
+    WLD( X(0), Y(j) ) := j*q  (Y created by j inserts)
+    WLD( X(i), Y(0) ) := i*r  (Y created by i deletes)
+    WLD( X(0), Y(0) ) := 0
+
+    Instead of recursions a dynamic algorithm is used.
+
+    See also: German computer magazine
+    c't 07/89 pages 192-208 and c't 03/94 pages 230-239
+*/
+
+#include <algorithm>
+#include <numeric>
+
+#include "levdis.hxx"
+
+#define LEVDISBIG   (nLimit + 1)    // Return value if distance > nLimit
+#define LEVDISDOUBLEBUF 2048        // no doubling atop this border
+
+static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
+{
+    const sal_Unicode* pTempStr = pStr;
+    while( *pTempStr )
+        pTempStr++;
+    return static_cast<sal_Int32>(pTempStr-pStr);
+}
+
+// Distance from string to pattern
+int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
+{
+    int nSPMin = 0;     // penalty point Minimum
+    int nRepS = 0;      // for SplitCount
+
+    // length difference between pattern and string
+    int nLenDiff = nPatternLen - nStars - nStringLen;
+    // more insertions or deletions necessary as the limit? Then leave
+    if ( (nLenDiff * nInsQ0 > nLimit)
+            || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
+        return LEVDISBIG;
+
+     // comparative String greater than  instantaneous array
+    // -> adapt array size
+    if ( nStringLen >= nArrayLen )
+    {
+        // increase size much more to avoid reallocation
+        if ( nStringLen < LEVDISDOUBLEBUF )
+            nArrayLen = 2 * nStringLen;
+        else
+            nArrayLen = nStringLen + 1;
+        npDistance = aDisMem.NewMem( nArrayLen );
+    }
+
+    // Calculate start values of the second column (first pattern value).
+    // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
+    // therefore the minimum is 0
+    if ( nPatternLen == 0 )
+    {
+        // Count of deletions to reach pattern
+        for ( sal_Int32 i=0; i <= nStringLen; i++ )
+            npDistance[i] = i * nDelR0;
+    }
+    else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
+    {
+        // instead of a '*' you can fit in anything
+        for ( sal_Int32 i=0; i <= nStringLen; i++ )
+            npDistance[i] = 0;
+    }
+    else
+    {
+        sal_Unicode c;
+        int nP;
+        c = cpPattern[0];
+        if ( c == '?' && bpPatIsWild[0] )
+            nP = 0;     // a '?' could be any character.
+        else
+            // Minimum of replacement and deletion+insertion weighting
+            nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 });
+        npDistance[0] = nInsQ0;     // start with simple insert
+        npDistance[1] = nInsQ0;
+        npDistance[2] = nInsQ0;
+        int nReplacePos = -1;       // tristate flag
+        int nDelCnt = 0;
+        for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
+        {
+            if ( cString[i-1] == c )
+                nP = 0;     // Replace from this position is 0
+            // Deletions to match pattern + Replace
+            npDistance[i] = nDelCnt + nP;
+            if ( bSplitCount )
+            {
+                if ( nReplacePos < 0 && nP )
+                {   // this position will be replaced
+                    nRepS++;
+                    nReplacePos = i;
+                }
+                else if ( nReplacePos > 0 && !nP )
+                {
+                    // same count of c
+                    int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
+                    if ( !nBalance )
+                    {   // one was replaced that was an insertion instead
+                        nRepS--;
+                        nReplacePos = 0;
+                    }
+                }
+            }
+        }
+        nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] });
+    }
+
+    // calculate distance matrix
+    sal_Int32 j = 0;        // for all columns of the pattern, till limit is not reached
+    while ( (j < nPatternLen-1)
+            && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
+    {
+        sal_Unicode c;
+        int nP, nQ, nR, nPij, d2;
+
+        j++;
+        c = cpPattern[j];
+        if ( bpPatIsWild[j] )   // '*' or '?' not escaped
+            nP = 0;     // could be replaced without penalty
+        else
+            nP = nRepP0;
+        if ( c == '*' && bpPatIsWild[j] )
+        {
+            nQ = 0;     // insertion and deletion without penalty
+            nR = 0;
+        }
+        else
+        {
+            nQ = nInsQ0;    // usual weighting
+            nR = nDelR0;
+        }
+        d2 = npDistance[0];
+        // increase insert count to get from null string to pattern
+        npDistance[0] = npDistance[0] + nQ;
+        nSPMin = npDistance[0];
+        int nReplacePos = -1;       // tristate flag
+        // for each pattern column run through the string
+        for ( sal_Int32 i=1; i <= nStringLen; i++ )
+        {
+            int d1 = d2;            // WLD( X(i-1), Y(j-1) )
+            d2 = npDistance[i];     // WLD( X(i)  , Y(j-1) )
+            if ( cString[i-1] == c )
+            {
+                nPij = 0;           // p(i,j)
+                if ( nReplacePos < 0 )
+                {
+                    // same count of c
+                    int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
+                    if ( !nBalance )
+                        nReplacePos = 0;    // no replacement
+                }
+            }
+            else
+                nPij = nP;
+            // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
+            //                          WLD( X(i)  , Y(j-1) ) + q      ,
+            //                          WLD( X(i-1), Y(j)   ) + r      )
+            npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR });
+            if ( npDistance[i] < nSPMin )
+                nSPMin = npDistance[i];
+            if ( bSplitCount )
+            {
+                if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
+                {   // this position will be replaced
+                    nRepS++;
+                    nReplacePos = i;
+                }
+                else if ( nReplacePos > 0 && !nPij )
+                {
+                    // character is equal in string and pattern
+                    //
+                    // If from this point:
+                    // * pattern and string have the same count of this
+                    //   character
+                    // * and character count is the same before this position
+                    // then the replace was none.
+                    //
+                    // Scrambled letters are recognized here and the nRepS
+                    // replacement is withdrawn, whereby the double limit kicks
+                    // in.
+
+                    // Same count of c
+                    int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
+                    if ( !nBalance )
+                    {   // one was replaced that was an insertion instead
+                        nRepS--;
+                        nReplacePos = 0;
+                    }
+                }
+            }
+        }
+    }
+    if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
+        return npDistance[nStringLen];
+    else
+    {
+        if ( bSplitCount )
+        {
+            if ( nRepS && nLenDiff > 0 )
+                nRepS -= nLenDiff;      // Inserts were counted
+            if ( (nSPMin <= 2 * nLimit)
+                    && (npDistance[nStringLen] <= 2 * nLimit)
+                    && (nRepS * nRepP0 <= nLimit) )
+                return -npDistance[nStringLen];
+            return LEVDISBIG;
+        }
+        return LEVDISBIG;
+    }
+}
+
+// Calculating      nLimit,   nReplP0,    nInsQ0,     nDelR0,     bSplitCount
+// from user values           nOtherX,    nShorterY,  nLongerZ,   bRelaxed
+void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
+{
+    if ( nX < 0 ) nX = 0;       // only positive values
+    if ( nY < 0 ) nY = 0;
+    if ( nZ < 0 ) nZ = 0;
+    if (0 == std::min({ nX, nY, nZ })) // at least one 0
+    {
+        int nMid, nMax;
+        nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max
+        if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
+            nLimit = nMax;                      // either 0 or the only one >0
+        else                                    // one is 0
+            nLimit = std::lcm( nMid, nMax );
+    }
+    else                                        // all three of them are not 0
+        nLimit = std::lcm(std::lcm(nX, nY), nZ);
+    nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
+    nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
+    nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
+    bSplitCount = bRelaxed;
+}
+
+// The value in the middle
+int WLevDistance::Mid3( int x, int y, int z )
+{
+    int min = std::min({ x, y, z });
+    if ( x == min )
+        return std::min(y, z);
+    else if ( y == min )
+        return std::min(x, z);
+    else        // z == min
+        return std::min(x, y);
+}
+
+// initialize data from CTOR
+void WLevDistance::InitData( const sal_Unicode* cPattern )
+{
+    cpPattern = aPatMem.GetcPtr();
+    bpPatIsWild = aPatMem.GetbPtr();
+    npDistance = aDisMem.GetPtr();
+    nStars = 0;
+    const sal_Unicode* cp1 = cPattern;
+    sal_Unicode* cp2 = cpPattern;
+    bool* bp = bpPatIsWild;
+    // copy pattern, count asterisks, escaped Jokers
+    while ( *cp1 )
+    {
+        if ( *cp1 == '\\' )     // maybe escaped
+        {
+            if ( *(cp1+1) == '*' || *(cp1+1) == '?' )   // next Joker?
+            {
+                cp1++;          // skip '\\'
+                nPatternLen--;
+            }
+            *bp++ = false;
+        }
+        else if ( *cp1 == '*' || *cp1 == '?' )      // Joker
+        {
+            if ( *cp1 == '*' )
+                nStars++;
+            *bp++ = true;
+        }
+        else
+            *bp++ = false;
+        *cp2++ = *cp1++;
+    }
+    *cp2 = '\0';
+}
+
+WLevDistance::WLevDistance( const sal_Unicode* cPattern,
+                            int nOtherX, int nShorterY, int nLongerZ,
+                            bool bRelaxed ) :
+    nPatternLen( Impl_WLD_StringLen(cPattern) ),
+    aPatMem( nPatternLen + 1 ),
+    nArrayLen( nPatternLen + 1 ),
+    aDisMem( nArrayLen )
+{
+    InitData( cPattern );
+    CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
+}
+
+// CopyCTor
+WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
+    nPatternLen( rWLD.nPatternLen ),
+    aPatMem( nPatternLen + 1 ),
+    nArrayLen( nPatternLen + 1 ),
+    aDisMem( nArrayLen ),
+    nLimit( rWLD.nLimit ),
+    nRepP0( rWLD.nRepP0 ),
+    nInsQ0( rWLD.nInsQ0 ),
+    nDelR0( rWLD.nDelR0 ),
+    nStars( rWLD.nStars ),
+    bSplitCount( rWLD.bSplitCount )
+{
+    cpPattern = aPatMem.GetcPtr();
+    bpPatIsWild = aPatMem.GetbPtr();
+    npDistance = aDisMem.GetPtr();
+    sal_Int32 i;
+    for ( i=0; i<nPatternLen; i++ )
+    {
+        cpPattern[i] = rWLD.cpPattern[i];
+        bpPatIsWild[i] = rWLD.bpPatIsWild[i];
+    }
+    cpPattern[i] = '\0';
+}
+
+// DTor
+WLevDistance::~WLevDistance()
+{
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/i18npool/source/search/levdis.hxx b/i18npool/source/search/levdis.hxx
new file mode 100644
index 000000000..c58fba842
--- /dev/null
+++ b/i18npool/source/search/levdis.hxx
@@ -0,0 +1,214 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#ifndef INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
+#define INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
+
+#include <sal/types.h>
+#include <memory>
+
+// Sensible default values for a user interface could be:
+//  LEVDISDEFAULT_XOTHER    2
+//      Maximum X replacements to match query, found data may be different by X
+//      characters from query.
+//  LEVDISDEFAULT_YSHORTER  1
+//      Maximum Y insertions to match query, found data may be Y characters
+//      shorter than query.
+//  LEVDISDEFAULT_ZLONGER   3
+//      Maximum Z deletions to match query, found data may be Z characters
+//      longer than query.
+//  LEVDISDEFAULT_RELAXED   TRUE
+//      Use relaxed SplitCount instead of mathematical WLD.
+//
+// Joker/wildcards ('?' and '*') of course do not count as
+// replacement/insertion/deletion. At a '?' a replacement is not counted, for a
+// '*' the found data may be any number of characters longer than the query.
+//
+// Strict mathematical WLD: EITHER maximum X replacements OR Y characters
+// shorter OR Z characters longer.
+// Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
+// Z characters longer. Any combination of actions is valid.
+//
+// The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
+// integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
+//
+// The corresponding internal default weigh values for these user interface
+// values would be:
+//  LEVDISDEFAULTLIMIT  6
+//      Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
+//  LEVDISDEFAULT_P0    3
+//      Default nRepP0, weight of replacements.
+//  LEVDISDEFAULT_Q0    6
+//      Default nInsQ0, weight of insertions.
+//  LEVDISDEFAULT_R0    2
+//      Default nDelR0, weight of deletions.
+
+// The transformation of user input values to weighs is done using CalcLPQR().
+// One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
+// characters longer than pattern) then no character can be replaced any more.
+// This can be circumvented by increasing nX or/and nZ, but of course with the
+// side effect of being less strict then... or the other solution is to use
+// relaxed SplitCount (see below), which is the default when using CalcLPQR().
+//
+// Attention: shorter = WLD.Insert, longer = WLD.Delete
+//
+// View and counting is always from data string to pattern, a deletion means
+// that something is deleted from data to match pattern.
+//
+// Deletions weigh less in this example because usually less is known than is
+// searched for. Replacements get middle weight, for example because of
+// misspellings. Insertions are expensive.
+//
+// Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
+// Allowed are maximum 4 replacements, no insertion, no deletion.
+// Matches the user interface values X = 3, Y = 0, Z = 0
+//
+// bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
+// of WLD() then isn't necessarily the Levenshtein-Distance, but can be
+// negative (-WLD) if the WLD is greater than nLimit but single values are
+// within the limits.
+// For the above default values that could mean: even if the found string is
+// already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
+// to reach pattern. Additionally, character swaps count as one replacement.
+// Mathematically completely incorrect, but meets user expectations ;-)
+//
+// Explanation: in the real WLD all actions are withdrawn from a common 100%
+// pool, if one gets all there's nothing left for others. With bSplitCount
+// replacements have their own pool.
+
+
+/** "Safe" memory allocation in ctor */
+class WLevDisPatternMem
+{
+    std::unique_ptr<sal_Unicode[]> cp;
+    std::unique_ptr<bool[]>        bp;
+public:
+    explicit WLevDisPatternMem( sal_Int32 s )
+        : cp(new sal_Unicode[s])
+        , bp(new bool[s])
+    {
+    }
+
+    sal_Unicode* GetcPtr() const        { return cp.get(); }
+    bool* GetbPtr() const               { return bp.get(); }
+};
+
+class WLevDisDistanceMem
+{
+    std::unique_ptr<int[]> p;
+public:
+    explicit WLevDisDistanceMem( size_t s )
+    {
+        NewMem(s);
+    }
+    int* GetPtr() const             { return p.get(); }
+    int* NewMem( size_t s )
+    {
+        p.reset(new int[ s<3 ? 3 : s ]);
+        return p.get();
+    }
+};
+
+/** Weighted Levenshtein Distance (WLD)
+
+    For a more detailed explanation see documentation in
+    i18npool/source/search/levdis.hxx
+ */
+class WLevDistance
+{
+    sal_Int32       nPatternLen;    ///< length of pattern
+    WLevDisPatternMem   aPatMem;    ///< manage allocation of pattern array
+    sal_Unicode*    cpPattern;      ///< pointer to pattern array
+    bool*           bpPatIsWild;    ///< pointer to bool array whether pattern is wildcard
+    sal_Int32       nArrayLen;      ///< length of distance array
+    WLevDisDistanceMem  aDisMem;    ///< manage allocation of distance array
+    int*            npDistance;     ///< pointer to distance array
+    int             nLimit;         ///< WLD limit replacements/insertions/deletions
+    int             nRepP0;         ///< replacement weigh
+    int             nInsQ0;         ///< insertion weigh
+    int             nDelR0;         ///< deletion weigh
+    int             nStars;         ///< count of '*' wildcards in pattern
+    bool            bSplitCount;    ///< if TRUE, Rep/Ins/Del are counted separately
+
+    void InitData( const sal_Unicode* cPattern );
+    static int Mid3( int x, int y, int z );        ///< middle value of 3 values
+
+public:
+
+    /** CTor with user input. Internally calls CalcLPQR().
+
+        After this, obtain the resulting limit using GetLimit().
+
+        @param  bRelaxed    the mathematically incorrect method is default (TRUE)
+     */
+    WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY,
+                    int nLongerZ, bool bRelaxed );
+
+    WLevDistance( const WLevDistance& rWLD );
+    ~WLevDistance();
+
+    /** Calculate the Weighted Levenshtein Distance from string to pattern. */
+    int WLD( const sal_Unicode* cString, sal_Int32 nStringLen );
+
+    /** Calculate the internal weighs corresponding to the user input values.
+        @returns nLimit for later comparison with WLD()
+     */
+    void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ,
+                    bool bRelaxed );
+
+    int GetLimit() const     { return nLimit; }
+
+    // Calculate current balance, keep this inline for performance reasons!
+    // c == cpPattern[jj] == cString[ii]
+    // First seek up to found place, if the balance is still equal there then
+    // also compare after the found place.
+    int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const
+    {
+        int nBalance = 0;
+
+        if ( jj != ii )
+        {
+            sal_Int32 k;
+            if ( jj > 0 )
+                for ( k=0; k < jj; k++ )
+                    if ( cpPattern[k] == c )
+                        nBalance++;
+            if ( ii > 0 )
+                for ( k=0; k < ii; k++ )
+                    if ( cString[k] == c )
+                        nBalance--;
+            if ( !nBalance )
+            {
+                for ( k=jj+1; k < nPatternLen; k++ )
+                    if ( cpPattern[k] == c )
+                        nBalance++;
+                for ( k=ii+1; k < nStringLen; k++ )
+                    if ( cString[k] == c )
+                        nBalance--;
+            }
+        }
+
+        return nBalance;
+    }
+};
+
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx
new file mode 100644
index 000000000..a16c3e1cc
--- /dev/null
+++ b/i18npool/source/search/textsearch.cxx
@@ -0,0 +1,1567 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#include "textsearch.hxx"
+#include "levdis.hxx"
+#include <com/sun/star/i18n/BreakIterator.hpp>
+#include <com/sun/star/util/SearchAlgorithms2.hpp>
+#include <com/sun/star/util/SearchFlags.hpp>
+#include <com/sun/star/i18n/WordType.hpp>
+#include <com/sun/star/i18n/ScriptType.hpp>
+#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
+#include <com/sun/star/i18n/CharacterClassification.hpp>
+#include <com/sun/star/i18n/KCharacterType.hpp>
+#include <com/sun/star/i18n/Transliteration.hpp>
+#include <cppuhelper/supportsservice.hxx>
+#include <cppuhelper/weak.hxx>
+#include <i18nutil/transliteration.hxx>
+#include <rtl/ustrbuf.hxx>
+#include <sal/log.hxx>
+
+#include <unicode/regex.h>
+
+using namespace ::com::sun::star::util;
+using namespace ::com::sun::star::uno;
+using namespace ::com::sun::star::lang;
+using namespace ::com::sun::star::i18n;
+using namespace ::com::sun::star;
+
+const TransliterationFlags COMPLEX_TRANS_MASK =
+    TransliterationFlags::ignoreBaFa_ja_JP |
+    TransliterationFlags::ignoreIterationMark_ja_JP |
+    TransliterationFlags::ignoreTiJi_ja_JP |
+    TransliterationFlags::ignoreHyuByu_ja_JP |
+    TransliterationFlags::ignoreSeZe_ja_JP |
+    TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
+    TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
+    TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
+
+namespace
+{
+TransliterationFlags maskComplexTrans( TransliterationFlags n )
+{
+    // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
+    // in complex transliteration.
+    return
+        n & (COMPLEX_TRANS_MASK |                       // all set ignore bits
+        TransliterationFlags::IGNORE_KANA |            // plus IGNORE_KANA bit
+        TransliterationFlags::FULLWIDTH_HALFWIDTH);    // and the FULLWIDTH_HALFWIDTH value
+}
+
+bool isComplexTrans( TransliterationFlags n )
+{
+    return bool(n & COMPLEX_TRANS_MASK);
+}
+
+TransliterationFlags maskSimpleTrans( TransliterationFlags n )
+{
+    return n & ~COMPLEX_TRANS_MASK;
+}
+
+bool isSimpleTrans( TransliterationFlags n )
+{
+    return bool(maskSimpleTrans(n));
+}
+
+// Regex patterns are case sensitive.
+TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
+{
+    TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
+    TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
+    if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
+        v = TransliterationFlags::NONE;
+    return (m | v) & ~COMPLEX_TRANS_MASK;
+}
+
+bool isSimpleRegexTrans( TransliterationFlags n )
+{
+    return bool(maskSimpleRegexTrans(n));
+}
+};
+
+TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
+        : m_xContext( rxContext )
+{
+    SearchOptions2 aOpt;
+    aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
+    aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
+    aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
+    //aOpt.Locale = ???;
+    setOptions( aOpt );
+}
+
+TextSearch::~TextSearch()
+{
+    pRegexMatcher.reset();
+    pWLD.reset();
+    pJumpTable.reset();
+    pJumpTable2.reset();
+}
+
+void TextSearch::setOptions2( const SearchOptions2& rOptions )
+{
+    std::unique_lock g(m_aMutex);
+
+    aSrchPara = rOptions;
+
+    pRegexMatcher.reset();
+    pWLD.reset();
+    pJumpTable.reset();
+    pJumpTable2.reset();
+    maWildcardReversePattern.clear();
+    maWildcardReversePattern2.clear();
+    TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
+    bSearchApostrophe = false;
+    bool bReplaceApostrophe = false;
+    if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
+    {
+        // RESrchPrepare will consider aSrchPara.transliterateFlags when
+        // picking the actual regex pattern
+        // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
+        // case-insensitivity. Create transliteration instance, if any, without
+        // ignore-case so later in TextSearch::searchForward() the string to
+        // match is not case-altered, leave case-(in)sensitive to regex engine.
+        transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
+    }
+    else if ( aSrchPara.searchString.indexOf('\'') > - 1 )
+    {
+        bSearchApostrophe = true;
+        bReplaceApostrophe = aSrchPara.searchString.indexOf(u'\u2019') > -1;
+    }
+
+    // Create Transliteration class
+    if( isSimpleTrans( transliterateFlags) )
+    {
+        if( !xTranslit.is() )
+            xTranslit.set( Transliteration::create( m_xContext ) );
+        xTranslit->loadModule(
+             static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
+             aSrchPara.Locale);
+    }
+    else if( xTranslit.is() )
+        xTranslit = nullptr;
+
+    // Create Transliteration for 2<->1, 2<->2 transliteration
+    if ( isComplexTrans( transliterateFlags) )
+    {
+        if( !xTranslit2.is() )
+            xTranslit2.set( Transliteration::create( m_xContext ) );
+        // Load transliteration module
+        xTranslit2->loadModule(
+             static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
+             aSrchPara.Locale);
+    }
+
+    if ( !xBreak.is() )
+        xBreak = css::i18n::BreakIterator::create( m_xContext );
+
+    sSrchStr = aSrchPara.searchString;
+
+    // Transliterate search string.
+    if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
+    {
+        if (isSimpleRegexTrans(transliterateFlags))
+        {
+            if (maskSimpleRegexTrans(transliterateFlags) !=
+                    maskSimpleTrans(transliterateFlags))
+            {
+                css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
+                         Transliteration::create( m_xContext ));
+                if (xTranslitPattern.is())
+                {
+                    xTranslitPattern->loadModule(
+                            static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
+                            aSrchPara.Locale);
+                    sSrchStr = xTranslitPattern->transliterateString2String(
+                            aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
+                }
+            }
+            else
+            {
+                if (xTranslit.is())
+                    sSrchStr = xTranslit->transliterateString2String(
+                            aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
+            }
+            // xTranslit2 complex transliterated sSrchStr2 is not used in
+            // regex, see TextSearch::searchForward() and
+            // TextSearch::searchBackward()
+        }
+    }
+    else
+    {
+        if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
+            sSrchStr = xTranslit->transliterateString2String(
+                    aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
+
+        if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
+            sSrchStr2 = xTranslit2->transliterateString2String(
+                    aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
+    }
+
+    if ( bReplaceApostrophe )
+        sSrchStr = sSrchStr.replace(u'\u2019', '\'');
+
+    // Take the new SearchOptions2::AlgorithmType2 field and ignore
+    // SearchOptions::algorithmType
+    switch( aSrchPara.AlgorithmType2)
+    {
+        case SearchAlgorithms2::REGEXP:
+            fnForward = &TextSearch::RESrchFrwrd;
+            fnBackward = &TextSearch::RESrchBkwrd;
+            RESrchPrepare( aSrchPara);
+            break;
+
+        case SearchAlgorithms2::APPROXIMATE:
+            fnForward = &TextSearch::ApproxSrchFrwrd;
+            fnBackward = &TextSearch::ApproxSrchBkwrd;
+
+            pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
+                    aSrchPara.insertedChars, aSrchPara.deletedChars,
+                    0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
+
+            nLimit = pWLD->GetLimit();
+            break;
+
+        case SearchAlgorithms2::WILDCARD:
+            mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
+            mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
+            fnForward = &TextSearch::WildcardSrchFrwrd;
+            fnBackward = &TextSearch::WildcardSrchBkwrd;
+            break;
+
+        default:
+            SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
+            [[fallthrough]];
+        case SearchAlgorithms2::ABSOLUTE:
+            fnForward = &TextSearch::NSrchFrwrd;
+            fnBackward = &TextSearch::NSrchBkwrd;
+            break;
+    }
+}
+
+void TextSearch::setOptions( const SearchOptions& rOptions )
+{
+    sal_Int16 nAlgorithmType2;
+    switch (rOptions.algorithmType)
+    {
+        case SearchAlgorithms_REGEXP:
+            nAlgorithmType2 = SearchAlgorithms2::REGEXP;
+            break;
+        case SearchAlgorithms_APPROXIMATE:
+            nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
+            break;
+        default:
+            SAL_WARN("i18npool","TextSearch::setOptions - default what?");
+            [[fallthrough]];
+        case SearchAlgorithms_ABSOLUTE:
+            nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
+            break;
+    }
+    // It would be nice if an inherited struct had a ctor that takes an
+    // instance of the object the struct derived from...
+    SearchOptions2 aOptions2(
+            rOptions.algorithmType,
+            rOptions.searchFlag,
+            rOptions.searchString,
+            rOptions.replaceString,
+            rOptions.Locale,
+            rOptions.changedChars,
+            rOptions.deletedChars,
+            rOptions.insertedChars,
+            rOptions.transliterateFlags,
+            nAlgorithmType2,
+            0   // no wildcard search, no escape character...
+            );
+    setOptions2( aOptions2);
+}
+
+static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
+{
+    auto pOff = std::find_if(rOff.begin(), rOff.end(),
+        [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
+    return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
+}
+
+SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
+{
+    std::unique_lock g(m_aMutex);
+
+    SearchResult sres;
+
+    OUString in_str(searchStr);
+
+    // in non-regex mode, allow searching typographical apostrophe with the ASCII one
+    // to avoid regression after using automatic conversion to U+2019 during typing in Writer
+    bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
+
+    bUsePrimarySrchStr = true;
+
+    if ( xTranslit.is() )
+    {
+        // apply normal transliteration (1<->1, 1<->0)
+
+        sal_Int32 nInStartPos = startPos;
+        if (pRegexMatcher && startPos > 0)
+        {
+            // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
+            // transliterate enough of the leading text to allow sensible look-behind assertions.
+            // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
+            // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
+            // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
+            // the middle of a surrogate pair, preceded by another surrogate pair.
+            const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
+            nInStartPos -= std::min(nMaxLeadingLen, startPos);
+        }
+        sal_Int32 nInEndPos = endPos;
+        if (pRegexMatcher && endPos < searchStr.getLength())
+        {
+            // tdf#65038: ditto for look-ahead assertions
+            const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
+            nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
+        }
+
+        css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
+        in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);
+
+        if ( bReplaceApostrophe )
+            in_str = in_str.replace(u'\u2019', '\'');
+
+        // JP 20.6.2001: also the start and end positions must be corrected!
+        sal_Int32 newStartPos =
+            (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
+
+        sal_Int32 newEndPos = (endPos < searchStr.getLength())
+            ? FindPosInSeq_Impl( offset, endPos )
+            : in_str.getLength();
+
+        sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
+
+        // Map offsets back to untransliterated string.
+        const sal_Int32 nOffsets = offset.getLength();
+        if (nOffsets)
+        {
+            auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
+            auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
+            // For regex nGroups is the number of groups+1 with group 0 being
+            // the entire match.
+            const sal_Int32 nGroups = sres.startOffset.getLength();
+            for ( sal_Int32 k = 0; k < nGroups; k++ )
+            {
+                const sal_Int32 nStart = sres.startOffset[k];
+                // Result offsets are negative (-1) if a group expression was
+                // not matched.
+                if (nStart >= 0)
+                    sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
+                // JP 20.6.2001: end is ever exclusive and then don't return
+                //               the position of the next character - return the
+                //               next position behind the last found character!
+                //               "a b c" find "b" must return 2,3 and not 2,4!!!
+                const sal_Int32 nStop = sres.endOffset[k];
+                if (nStop >= 0)
+                {
+                    if (nStop > 0)
+                        sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
+                    else
+                        sres_endOffsetRange[k] = offset[0];
+                }
+            }
+        }
+    }
+    else
+    {
+        if ( bReplaceApostrophe )
+            in_str = in_str.replace(u'\u2019', '\'');
+
+        sres = (this->*fnForward)( in_str, startPos, endPos );
+    }
+
+    if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
+    {
+        SearchResult sres2;
+
+        in_str = searchStr;
+        css::uno::Sequence <sal_Int32> offset( in_str.getLength());
+
+        in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
+
+        if( startPos )
+            startPos = FindPosInSeq_Impl( offset, startPos );
+
+        if( endPos < searchStr.getLength() )
+            endPos = FindPosInSeq_Impl( offset, endPos );
+        else
+            endPos = in_str.getLength();
+
+        bUsePrimarySrchStr = false;
+        sres2 = (this->*fnForward)( in_str, startPos, endPos );
+        auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
+        auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
+
+        for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
+        {
+            if (sres2.startOffset[k])
+                sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;
+            if (sres2.endOffset[k])
+                sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
+        }
+
+        // pick first and long one
+        if ( sres.subRegExpressions == 0)
+            return sres2;
+        if ( sres2.subRegExpressions == 1)
+        {
+            if ( sres.startOffset[0] > sres2.startOffset[0])
+                return sres2;
+            else if ( sres.startOffset[0] == sres2.startOffset[0] &&
+                    sres.endOffset[0] < sres2.endOffset[0])
+                return sres2;
+        }
+    }
+
+    return sres;
+}
+
+SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
+{
+    std::unique_lock g(m_aMutex);
+
+    SearchResult sres;
+
+    OUString in_str(searchStr);
+
+    // in non-regex mode, allow searching typographical apostrophe with the ASCII one
+    // to avoid regression after using automatic conversion to U+2019 during typing in Writer
+    bool bReplaceApostrophe = bSearchApostrophe && in_str.indexOf(u'\u2019') > -1;
+
+    bUsePrimarySrchStr = true;
+
+    if ( xTranslit.is() )
+    {
+        // apply only simple 1<->1 transliteration here
+        css::uno::Sequence<sal_Int32> offset(startPos - endPos);
+        in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
+
+        if ( bReplaceApostrophe )
+            in_str = in_str.replace(u'\u2019', '\'');
+
+        // JP 20.6.2001: also the start and end positions must be corrected!
+        sal_Int32 const newStartPos = (startPos < searchStr.getLength())
+            ? FindPosInSeq_Impl( offset, startPos )
+            : in_str.getLength();
+
+        sal_Int32 const newEndPos =
+            (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
+
+        // TODO: this would need nExtraOffset handling to avoid $ matching
+        // if (pRegexMatcher && startPos < searchStr.getLength())
+        // but that appears to be impossible with ICU regex
+
+        sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
+
+        // Map offsets back to untransliterated string.
+        const sal_Int32 nOffsets = offset.getLength();
+        if (nOffsets)
+        {
+            auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
+            auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
+            // For regex nGroups is the number of groups+1 with group 0 being
+            // the entire match.
+            const sal_Int32 nGroups = sres.startOffset.getLength();
+            for ( sal_Int32 k = 0; k < nGroups; k++ )
+            {
+                const sal_Int32 nStart = sres.startOffset[k];
+                // Result offsets are negative (-1) if a group expression was
+                // not matched.
+                if (nStart >= 0)
+                {
+                    if (nStart > 0)
+                        sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
+                    else
+                        sres_startOffsetRange[k] = offset[0];
+                }
+                // JP 20.6.2001: end is ever exclusive and then don't return
+                //               the position of the next character - return the
+                //               next position behind the last found character!
+                //               "a b c" find "b" must return 2,3 and not 2,4!!!
+                const sal_Int32 nStop = sres.endOffset[k];
+                if (nStop >= 0)
+                    sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
+            }
+        }
+    }
+    else
+    {
+        if ( bReplaceApostrophe )
+            in_str = in_str.replace(u'\u2019', '\'');
+
+        sres = (this->*fnBackward)( in_str, startPos, endPos );
+    }
+
+    if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
+    {
+        SearchResult sres2;
+
+        in_str = searchStr;
+        css::uno::Sequence <sal_Int32> offset( in_str.getLength());
+
+        in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
+
+        if( startPos < searchStr.getLength() )
+            startPos = FindPosInSeq_Impl( offset, startPos );
+        else
+            startPos = in_str.getLength();
+
+        if( endPos )
+            endPos = FindPosInSeq_Impl( offset, endPos );
+
+        bUsePrimarySrchStr = false;
+        sres2 = (this->*fnBackward)( in_str, startPos, endPos );
+        auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
+        auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
+
+        for( int k = 0; k < sres2.startOffset.getLength(); k++ )
+        {
+            if (sres2.startOffset[k])
+                sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;
+            if (sres2.endOffset[k])
+                sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
+        }
+
+        // pick last and long one
+        if ( sres.subRegExpressions == 0 )
+            return sres2;
+        if ( sres2.subRegExpressions == 1 )
+        {
+            if ( sres.startOffset[0] < sres2.startOffset[0] )
+                return sres2;
+            if ( sres.startOffset[0] == sres2.startOffset[0] &&
+                    sres.endOffset[0] > sres2.endOffset[0] )
+                return sres2;
+        }
+    }
+
+    return sres;
+}
+
+
+bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
+{
+    bool bRet = true;
+    if( '\x7f' != rStr[nPos])
+    {
+        if ( !xCharClass.is() )
+             xCharClass = CharacterClassification::create( m_xContext );
+        sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
+                aSrchPara.Locale );
+        if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
+                        KCharacterType::LETTER ) & nCType ) )
+            bRet = false;
+    }
+    return bRet;
+}
+
+// --------- helper methods for Boyer-Moore like text searching ----------
+// TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
+
+void TextSearch::MakeForwardTab()
+{
+    // create the jumptable for the search text
+
+    if( pJumpTable && bIsForwardTab )
+    {
+        return; // the jumpTable is ok
+    }
+    bIsForwardTab = true;
+
+    sal_Int32 n, nLen = sSrchStr.getLength();
+    pJumpTable.reset( new TextSearchJumpTable );
+
+    for( n = 0; n < nLen - 1; ++n )
+    {
+        sal_Unicode cCh = sSrchStr[n];
+        sal_Int32 nDiff = nLen - n - 1;
+        TextSearchJumpTable::value_type aEntry( cCh, nDiff );
+
+        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
+            pJumpTable->insert( aEntry );
+        if ( !aPair.second )
+            (*(aPair.first)).second = nDiff;
+    }
+}
+
+void TextSearch::MakeForwardTab2()
+{
+    // create the jumptable for the search text
+    if( pJumpTable2 && bIsForwardTab )
+    {
+        return;        // the jumpTable is ok
+    }
+    bIsForwardTab = true;
+
+    sal_Int32 n, nLen = sSrchStr2.getLength();
+    pJumpTable2.reset( new TextSearchJumpTable );
+
+    for( n = 0; n < nLen - 1; ++n )
+    {
+        sal_Unicode cCh = sSrchStr2[n];
+        sal_Int32 nDiff = nLen - n - 1;
+
+        TextSearchJumpTable::value_type aEntry( cCh, nDiff );
+        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
+            pJumpTable2->insert( aEntry );
+        if ( !aPair.second )
+            (*(aPair.first)).second = nDiff;
+    }
+}
+
+void TextSearch::MakeBackwardTab()
+{
+    // create the jumptable for the search text
+    if( pJumpTable && !bIsForwardTab)
+    {
+        return;   // the jumpTable is ok
+    }
+    bIsForwardTab = false;
+
+    sal_Int32 n, nLen = sSrchStr.getLength();
+    pJumpTable.reset( new TextSearchJumpTable );
+
+    for( n = nLen-1; n > 0; --n )
+    {
+        sal_Unicode cCh = sSrchStr[n];
+        TextSearchJumpTable::value_type aEntry( cCh, n );
+        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
+            pJumpTable->insert( aEntry );
+        if ( !aPair.second )
+            (*(aPair.first)).second = n;
+    }
+}
+
+void TextSearch::MakeBackwardTab2()
+{
+    // create the jumptable for the search text
+    if( pJumpTable2 && !bIsForwardTab )
+    {
+        return;    // the jumpTable is ok
+    }
+    bIsForwardTab = false;
+
+    sal_Int32 n, nLen = sSrchStr2.getLength();
+    pJumpTable2.reset( new TextSearchJumpTable );
+
+    for( n = nLen-1; n > 0; --n )
+    {
+        sal_Unicode cCh = sSrchStr2[n];
+        TextSearchJumpTable::value_type aEntry( cCh, n );
+        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
+            pJumpTable2->insert( aEntry );
+        if ( !aPair.second )
+            (*(aPair.first)).second = n;
+    }
+}
+
+sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
+{
+    TextSearchJumpTable *pJump;
+    OUString sSearchKey;
+
+    if ( bUsePrimarySrchStr ) {
+        pJump = pJumpTable.get();
+        sSearchKey = sSrchStr;
+    } else {
+        pJump = pJumpTable2.get();
+        sSearchKey = sSrchStr2;
+    }
+
+    TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
+    if ( iLook == pJump->end() )
+        return sSearchKey.getLength();
+    return (*iLook).second;
+}
+
+
+SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
+{
+    SearchResult aRet;
+    aRet.subRegExpressions = 0;
+
+    OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
+
+    sal_Int32 nSuchIdx = searchStr.getLength();
+    sal_Int32 nEnd = endPos;
+    if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
+        return aRet;
+
+
+    if( nEnd < sSearchKey.getLength() )   // position inside the search region ?
+        return aRet;
+
+    nEnd -= sSearchKey.getLength();
+
+    if (bUsePrimarySrchStr)
+      MakeForwardTab();                   // create the jumptable
+    else
+      MakeForwardTab2();
+
+    for (sal_Int32 nCmpIdx = startPos; // start position for the search
+            nCmpIdx <= nEnd;
+            nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
+    {
+        nSuchIdx = sSearchKey.getLength() - 1;
+        while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
+        {
+            if( nSuchIdx == 0 )
+            {
+                if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
+                {
+                    sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
+                    bool bAtStart = !nCmpIdx;
+                    bool bAtEnd = nFndEnd == endPos;
+                    bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
+                    bool bDelimBehind = bAtEnd || IsDelimiter(  searchStr, nFndEnd );
+                    //  *       1 -> only one word in the paragraph
+                    //  *       2 -> at begin of paragraph
+                    //  *       3 -> at end of paragraph
+                    //  *       4 -> inside the paragraph
+                    if( !(  ( bAtStart && bAtEnd ) ||           // 1
+                                ( bAtStart && bDelimBehind ) ||     // 2
+                                ( bAtEnd && bDelimBefore ) ||       // 3
+                                ( bDelimBefore && bDelimBehind )))  // 4
+                        break;
+                }
+
+                aRet.subRegExpressions = 1;
+                aRet.startOffset = { nCmpIdx };
+                aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };
+
+                return aRet;
+            }
+            else
+                nSuchIdx--;
+        }
+    }
+    return aRet;
+}
+
+SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
+{
+    SearchResult aRet;
+    aRet.subRegExpressions = 0;
+
+    OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
+
+    sal_Int32 nSuchIdx = searchStr.getLength();
+    sal_Int32 nEnd = endPos;
+    if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
+        return aRet;
+
+    if (bUsePrimarySrchStr)
+        MakeBackwardTab();                  // create the jumptable
+    else
+        MakeBackwardTab2();
+
+    if( nEnd == nSuchIdx )                  // end position for the search
+        nEnd = sSearchKey.getLength();
+    else
+        nEnd += sSearchKey.getLength();
+
+    sal_Int32 nCmpIdx = startPos;          // start position for the search
+
+    while (nCmpIdx >= nEnd)
+    {
+        nSuchIdx = 0;
+        while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
+                searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
+            nSuchIdx++;
+        if( nSuchIdx >= sSearchKey.getLength() )
+        {
+            if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
+            {
+                sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
+                bool bAtStart = !nFndStt;
+                bool bAtEnd = nCmpIdx == startPos;
+                bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
+                bool bDelimBefore = bAtStart || // begin of paragraph
+                    IsDelimiter( searchStr, nFndStt-1 );
+                //  *       1 -> only one word in the paragraph
+                //  *       2 -> at begin of paragraph
+                //  *       3 -> at end of paragraph
+                //  *       4 -> inside the paragraph
+                if( ( bAtStart && bAtEnd ) ||           // 1
+                        ( bAtStart && bDelimBehind ) ||     // 2
+                        ( bAtEnd && bDelimBefore ) ||       // 3
+                        ( bDelimBefore && bDelimBehind ))   // 4
+                {
+                    aRet.subRegExpressions = 1;
+                    aRet.startOffset = { nCmpIdx };
+                    aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
+                    return aRet;
+                }
+            }
+            else
+            {
+                aRet.subRegExpressions = 1;
+                aRet.startOffset = { nCmpIdx };
+                aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
+                return aRet;
+            }
+        }
+        nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
+        if( nCmpIdx < nSuchIdx )
+            return aRet;
+        nCmpIdx -= nSuchIdx;
+    }
+    return aRet;
+}
+
+void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
+{
+    TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
+    // select the transliterated pattern string
+    const OUString& rPatternStr =
+        (isSimpleTrans(transliterateFlags) ? sSrchStr
+        : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
+
+    sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
+    // map css::util::SearchFlags to ICU uregex.h flags
+    // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
+    // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
+    // REG_NOSUB is not used anywhere => not implemented
+    // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
+    // LEV_RELAXED is only used for SearchAlgorithm==Approximate
+    // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
+    // probably because the transliteration flag IGNORE_CASE handles it as well.
+    if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
+    ||  (transliterateFlags & TransliterationFlags::IGNORE_CASE))
+        nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
+    UErrorCode nIcuErr = U_ZERO_ERROR;
+    // assumption: transliteration didn't mangle regexp control chars
+    icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
+#ifndef DISABLE_WORDBOUND_EMULATION
+    // for convenience specific syntax elements of the old regex engine are emulated
+    // - by replacing \< with "word-break followed by a look-ahead word-char"
+    static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant);
+    static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant);
+    static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
+    aChevronMatcherB.reset( aIcuSearchPatStr);
+    aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
+    aChevronMatcherB.reset();
+    // - by replacing \> with "look-behind word-char followed by a word-break"
+    static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant);
+    static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant);
+    static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
+    aChevronMatcherE.reset( aIcuSearchPatStr);
+    aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
+    aChevronMatcherE.reset();
+#endif
+    pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
+    if (nIcuErr)
+    {
+        SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
+        pRegexMatcher.reset();
+    }
+    else
+    {
+        // Pathological patterns may result in exponential run time making the
+        // application appear to be frozen. Limit that. Documentation for this
+        // call says
+        // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
+        // "The units of the limit are steps of the match engine.
+        // Correspondence with actual processor time will depend on the speed
+        // of the processor and the details of the specific pattern, but will
+        // typically be on the order of milliseconds."
+        // Just what is a good value? 42 is always an answer ... the 23 enigma
+        // as well... which on the dev's machine is roughly 50 seconds with the
+        // pattern of fdo#70627.
+        /* TODO: make this a configuration settable value and possibly take
+         * complexity of expression into account and maybe even length of text
+         * to be matched; currently (2013-11-25) that is at most one 64k
+         * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
+        pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
+    }
+}
+
+
+static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
+                          sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
+{
+    pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
+    pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
+    pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
+                                               // look-ahead/behind assertions
+
+    if (!pRegexMatcher->find(rIcuErr))
+    {
+        /* TODO: future versions could pass the UErrorCode or translations
+         * thereof to the caller, for example to inform the user of
+         * U_REGEX_TIME_OUT. The strange thing though is that an error is set
+         * only after the second call that returns immediately and not if
+         * timeout occurred on the first call?!? */
+        SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
+        return false;
+    }
+    return true;
+}
+
+SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
+                                      sal_Int32 startPos, sal_Int32 endPos )
+{
+    SearchResult aRet;
+    aRet.subRegExpressions = 0;
+    if( !pRegexMatcher)
+        return aRet;
+
+    if( endPos > searchStr.getLength())
+        endPos = searchStr.getLength();
+
+    // use the ICU RegexMatcher to find the matches
+    UErrorCode nIcuErr = U_ZERO_ERROR;
+    const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
+                                        searchStr.getLength());
+    pRegexMatcher->reset( aSearchTargetStr);
+    // search until there is a valid match
+    for(;;)
+    {
+        if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
+            return aRet;
+
+        // #i118887# ignore zero-length matches e.g. "a*" in "bc"
+        int nStartOfs = pRegexMatcher->start( nIcuErr);
+        int nEndOfs = pRegexMatcher->end( nIcuErr);
+        if( nStartOfs < nEndOfs)
+            break;
+        // If the zero-length match is behind the string, do not match it again
+        // and again until startPos reaches there. A match behind the string is
+        // a "$" anchor.
+        if (nStartOfs == endPos)
+            break;
+        // try at next position if there was a zero-length match
+        if( ++startPos >= endPos)
+            return aRet;
+    }
+
+    // extract the result of the search
+    const int nGroupCount = pRegexMatcher->groupCount();
+    aRet.subRegExpressions = nGroupCount + 1;
+    aRet.startOffset.realloc( aRet.subRegExpressions);
+    auto pstartOffset = aRet.startOffset.getArray();
+    aRet.endOffset.realloc( aRet.subRegExpressions);
+    auto pendOffset = aRet.endOffset.getArray();
+    pstartOffset[0] = pRegexMatcher->start( nIcuErr);
+    pendOffset[0]   = pRegexMatcher->end( nIcuErr);
+    for( int i = 1; i <= nGroupCount; ++i) {
+        pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
+        pendOffset[i]   = pRegexMatcher->end( i, nIcuErr);
+    }
+
+    return aRet;
+}
+
+SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
+                                      sal_Int32 startPos, sal_Int32 endPos )
+{
+    // NOTE: for backwards search callers provide startPos/endPos inverted!
+    SearchResult aRet;
+    aRet.subRegExpressions = 0;
+    if( !pRegexMatcher)
+        return aRet;
+
+    if( startPos > searchStr.getLength())
+        startPos = searchStr.getLength();
+
+    // use the ICU RegexMatcher to find the matches
+    // TODO: use ICU's backward searching once it becomes available
+    //       as its replacement using forward search is not as good as the real thing
+    UErrorCode nIcuErr = U_ZERO_ERROR;
+    const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
+                                        searchStr.getLength());
+    pRegexMatcher->reset( aSearchTargetStr);
+    if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
+        return aRet;
+
+    // find the last match
+    int nLastPos = 0;
+    int nFoundEnd = 0;
+    int nGoodPos = 0, nGoodEnd = 0;
+    bool bFirst = true;
+    do {
+        nLastPos = pRegexMatcher->start( nIcuErr);
+        nFoundEnd = pRegexMatcher->end( nIcuErr);
+        if (nLastPos < nFoundEnd)
+        {
+            // remember last non-zero-length match
+            nGoodPos = nLastPos;
+            nGoodEnd = nFoundEnd;
+        }
+        if( nFoundEnd >= startPos)
+            break;
+        bFirst = false;
+        if( nFoundEnd == nLastPos)
+            ++nFoundEnd;
+    } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
+
+    // Ignore all zero-length matches except "$" anchor on first match.
+    if (nGoodPos == nGoodEnd)
+    {
+        if (bFirst && nLastPos == startPos)
+            nGoodPos = nLastPos;
+        else
+            return aRet;
+    }
+
+    // find last match again to get its details
+    lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
+
+    // fill in the details of the last match
+    const int nGroupCount = pRegexMatcher->groupCount();
+    aRet.subRegExpressions = nGroupCount + 1;
+    aRet.startOffset.realloc( aRet.subRegExpressions);
+    auto pstartOffset = aRet.startOffset.getArray();
+    aRet.endOffset.realloc( aRet.subRegExpressions);
+    auto pendOffset = aRet.endOffset.getArray();
+    // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
+    pstartOffset[0] = pRegexMatcher->end( nIcuErr);
+    pendOffset[0]   = pRegexMatcher->start( nIcuErr);
+    for( int i = 1; i <= nGroupCount; ++i) {
+        pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
+        pendOffset[i]   = pRegexMatcher->start( i, nIcuErr);
+    }
+
+    return aRet;
+}
+
+
+// search for words phonetically
+SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
+                                          sal_Int32 startPos, sal_Int32 endPos )
+{
+    SearchResult aRet;
+    aRet.subRegExpressions = 0;
+
+    if( !xBreak.is() )
+        return aRet;
+
+    sal_Int32 nStt, nEnd;
+
+    Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
+            aSrchPara.Locale,
+            WordType::ANYWORD_IGNOREWHITESPACES, true );
+
+    do
+    {
+        if( aWBnd.startPos >= endPos )
+            break;
+        nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
+        nEnd = std::min(aWBnd.endPos, endPos);
+
+        if( nStt < nEnd &&
+                pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
+        {
+            aRet.subRegExpressions = 1;
+            aRet.startOffset = { nStt };
+            aRet.endOffset = { nEnd };
+            break;
+        }
+
+        nStt = nEnd - 1;
+        aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
+                WordType::ANYWORD_IGNOREWHITESPACES);
+    } while( aWBnd.startPos != aWBnd.endPos ||
+            (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
+    // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
+    // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
+    // and nextWord() does also => don't loop forever.
+    return aRet;
+}
+
+SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
+                                          sal_Int32 startPos, sal_Int32 endPos )
+{
+    SearchResult aRet;
+    aRet.subRegExpressions = 0;
+
+    if( !xBreak.is() )
+        return aRet;
+
+    sal_Int32 nStt, nEnd;
+
+    Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
+            aSrchPara.Locale,
+            WordType::ANYWORD_IGNOREWHITESPACES, true );
+
+    do
+    {
+        if( aWBnd.endPos <= endPos )
+            break;
+        nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
+        nEnd = std::min(aWBnd.endPos, startPos);
+
+        if( nStt < nEnd &&
+                pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
+        {
+            aRet.subRegExpressions = 1;
+            aRet.startOffset = { nEnd };
+            aRet.endOffset = { nStt };
+            break;
+        }
+        if( !nStt )
+            break;
+
+        aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
+                WordType::ANYWORD_IGNOREWHITESPACES);
+    } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
+    return aRet;
+}
+
+
+namespace {
+void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
+{
+    rRes.subRegExpressions = 1;
+    rRes.startOffset = { nStartOffset };
+    rRes.endOffset = { nEndOffset };
+}
+}
+
+SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
+{
+    SearchResult aRes;
+    aRes.subRegExpressions = 0;     // no match
+    sal_Int32 nStartOffset = nStartPos;
+    sal_Int32 nEndOffset = nEndPos;
+
+    const sal_Int32 nStringLen = searchStr.getLength();
+
+    // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
+    // string match with [0,0).
+    if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
+            (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
+        return aRes;
+
+    const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
+    const sal_Int32 nPatternLen = rPattern.getLength();
+
+    // Handle special cases empty pattern and/or string outside of the loop to
+    // not add performance penalties there and simplify.
+    if (nStartPos == nEndPos)
+    {
+        sal_Int32 i = 0;
+        while (i < nPatternLen && rPattern[i] == '*')
+            ++i;
+        if (i == nPatternLen)
+            setWildcardMatch( aRes, nStartOffset, nEndOffset);
+        return aRes;
+    }
+
+    // Empty pattern does not match any non-empty string.
+    if (!nPatternLen)
+        return aRes;
+
+    bool bRewind = false;
+    sal_uInt32 cPattern = 0;
+    sal_Int32 nPattern = 0;
+    sal_Int32 nAfterFakePattern = nPattern;
+    if (mbWildcardAllowSubstring)
+    {
+        // Fake a leading '*' wildcard.
+        cPattern = '*';
+        bRewind = true;
+        // Assume a non-'*' pattern character follows. If it is a '*' instead
+        // that will be handled in the loop by setting nPat.
+        sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
+        if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
+            rPattern.iterateCodePoints( &nAfterFakePattern);
+    }
+
+    sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
+    sal_uInt32 cPatternAfterAsterisk = 0;
+    bool bEscaped = false, bEscapedAfterAsterisk = false;
+
+    // The loop code tries to avoid costly calls to iterateCodePoints() when
+    // possible.
+
+    do
+    {
+        if (bRewind)
+        {
+            // Reuse cPattern after '*', nPattern was correspondingly
+            // incremented to point behind cPattern.
+            bRewind = false;
+        }
+        else if (nPattern < nPatternLen)
+        {
+            // nPattern will be incremented by iterateCodePoints().
+            cPattern = rPattern.iterateCodePoints( &nPattern);
+            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
+            {
+                bEscaped = true;
+                cPattern = rPattern.iterateCodePoints( &nPattern);
+            }
+        }
+        else
+        {
+            // A trailing '*' is handled below.
+            if (mbWildcardAllowSubstring)
+            {
+                // If the pattern is consumed and substring match allowed we're good.
+                setWildcardMatch( aRes, nStartOffset, nString);
+                return aRes;
+            }
+            else if (nString < nEndPos && nLastAsterisk >= 0)
+            {
+                // If substring match is not allowed try a greedy '*' match.
+                nPattern = nLastAsterisk;
+                continue;   // do
+            }
+            else
+                return aRes;
+        }
+
+        if (cPattern == '*' && !bEscaped)
+        {
+            // '*' is one code unit, so not using iterateCodePoints() is ok.
+            while (nPattern < nPatternLen && rPattern[nPattern] == '*')
+                ++nPattern;
+
+            if (nPattern >= nPatternLen)
+            {
+                // Last pattern is '*', remaining string matches.
+                setWildcardMatch( aRes, nStartOffset, nEndOffset);
+                return aRes;
+            }
+
+            nLastAsterisk = nPattern;   // Remember last encountered '*'.
+
+            // cPattern will be the next non-'*' character, nPattern
+            // incremented.
+            cPattern = rPattern.iterateCodePoints( &nPattern);
+            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
+            {
+                bEscaped = true;
+                cPattern = rPattern.iterateCodePoints( &nPattern);
+            }
+
+            cPatternAfterAsterisk = cPattern;
+            bEscapedAfterAsterisk = bEscaped;
+            nPat = nPattern;    // Remember position of pattern behind '*', already incremented.
+            nStr = nString;     // Remember the current string to be matched.
+        }
+
+        if (nString >= nEndPos)
+            // Whatever follows in pattern, string will not match.
+            return aRes;
+
+        // nString will be incremented by iterateCodePoints().
+        sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
+
+        if ((cPattern != '?' || bEscaped) && cPattern != cString)
+        {
+            if (nPat == -1)
+                // Non-match already without any '*' pattern.
+                return aRes;
+
+            bRewind = true;
+            nPattern = nPat;                    // Rewind pattern to character behind '*', already incremented.
+            cPattern = cPatternAfterAsterisk;
+            bEscaped = bEscapedAfterAsterisk;
+            searchStr.iterateCodePoints( &nStr);
+            nString = nStr;                     // Restore incremented remembered string position.
+            if (nPat == nAfterFakePattern)
+            {
+                // Next start offset will be the next character.
+                nStartOffset = nString;
+            }
+        }
+        else
+        {
+            // An unescaped '?' pattern matched any character, or characters
+            // matched. Reset only escaped state.
+            bEscaped = false;
+        }
+    }
+    while (nString < nEndPos);
+
+    if (bRewind)
+        return aRes;
+
+    // Eat trailing '*' pattern that matches anything, including nothing.
+    // '*' is one code unit, so not using iterateCodePoints() is ok.
+    while (nPattern < nPatternLen && rPattern[nPattern] == '*')
+        ++nPattern;
+
+    if (nPattern == nPatternLen)
+        setWildcardMatch( aRes, nStartOffset, nEndOffset);
+    return aRes;
+}
+
+SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
+{
+    SearchResult aRes;
+    aRes.subRegExpressions = 0;     // no match
+
+    sal_Int32 nStartOffset = nStartPos;
+    sal_Int32 nEndOffset = nEndPos;
+
+    const sal_Int32 nStringLen = searchStr.getLength();
+
+    // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
+    // string match with (0,0].
+    if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
+            (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
+        return aRes;
+
+    const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
+    sal_Int32 nPatternLen = rPattern.getLength();
+
+    // Handle special cases empty pattern and/or string outside of the loop to
+    // not add performance penalties there and simplify.
+    if (nStartPos == nEndPos)
+    {
+        sal_Int32 i = 0;
+        while (i < nPatternLen && rPattern[i] == '*')
+            ++i;
+        if (i == nPatternLen)
+            setWildcardMatch( aRes, nStartOffset, nEndOffset);
+        return aRes;
+    }
+
+    // Empty pattern does not match any non-empty string.
+    if (!nPatternLen)
+        return aRes;
+
+    // Reverse escaped patterns to ease the handling of escapes, keeping escape
+    // and following character as one sequence in backward direction.
+    if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
+            (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
+    {
+        OUStringBuffer aPatternBuf( rPattern);
+        sal_Int32 nIndex = 0;
+        while (nIndex < nPatternLen)
+        {
+            const sal_Int32 nOld = nIndex;
+            const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
+            if (cu == mcWildcardEscapeChar)
+            {
+                if (nIndex < nPatternLen)
+                {
+                    if (nIndex - nOld == 1)
+                    {
+                        // Simply move code units, we already memorized the one
+                        // in 'cu'.
+                        const sal_Int32 nOld2 = nIndex;
+                        rPattern.iterateCodePoints( &nIndex);
+                        for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
+                            aPatternBuf[nOld+i] = rPattern[nOld2+i];
+                        aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
+                    }
+                    else
+                    {
+                        // Copy the escape character code units first in the
+                        // unlikely case that it would not be of BMP.
+                        assert(nIndex - nOld == 2);  // it's UTF-16, so...
+                        sal_Unicode buf[2];
+                        buf[0] = rPattern[nOld];
+                        buf[1] = rPattern[nOld+1];
+                        const sal_Int32 nOld2 = nIndex;
+                        rPattern.iterateCodePoints( &nIndex);
+                        for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
+                            aPatternBuf[nOld+i] = rPattern[nOld2+i];
+                        aPatternBuf[nIndex-2] = buf[0];
+                        aPatternBuf[nIndex-1] = buf[1];
+                    }
+                }
+                else
+                {
+                    // Trailing escape would become leading escape, do what?
+                    // Eliminate.
+                    aPatternBuf.remove( nOld, nIndex - nOld);
+                }
+            }
+        }
+        if (bUsePrimarySrchStr)
+            maWildcardReversePattern = aPatternBuf.makeStringAndClear();
+        else
+            maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
+    }
+    const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
+    nPatternLen = rReversePattern.getLength();
+
+    bool bRewind = false;
+    sal_uInt32 cPattern = 0;
+    sal_Int32 nPattern = nPatternLen;
+    sal_Int32 nAfterFakePattern = nPattern;
+    if (mbWildcardAllowSubstring)
+    {
+        // Fake a trailing '*' wildcard.
+        cPattern = '*';
+        bRewind = true;
+        // Assume a non-'*' pattern character follows. If it is a '*' instead
+        // that will be handled in the loop by setting nPat.
+        sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
+        if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
+            rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
+    }
+
+    sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
+    sal_uInt32 cPatternAfterAsterisk = 0;
+    bool bEscaped = false, bEscapedAfterAsterisk = false;
+
+    // The loop code tries to avoid costly calls to iterateCodePoints() when
+    // possible.
+
+    do
+    {
+        if (bRewind)
+        {
+            // Reuse cPattern after '*', nPattern was correspondingly
+            // decremented to point before cPattern.
+            bRewind = false;
+        }
+        else if (nPattern > 0)
+        {
+            // nPattern will be decremented by iterateCodePoints().
+            cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
+            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
+            {
+                bEscaped = true;
+                cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
+            }
+        }
+        else
+        {
+            // A trailing '*' is handled below.
+            if (mbWildcardAllowSubstring)
+            {
+                // If the pattern is consumed and substring match allowed we're good.
+                setWildcardMatch( aRes, nStartOffset, nString);
+                return aRes;
+            }
+            else if (nString > nEndPos && nLastAsterisk >= 0)
+            {
+                // If substring match is not allowed try a greedy '*' match.
+                nPattern = nLastAsterisk;
+                continue;   // do
+            }
+            else
+                return aRes;
+        }
+
+        if (cPattern == '*' && !bEscaped)
+        {
+            // '*' is one code unit, so not using iterateCodePoints() is ok.
+            while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
+                --nPattern;
+
+            if (nPattern <= 0)
+            {
+                // First pattern is '*', remaining string matches.
+                setWildcardMatch( aRes, nStartOffset, nEndOffset);
+                return aRes;
+            }
+
+            nLastAsterisk = nPattern;   // Remember last encountered '*'.
+
+            // cPattern will be the previous non-'*' character, nPattern
+            // decremented.
+            cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
+            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
+            {
+                bEscaped = true;
+                cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
+            }
+
+            cPatternAfterAsterisk = cPattern;
+            bEscapedAfterAsterisk = bEscaped;
+            nPat = nPattern;    // Remember position of pattern before '*', already decremented.
+            nStr = nString;     // Remember the current string to be matched.
+        }
+
+        if (nString <= nEndPos)
+            // Whatever leads in pattern, string will not match.
+            return aRes;
+
+        // nString will be decremented by iterateCodePoints().
+        sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
+
+        if ((cPattern != '?' || bEscaped) && cPattern != cString)
+        {
+            if (nPat == -1)
+                // Non-match already without any '*' pattern.
+                return aRes;
+
+            bRewind = true;
+            nPattern = nPat;                    // Rewind pattern to character before '*', already decremented.
+            cPattern = cPatternAfterAsterisk;
+            bEscaped = bEscapedAfterAsterisk;
+            searchStr.iterateCodePoints( &nStr, -1);
+            nString = nStr;                     // Restore decremented remembered string position.
+            if (nPat == nAfterFakePattern)
+            {
+                // Next start offset will be this character (exclusive).
+                nStartOffset = nString;
+            }
+        }
+        else
+        {
+            // An unescaped '?' pattern matched any character, or characters
+            // matched. Reset only escaped state.
+            bEscaped = false;
+        }
+    }
+    while (nString > nEndPos);
+
+    if (bRewind)
+        return aRes;
+
+    // Eat leading '*' pattern that matches anything, including nothing.
+    // '*' is one code unit, so not using iterateCodePoints() is ok.
+    while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
+        --nPattern;
+
+    if (nPattern == 0)
+        setWildcardMatch( aRes, nStartOffset, nEndOffset);
+    return aRes;
+}
+
+
+OUString SAL_CALL
+TextSearch::getImplementationName()
+{
+    return "com.sun.star.util.TextSearch_i18n";
+}
+
+sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
+{
+    return cppu::supportsService(this, rServiceName);
+}
+
+Sequence< OUString > SAL_CALL
+TextSearch::getSupportedServiceNames()
+{
+    return { "com.sun.star.util.TextSearch", "com.sun.star.util.TextSearch2" };
+}
+
+extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
+i18npool_TextSearch_get_implementation(
+    css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
+{
+    return cppu::acquire(new TextSearch(context));
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/i18npool/source/search/textsearch.hxx b/i18npool/source/search/textsearch.hxx
new file mode 100644
index 000000000..43a643537
--- /dev/null
+++ b/i18npool/source/search/textsearch.hxx
@@ -0,0 +1,161 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#ifndef INCLUDED_I18NPOOL_SOURCE_SEARCH_TEXTSEARCH_HXX
+#define INCLUDED_I18NPOOL_SOURCE_SEARCH_TEXTSEARCH_HXX
+
+#include <cppuhelper/implbase.hxx>
+#include <com/sun/star/util/XTextSearch2.hpp>
+#include <com/sun/star/lang/XServiceInfo.hpp>
+
+#include <map>
+#include <memory>
+#include <mutex>
+
+#include <unicode/regex.h>
+#include <unicode/unistr.h>
+#include <unicode/uversion.h>
+
+namespace com::sun::star::i18n { class XBreakIterator; }
+namespace com::sun::star::i18n { class XCharacterClassification; }
+namespace com::sun::star::i18n { class XExtendedTransliteration; }
+namespace com::sun::star::uno { class XComponentContext; }
+
+
+class WLevDistance;
+typedef ::std::map< sal_Unicode, sal_Int32 > TextSearchJumpTable;
+
+class TextSearch: public cppu::WeakImplHelper
+<
+    css::util::XTextSearch2,
+    css::lang::XServiceInfo
+>
+{
+    std::mutex m_aMutex;
+    css::uno::Reference < css::uno::XComponentContext > m_xContext;
+
+    css::util::SearchOptions2 aSrchPara;
+    OUString sSrchStr;
+    OUString sSrchStr2;
+
+    mutable css::uno::Reference< css::i18n::XCharacterClassification > xCharClass;
+
+    css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit;
+    css::uno::Reference< css::i18n::XExtendedTransliteration > xTranslit2;
+
+    // define a function pointer for the different search methods
+    typedef css::util::SearchResult
+        (SAL_CALL TextSearch::*FnSrch)( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+
+    FnSrch fnForward;
+    FnSrch fnBackward;
+
+    // to fix UX regression, U+0027 matches also U+2019 in non-regex search
+    bool bSearchApostrophe;
+
+    // Members and methods for the normal (Boyer-Moore) search
+    std::unique_ptr<TextSearchJumpTable> pJumpTable;
+    std::unique_ptr<TextSearchJumpTable> pJumpTable2;
+    bool bIsForwardTab;
+    bool bUsePrimarySrchStr;
+    void MakeForwardTab();
+    void MakeForwardTab2();
+    void MakeBackwardTab();
+    void MakeBackwardTab2();
+    sal_Int32 GetDiff( const sal_Unicode ) const;
+    /// @throws css::uno::RuntimeException
+    css::util::SearchResult SAL_CALL
+        NSrchFrwrd( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+    /// @throws css::uno::RuntimeException
+    css::util::SearchResult SAL_CALL
+        NSrchBkwrd( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+
+    // Members and methods for the regular expression search
+    std::unique_ptr<icu::RegexMatcher> pRegexMatcher;
+    /// @throws css::uno::RuntimeException
+    css::util::SearchResult SAL_CALL
+        RESrchFrwrd( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+    /// @throws css::uno::RuntimeException
+    css::util::SearchResult SAL_CALL
+        RESrchBkwrd( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+    void RESrchPrepare( const css::util::SearchOptions2&);
+
+    // Members and methods for the "Weight Levenshtein-Distance" search
+    int nLimit;
+    std::unique_ptr<WLevDistance> pWLD;
+    css::uno::Reference < css::i18n::XBreakIterator > xBreak;
+    /// @throws css::uno::RuntimeException
+    css::util::SearchResult SAL_CALL
+        ApproxSrchFrwrd( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+    /// @throws css::uno::RuntimeException
+    css::util::SearchResult SAL_CALL
+        ApproxSrchBkwrd( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+
+    // Members and methods for the wildcard search
+    OUString    maWildcardReversePattern;
+    OUString    maWildcardReversePattern2;
+    sal_uInt32  mcWildcardEscapeChar;
+    bool        mbWildcardAllowSubstring;
+    /// @throws css::uno::RuntimeException
+    css::util::SearchResult SAL_CALL
+        WildcardSrchFrwrd( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+    /// @throws css::uno::RuntimeException
+    css::util::SearchResult SAL_CALL
+        WildcardSrchBkwrd( const OUString& searchStr,
+                                sal_Int32 startPos, sal_Int32 endPos );
+
+    bool IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const;
+
+public:
+    explicit TextSearch(
+        const css::uno::Reference < css::uno::XComponentContext >& rxContext );
+
+    virtual ~TextSearch() override;
+
+    // XTextSearch
+    virtual void SAL_CALL
+        setOptions( const css::util::SearchOptions& options ) override;
+    virtual css::util::SearchResult SAL_CALL
+        searchForward( const OUString& searchStr,
+                        sal_Int32 startPos, sal_Int32 endPos ) override;
+    virtual css::util::SearchResult SAL_CALL
+        searchBackward( const OUString& searchStr,
+                        sal_Int32 startPos, sal_Int32 endPos ) override;
+
+    // XTextSearch2
+    virtual void SAL_CALL
+        setOptions2( const css::util::SearchOptions2& options ) override;
+
+    //XServiceInfo
+    virtual OUString SAL_CALL getImplementationName() override;
+    virtual sal_Bool SAL_CALL supportsService(const OUString& ServiceName) override;
+    virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override;
+};
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 09:06:44 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 09:06:44 +0000
commit	ed5640d8b587fbcfed7dd7967f3de04b37a76f26 (patch)
tree	7a5f7c6c9d02226d7471cb3cc8fbbf631b415303 /i18npool/source/search
parent	Initial commit. (diff)
download	libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.tar.xz libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.zip