Adding upstream version 4:24.2.0.upstream/4%24.2.0

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:54:39 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-15 05:54:39 +0000
commit: 267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree: 358c9467650e1d0a1d7227a21dac2e3d08b622b2 /i18npool/source/search/levdis.hxx
parent: Initial commit. (diff)
download: libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz
libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip
1 files changed, 211 insertions, 0 deletions
diff --git a/i18npool/source/search/levdis.hxx b/i18npool/source/search/levdis.hxx
new file mode 100644
index 0000000000..b378c03cff
--- /dev/null
+++ b/i18npool/source/search/levdis.hxx
@@ -0,0 +1,211 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * This file incorporates work covered by the following license notice:
+ *
+ *   Licensed to the Apache Software Foundation (ASF) under one or more
+ *   contributor license agreements. See the NOTICE file distributed
+ *   with this work for additional information regarding copyright
+ *   ownership. The ASF licenses this file to you under the Apache
+ *   License, Version 2.0 (the "License"); you may not use this file
+ *   except in compliance with the License. You may obtain a copy of
+ *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
+ */
+
+#pragma once
+
+#include <sal/types.h>
+#include <memory>
+
+// Sensible default values for a user interface could be:
+//  LEVDISDEFAULT_XOTHER    2
+//      Maximum X replacements to match query, found data may be different by X
+//      characters from query.
+//  LEVDISDEFAULT_YSHORTER  1
+//      Maximum Y insertions to match query, found data may be Y characters
+//      shorter than query.
+//  LEVDISDEFAULT_ZLONGER   3
+//      Maximum Z deletions to match query, found data may be Z characters
+//      longer than query.
+//  LEVDISDEFAULT_RELAXED   TRUE
+//      Use relaxed SplitCount instead of mathematical WLD.
+//
+// Joker/wildcards ('?' and '*') of course do not count as
+// replacement/insertion/deletion. At a '?' a replacement is not counted, for a
+// '*' the found data may be any number of characters longer than the query.
+//
+// Strict mathematical WLD: EITHER maximum X replacements OR Y characters
+// shorter OR Z characters longer.
+// Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
+// Z characters longer. Any combination of actions is valid.
+//
+// The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
+// integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
+//
+// The corresponding internal default weigh values for these user interface
+// values would be:
+//  LEVDISDEFAULTLIMIT  6
+//      Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
+//  LEVDISDEFAULT_P0    3
+//      Default nRepP0, weight of replacements.
+//  LEVDISDEFAULT_Q0    6
+//      Default nInsQ0, weight of insertions.
+//  LEVDISDEFAULT_R0    2
+//      Default nDelR0, weight of deletions.
+
+// The transformation of user input values to weighs is done using CalcLPQR().
+// One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
+// characters longer than pattern) then no character can be replaced any more.
+// This can be circumvented by increasing nX or/and nZ, but of course with the
+// side effect of being less strict then... or the other solution is to use
+// relaxed SplitCount (see below), which is the default when using CalcLPQR().
+//
+// Attention: shorter = WLD.Insert, longer = WLD.Delete
+//
+// View and counting is always from data string to pattern, a deletion means
+// that something is deleted from data to match pattern.
+//
+// Deletions weigh less in this example because usually less is known than is
+// searched for. Replacements get middle weight, for example because of
+// misspellings. Insertions are expensive.
+//
+// Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
+// Allowed are maximum 4 replacements, no insertion, no deletion.
+// Matches the user interface values X = 3, Y = 0, Z = 0
+//
+// bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
+// of WLD() then isn't necessarily the Levenshtein-Distance, but can be
+// negative (-WLD) if the WLD is greater than nLimit but single values are
+// within the limits.
+// For the above default values that could mean: even if the found string is
+// already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
+// to reach pattern. Additionally, character swaps count as one replacement.
+// Mathematically completely incorrect, but meets user expectations ;-)
+//
+// Explanation: in the real WLD all actions are withdrawn from a common 100%
+// pool, if one gets all there's nothing left for others. With bSplitCount
+// replacements have their own pool.
+
+
+/** "Safe" memory allocation in ctor */
+class WLevDisPatternMem
+{
+    std::unique_ptr<sal_Unicode[]> cp;
+    std::unique_ptr<bool[]>        bp;
+public:
+    explicit WLevDisPatternMem( sal_Int32 s )
+        : cp(new sal_Unicode[s])
+        , bp(new bool[s])
+    {
+    }
+
+    sal_Unicode* GetcPtr() const        { return cp.get(); }
+    bool* GetbPtr() const               { return bp.get(); }
+};
+
+class WLevDisDistanceMem
+{
+    std::unique_ptr<int[]> p;
+public:
+    explicit WLevDisDistanceMem( size_t s )
+    {
+        NewMem(s);
+    }
+    int* GetPtr() const             { return p.get(); }
+    int* NewMem( size_t s )
+    {
+        p.reset(new int[ s<3 ? 3 : s ]);
+        return p.get();
+    }
+};
+
+/** Weighted Levenshtein Distance (WLD)
+
+    For a more detailed explanation see documentation in
+    i18npool/source/search/levdis.hxx
+ */
+class WLevDistance
+{
+    sal_Int32       nPatternLen;    ///< length of pattern
+    WLevDisPatternMem   aPatMem;    ///< manage allocation of pattern array
+    sal_Unicode*    cpPattern;      ///< pointer to pattern array
+    bool*           bpPatIsWild;    ///< pointer to bool array whether pattern is wildcard
+    sal_Int32       nArrayLen;      ///< length of distance array
+    WLevDisDistanceMem  aDisMem;    ///< manage allocation of distance array
+    int*            npDistance;     ///< pointer to distance array
+    int             nLimit;         ///< WLD limit replacements/insertions/deletions
+    int             nRepP0;         ///< replacement weigh
+    int             nInsQ0;         ///< insertion weigh
+    int             nDelR0;         ///< deletion weigh
+    int             nStars;         ///< count of '*' wildcards in pattern
+    bool            bSplitCount;    ///< if TRUE, Rep/Ins/Del are counted separately
+
+    void InitData( const sal_Unicode* cPattern );
+    static int Mid3( int x, int y, int z );        ///< middle value of 3 values
+
+public:
+
+    /** CTor with user input. Internally calls CalcLPQR().
+
+        After this, obtain the resulting limit using GetLimit().
+
+        @param  bRelaxed    the mathematically incorrect method is default (TRUE)
+     */
+    WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY,
+                    int nLongerZ, bool bRelaxed );
+
+    WLevDistance( const WLevDistance& rWLD );
+    ~WLevDistance();
+
+    /** Calculate the Weighted Levenshtein Distance from string to pattern. */
+    int WLD( const sal_Unicode* cString, sal_Int32 nStringLen );
+
+    /** Calculate the internal weighs corresponding to the user input values.
+        @returns nLimit for later comparison with WLD()
+     */
+    void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ,
+                    bool bRelaxed );
+
+    int GetLimit() const     { return nLimit; }
+
+    // Calculate current balance, keep this inline for performance reasons!
+    // c == cpPattern[jj] == cString[ii]
+    // First seek up to found place, if the balance is still equal there then
+    // also compare after the found place.
+    int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const
+    {
+        int nBalance = 0;
+
+        if ( jj != ii )
+        {
+            sal_Int32 k;
+            if ( jj > 0 )
+                for ( k=0; k < jj; k++ )
+                    if ( cpPattern[k] == c )
+                        nBalance++;
+            if ( ii > 0 )
+                for ( k=0; k < ii; k++ )
+                    if ( cString[k] == c )
+                        nBalance--;
+            if ( !nBalance )
+            {
+                for ( k=jj+1; k < nPatternLen; k++ )
+                    if ( cpPattern[k] == c )
+                        nBalance++;
+                for ( k=ii+1; k < nStringLen; k++ )
+                    if ( cString[k] == c )
+                        nBalance--;
+            }
+        }
+
+        return nBalance;
+    }
+};
+
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 05:54:39 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-15 05:54:39 +0000
commit	267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree	358c9467650e1d0a1d7227a21dac2e3d08b622b2 /i18npool/source/search/levdis.hxx
parent	Initial commit. (diff)
download	libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip