summaryrefslogtreecommitdiffstats
path: root/i18npool/source/search/levdis.hxx
blob: b378c03cff4eb1720a15c9271e992b66859d3d7f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#pragma once

#include <sal/types.h>
#include <memory>

// Sensible default values for a user interface could be:
//  LEVDISDEFAULT_XOTHER    2
//      Maximum X replacements to match query, found data may be different by X
//      characters from query.
//  LEVDISDEFAULT_YSHORTER  1
//      Maximum Y insertions to match query, found data may be Y characters
//      shorter than query.
//  LEVDISDEFAULT_ZLONGER   3
//      Maximum Z deletions to match query, found data may be Z characters
//      longer than query.
//  LEVDISDEFAULT_RELAXED   TRUE
//      Use relaxed SplitCount instead of mathematical WLD.
//
// Joker/wildcards ('?' and '*') of course do not count as
// replacement/insertion/deletion. At a '?' a replacement is not counted, for a
// '*' the found data may be any number of characters longer than the query.
//
// Strict mathematical WLD: EITHER maximum X replacements OR Y characters
// shorter OR Z characters longer.
// Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
// Z characters longer. Any combination of actions is valid.
//
// The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
// integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
//
// The corresponding internal default weigh values for these user interface
// values would be:
//  LEVDISDEFAULTLIMIT  6
//      Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
//  LEVDISDEFAULT_P0    3
//      Default nRepP0, weight of replacements.
//  LEVDISDEFAULT_Q0    6
//      Default nInsQ0, weight of insertions.
//  LEVDISDEFAULT_R0    2
//      Default nDelR0, weight of deletions.

// The transformation of user input values to weighs is done using CalcLPQR().
// One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
// characters longer than pattern) then no character can be replaced any more.
// This can be circumvented by increasing nX or/and nZ, but of course with the
// side effect of being less strict then... or the other solution is to use
// relaxed SplitCount (see below), which is the default when using CalcLPQR().
//
// Attention: shorter = WLD.Insert, longer = WLD.Delete
//
// View and counting is always from data string to pattern, a deletion means
// that something is deleted from data to match pattern.
//
// Deletions weigh less in this example because usually less is known than is
// searched for. Replacements get middle weight, for example because of
// misspellings. Insertions are expensive.
//
// Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
// Allowed are maximum 4 replacements, no insertion, no deletion.
// Matches the user interface values X = 3, Y = 0, Z = 0
//
// bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
// of WLD() then isn't necessarily the Levenshtein-Distance, but can be
// negative (-WLD) if the WLD is greater than nLimit but single values are
// within the limits.
// For the above default values that could mean: even if the found string is
// already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
// to reach pattern. Additionally, character swaps count as one replacement.
// Mathematically completely incorrect, but meets user expectations ;-)
//
// Explanation: in the real WLD all actions are withdrawn from a common 100%
// pool, if one gets all there's nothing left for others. With bSplitCount
// replacements have their own pool.


/** "Safe" memory allocation in ctor */
class WLevDisPatternMem
{
    std::unique_ptr<sal_Unicode[]> cp;
    std::unique_ptr<bool[]>        bp;
public:
    explicit WLevDisPatternMem( sal_Int32 s )
        : cp(new sal_Unicode[s])
        , bp(new bool[s])
    {
    }

    sal_Unicode* GetcPtr() const        { return cp.get(); }
    bool* GetbPtr() const               { return bp.get(); }
};

class WLevDisDistanceMem
{
    std::unique_ptr<int[]> p;
public:
    explicit WLevDisDistanceMem( size_t s )
    {
        NewMem(s);
    }
    int* GetPtr() const             { return p.get(); }
    int* NewMem( size_t s )
    {
        p.reset(new int[ s<3 ? 3 : s ]);
        return p.get();
    }
};

/** Weighted Levenshtein Distance (WLD)

    For a more detailed explanation see documentation in
    i18npool/source/search/levdis.hxx
 */
class WLevDistance
{
    sal_Int32       nPatternLen;    ///< length of pattern
    WLevDisPatternMem   aPatMem;    ///< manage allocation of pattern array
    sal_Unicode*    cpPattern;      ///< pointer to pattern array
    bool*           bpPatIsWild;    ///< pointer to bool array whether pattern is wildcard
    sal_Int32       nArrayLen;      ///< length of distance array
    WLevDisDistanceMem  aDisMem;    ///< manage allocation of distance array
    int*            npDistance;     ///< pointer to distance array
    int             nLimit;         ///< WLD limit replacements/insertions/deletions
    int             nRepP0;         ///< replacement weigh
    int             nInsQ0;         ///< insertion weigh
    int             nDelR0;         ///< deletion weigh
    int             nStars;         ///< count of '*' wildcards in pattern
    bool            bSplitCount;    ///< if TRUE, Rep/Ins/Del are counted separately

    void InitData( const sal_Unicode* cPattern );
    static int Mid3( int x, int y, int z );        ///< middle value of 3 values

public:

    /** CTor with user input. Internally calls CalcLPQR().

        After this, obtain the resulting limit using GetLimit().

        @param  bRelaxed    the mathematically incorrect method is default (TRUE)
     */
    WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY,
                    int nLongerZ, bool bRelaxed );

    WLevDistance( const WLevDistance& rWLD );
    ~WLevDistance();

    /** Calculate the Weighted Levenshtein Distance from string to pattern. */
    int WLD( const sal_Unicode* cString, sal_Int32 nStringLen );

    /** Calculate the internal weighs corresponding to the user input values.
        @returns nLimit for later comparison with WLD()
     */
    void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ,
                    bool bRelaxed );

    int GetLimit() const     { return nLimit; }

    // Calculate current balance, keep this inline for performance reasons!
    // c == cpPattern[jj] == cString[ii]
    // First seek up to found place, if the balance is still equal there then
    // also compare after the found place.
    int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const
    {
        int nBalance = 0;

        if ( jj != ii )
        {
            sal_Int32 k;
            if ( jj > 0 )
                for ( k=0; k < jj; k++ )
                    if ( cpPattern[k] == c )
                        nBalance++;
            if ( ii > 0 )
                for ( k=0; k < ii; k++ )
                    if ( cString[k] == c )
                        nBalance--;
            if ( !nBalance )
            {
                for ( k=jj+1; k < nPatternLen; k++ )
                    if ( cpPattern[k] == c )
                        nBalance++;
                for ( k=ii+1; k < nStringLen; k++ )
                    if ( cString[k] == c )
                        nBalance--;
            }
        }

        return nBalance;
    }
};


/* vim:set shiftwidth=4 softtabstop=4 expandtab: */