diff options
Diffstat (limited to 'i18npool/source/search/levdis.hxx')
-rw-r--r-- | i18npool/source/search/levdis.hxx | 211 |
1 files changed, 211 insertions, 0 deletions
diff --git a/i18npool/source/search/levdis.hxx b/i18npool/source/search/levdis.hxx new file mode 100644 index 0000000000..b378c03cff --- /dev/null +++ b/i18npool/source/search/levdis.hxx @@ -0,0 +1,211 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +#pragma once + +#include <sal/types.h> +#include <memory> + +// Sensible default values for a user interface could be: +// LEVDISDEFAULT_XOTHER 2 +// Maximum X replacements to match query, found data may be different by X +// characters from query. +// LEVDISDEFAULT_YSHORTER 1 +// Maximum Y insertions to match query, found data may be Y characters +// shorter than query. +// LEVDISDEFAULT_ZLONGER 3 +// Maximum Z deletions to match query, found data may be Z characters +// longer than query. +// LEVDISDEFAULT_RELAXED TRUE +// Use relaxed SplitCount instead of mathematical WLD. +// +// Joker/wildcards ('?' and '*') of course do not count as +// replacement/insertion/deletion. At a '?' a replacement is not counted, for a +// '*' the found data may be any number of characters longer than the query. +// +// Strict mathematical WLD: EITHER maximum X replacements OR Y characters +// shorter OR Z characters longer. +// Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR +// Z characters longer. Any combination of actions is valid. +// +// The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed +// integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736. +// +// The corresponding internal default weigh values for these user interface +// values would be: +// LEVDISDEFAULTLIMIT 6 +// Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2 +// LEVDISDEFAULT_P0 3 +// Default nRepP0, weight of replacements. +// LEVDISDEFAULT_Q0 6 +// Default nInsQ0, weight of insertions. +// LEVDISDEFAULT_R0 2 +// Default nDelR0, weight of deletions. + +// The transformation of user input values to weighs is done using CalcLPQR(). +// One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ +// characters longer than pattern) then no character can be replaced any more. +// This can be circumvented by increasing nX or/and nZ, but of course with the +// side effect of being less strict then... or the other solution is to use +// relaxed SplitCount (see below), which is the default when using CalcLPQR(). +// +// Attention: shorter = WLD.Insert, longer = WLD.Delete +// +// View and counting is always from data string to pattern, a deletion means +// that something is deleted from data to match pattern. +// +// Deletions weigh less in this example because usually less is known than is +// searched for. Replacements get middle weight, for example because of +// misspellings. Insertions are expensive. +// +// Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3 +// Allowed are maximum 4 replacements, no insertion, no deletion. +// Matches the user interface values X = 3, Y = 0, Z = 0 +// +// bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value +// of WLD() then isn't necessarily the Levenshtein-Distance, but can be +// negative (-WLD) if the WLD is greater than nLimit but single values are +// within the limits. +// For the above default values that could mean: even if the found string is +// already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made +// to reach pattern. Additionally, character swaps count as one replacement. +// Mathematically completely incorrect, but meets user expectations ;-) +// +// Explanation: in the real WLD all actions are withdrawn from a common 100% +// pool, if one gets all there's nothing left for others. With bSplitCount +// replacements have their own pool. + + +/** "Safe" memory allocation in ctor */ +class WLevDisPatternMem +{ + std::unique_ptr<sal_Unicode[]> cp; + std::unique_ptr<bool[]> bp; +public: + explicit WLevDisPatternMem( sal_Int32 s ) + : cp(new sal_Unicode[s]) + , bp(new bool[s]) + { + } + + sal_Unicode* GetcPtr() const { return cp.get(); } + bool* GetbPtr() const { return bp.get(); } +}; + +class WLevDisDistanceMem +{ + std::unique_ptr<int[]> p; +public: + explicit WLevDisDistanceMem( size_t s ) + { + NewMem(s); + } + int* GetPtr() const { return p.get(); } + int* NewMem( size_t s ) + { + p.reset(new int[ s<3 ? 3 : s ]); + return p.get(); + } +}; + +/** Weighted Levenshtein Distance (WLD) + + For a more detailed explanation see documentation in + i18npool/source/search/levdis.hxx + */ +class WLevDistance +{ + sal_Int32 nPatternLen; ///< length of pattern + WLevDisPatternMem aPatMem; ///< manage allocation of pattern array + sal_Unicode* cpPattern; ///< pointer to pattern array + bool* bpPatIsWild; ///< pointer to bool array whether pattern is wildcard + sal_Int32 nArrayLen; ///< length of distance array + WLevDisDistanceMem aDisMem; ///< manage allocation of distance array + int* npDistance; ///< pointer to distance array + int nLimit; ///< WLD limit replacements/insertions/deletions + int nRepP0; ///< replacement weigh + int nInsQ0; ///< insertion weigh + int nDelR0; ///< deletion weigh + int nStars; ///< count of '*' wildcards in pattern + bool bSplitCount; ///< if TRUE, Rep/Ins/Del are counted separately + + void InitData( const sal_Unicode* cPattern ); + static int Mid3( int x, int y, int z ); ///< middle value of 3 values + +public: + + /** CTor with user input. Internally calls CalcLPQR(). + + After this, obtain the resulting limit using GetLimit(). + + @param bRelaxed the mathematically incorrect method is default (TRUE) + */ + WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY, + int nLongerZ, bool bRelaxed ); + + WLevDistance( const WLevDistance& rWLD ); + ~WLevDistance(); + + /** Calculate the Weighted Levenshtein Distance from string to pattern. */ + int WLD( const sal_Unicode* cString, sal_Int32 nStringLen ); + + /** Calculate the internal weighs corresponding to the user input values. + @returns nLimit for later comparison with WLD() + */ + void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ, + bool bRelaxed ); + + int GetLimit() const { return nLimit; } + + // Calculate current balance, keep this inline for performance reasons! + // c == cpPattern[jj] == cString[ii] + // First seek up to found place, if the balance is still equal there then + // also compare after the found place. + int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const + { + int nBalance = 0; + + if ( jj != ii ) + { + sal_Int32 k; + if ( jj > 0 ) + for ( k=0; k < jj; k++ ) + if ( cpPattern[k] == c ) + nBalance++; + if ( ii > 0 ) + for ( k=0; k < ii; k++ ) + if ( cString[k] == c ) + nBalance--; + if ( !nBalance ) + { + for ( k=jj+1; k < nPatternLen; k++ ) + if ( cpPattern[k] == c ) + nBalance++; + for ( k=ii+1; k < nStringLen; k++ ) + if ( cString[k] == c ) + nBalance--; + } + } + + return nBalance; + } +}; + + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |