diff options
Diffstat (limited to 'i18npool/source/search/levdis.cxx')
-rw-r--r-- | i18npool/source/search/levdis.cxx | 417 |
1 files changed, 417 insertions, 0 deletions
diff --git a/i18npool/source/search/levdis.cxx b/i18npool/source/search/levdis.cxx new file mode 100644 index 000000000..5842abd1e --- /dev/null +++ b/i18npool/source/search/levdis.cxx @@ -0,0 +1,417 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * This file incorporates work covered by the following license notice: + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed + * with this work for additional information regarding copyright + * ownership. The ASF licenses this file to you under the Apache + * License, Version 2.0 (the "License"); you may not use this file + * except in compliance with the License. You may obtain a copy of + * the License at http://www.apache.org/licenses/LICENSE-2.0 . + */ + +/* + + Weighted Levenshtein Distance + including wildcards + '*' for any number (0 or more) of arbitrary characters + '?' for exactly one arbitrary character + escapable with backslash, "\*" or "\?" + + Return: + WLD if WLD <= nLimit, else nLimit+1 + + or, if bSplitCount: + WLD if WLD <= nLimit + -WLD if Replace and Insert and Delete <= nLimit + else nLimit+1 + + Recursive definition of WLD: + + WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) , + WLD( X(i) , Y(j-1) ) + q , + WLD( X(i-1), Y(j) ) + r ) + + X(i) := the first i characters of the word X + Y(j) := the first j characters of the word Y + p(i,j) := 0 if i-th character of X == j-th character of Y, + p else + + Boundary conditions: + WLD( X(0), Y(j) ) := j*q (Y created by j inserts) + WLD( X(i), Y(0) ) := i*r (Y created by i deletes) + WLD( X(0), Y(0) ) := 0 + + Instead of recursions a dynamic algorithm is used. + + See also: German computer magazine + c't 07/89 pages 192-208 and c't 03/94 pages 230-239 +*/ + +#include <algorithm> + +#include "levdis.hxx" + +#define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit +#define LEVDISDOUBLEBUF 2048 // no doubling atop this border + +static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr ) +{ + const sal_Unicode* pTempStr = pStr; + while( *pTempStr ) + pTempStr++; + return static_cast<sal_Int32>(pTempStr-pStr); +} + +// Distance from string to pattern +int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen ) +{ + int nSPMin = 0; // penalty point Minimum + int nRepS = 0; // for SplitCount + + // length difference between pattern and string + int nLenDiff = nPatternLen - nStars - nStringLen; + // more insertions or deletions necessary as the limit? Then leave + if ( (nLenDiff * nInsQ0 > nLimit) + || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) ) + return LEVDISBIG; + + // comparative String greater than instantaneous array + // -> adapt array size + if ( nStringLen >= nArrayLen ) + { + // increase size much more to avoid reallocation + if ( nStringLen < LEVDISDOUBLEBUF ) + nArrayLen = 2 * nStringLen; + else + nArrayLen = nStringLen + 1; + npDistance = aDisMem.NewMem( nArrayLen ); + } + + // Calculate start values of the second column (first pattern value). + // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0, + // therefore the minimum is 0 + if ( nPatternLen == 0 ) + { + // Count of deletions to reach pattern + for ( sal_Int32 i=0; i <= nStringLen; i++ ) + npDistance[i] = i * nDelR0; + } + else if ( cpPattern[0] == '*' && bpPatIsWild[0] ) + { + // instead of a '*' you can fit in anything + for ( sal_Int32 i=0; i <= nStringLen; i++ ) + npDistance[i] = 0; + } + else + { + sal_Unicode c; + int nP; + c = cpPattern[0]; + if ( c == '?' && bpPatIsWild[0] ) + nP = 0; // a '?' could be any character. + else + // Minimum of replacement and deletion+insertion weighting + nP = Min3( nRepP0, nRepP0, nDelR0 + nInsQ0 ); + npDistance[0] = nInsQ0; // start with simple insert + npDistance[1] = nInsQ0; + npDistance[2] = nInsQ0; + int nReplacePos = -1; // tristate flag + int nDelCnt = 0; + for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 ) + { + if ( cString[i-1] == c ) + nP = 0; // Replace from this position is 0 + // Deletions to match pattern + Replace + npDistance[i] = nDelCnt + nP; + if ( bSplitCount ) + { + if ( nReplacePos < 0 && nP ) + { // this position will be replaced + nRepS++; + nReplacePos = i; + } + else if ( nReplacePos > 0 && !nP ) + { + // same count of c + int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen ); + if ( !nBalance ) + { // one was replaced that was an insertion instead + nRepS--; + nReplacePos = 0; + } + } + } + } + nSPMin = Min3( npDistance[0], npDistance[1], npDistance[2] ); + } + + // calculate distance matrix + sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached + while ( (j < nPatternLen-1) + && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) ) + { + sal_Unicode c; + int nP, nQ, nR, nPij, d2; + + j++; + c = cpPattern[j]; + if ( bpPatIsWild[j] ) // '*' or '?' not escaped + nP = 0; // could be replaced without penalty + else + nP = nRepP0; + if ( c == '*' && bpPatIsWild[j] ) + { + nQ = 0; // insertion and deletion without penalty + nR = 0; + } + else + { + nQ = nInsQ0; // usual weighting + nR = nDelR0; + } + d2 = npDistance[0]; + // increase insert count to get from null string to pattern + npDistance[0] = npDistance[0] + nQ; + nSPMin = npDistance[0]; + int nReplacePos = -1; // tristate flag + // for each pattern column run through the string + for ( sal_Int32 i=1; i <= nStringLen; i++ ) + { + int d1 = d2; // WLD( X(i-1), Y(j-1) ) + d2 = npDistance[i]; // WLD( X(i) , Y(j-1) ) + if ( cString[i-1] == c ) + { + nPij = 0; // p(i,j) + if ( nReplacePos < 0 ) + { + // same count of c + int nBalance = levdisbalance( j, i-1, c, cString, nStringLen ); + if ( !nBalance ) + nReplacePos = 0; // no replacement + } + } + else + nPij = nP; + // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) , + // WLD( X(i) , Y(j-1) ) + q , + // WLD( X(i-1), Y(j) ) + r ) + npDistance[i] = Min3( d1 + nPij, d2 + nQ, npDistance[i-1] + nR ); + if ( npDistance[i] < nSPMin ) + nSPMin = npDistance[i]; + if ( bSplitCount ) + { + if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij ) + { // this position will be replaced + nRepS++; + nReplacePos = i; + } + else if ( nReplacePos > 0 && !nPij ) + { + // character is equal in string and pattern + // + // If from this point: + // * pattern and string have the same count of this + // character + // * and character count is the same before this position + // then the replace was none. + // + // Scrambled letters are recognized here and the nRepS + // replacement is withdrawn, whereby the double limit kicks + // in. + + // Same count of c + int nBalance = levdisbalance( j, i-1, c, cString, nStringLen ); + if ( !nBalance ) + { // one was replaced that was an insertion instead + nRepS--; + nReplacePos = 0; + } + } + } + } + } + if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) ) + return npDistance[nStringLen]; + else + { + if ( bSplitCount ) + { + if ( nRepS && nLenDiff > 0 ) + nRepS -= nLenDiff; // Inserts were counted + if ( (nSPMin <= 2 * nLimit) + && (npDistance[nStringLen] <= 2 * nLimit) + && (nRepS * nRepP0 <= nLimit) ) + return -npDistance[nStringLen]; + return LEVDISBIG; + } + return LEVDISBIG; + } +} + +// Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount +// from user values nOtherX, nShorterY, nLongerZ, bRelaxed +void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed ) +{ + if ( nX < 0 ) nX = 0; // only positive values + if ( nY < 0 ) nY = 0; + if ( nZ < 0 ) nZ = 0; + if (0 == Min3( nX, nY, nZ )) // at least one 0 + { + int nMid, nMax; + nMax = Max3( nX, nY, nZ ); // either 0 for three 0s or Max + if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0 + nLimit = nMax; // either 0 or the only one >0 + else // one is 0 + nLimit = LCM( nMid, nMax ); + } + else // all three of them are not 0 + nLimit = LCM( LCM( nX, nY ), nZ ); + nRepP0 = ( nX ? nLimit / nX : nLimit + 1 ); + nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 ); + nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 ); + bSplitCount = bRelaxed; +} + +// greatest common divisor according to Euklid (chaindivision) +// special case: 0 plus anything produces 1 +int WLevDistance::GCD( int a, int b ) +{ + if ( !a || !b ) + return 1; + if ( a < 0 ) a = -a; + if ( b < 0 ) b = -b; + do + { + if ( a > b ) + a -= int(a / b) * b; + else + b -= int(b / a) * a; + } while ( a && b ); + return( a ? a : b); +} + +// least common multiple : a * b / GCD(a,b) +int WLevDistance::LCM( int a, int b ) +{ + if ( a > b ) // decrease overflow chance + return( (a / GCD(a,b)) * b ); + else + return( (b / GCD(a,b)) * a ); +} + +// Minimum of three values +inline int WLevDistance::Min3( int x, int y, int z ) +{ + if ( x < y ) + return std::min(x, z); + else + return std::min(y, z); +} + +// The value in the middle +int WLevDistance::Mid3( int x, int y, int z ) +{ + int min = Min3(x,y,z); + if ( x == min ) + return std::min(y, z); + else if ( y == min ) + return std::min(x, z); + else // z == min + return std::min(x, y); +} + +// Maximum of three values +int WLevDistance::Max3( int x, int y, int z ) +{ + if ( x > y ) + return std::max(x, z); + else + return std::max(y, z); +} + +// initialize data from CTOR +void WLevDistance::InitData( const sal_Unicode* cPattern ) +{ + cpPattern = aPatMem.GetcPtr(); + bpPatIsWild = aPatMem.GetbPtr(); + npDistance = aDisMem.GetPtr(); + nStars = 0; + const sal_Unicode* cp1 = cPattern; + sal_Unicode* cp2 = cpPattern; + bool* bp = bpPatIsWild; + // copy pattern, count asterisks, escaped Jokers + while ( *cp1 ) + { + if ( *cp1 == '\\' ) // maybe escaped + { + if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker? + { + cp1++; // skip '\\' + nPatternLen--; + } + *bp++ = false; + } + else if ( *cp1 == '*' || *cp1 == '?' ) // Joker + { + if ( *cp1 == '*' ) + nStars++; + *bp++ = true; + } + else + *bp++ = false; + *cp2++ = *cp1++; + } + *cp2 = '\0'; +} + +WLevDistance::WLevDistance( const sal_Unicode* cPattern, + int nOtherX, int nShorterY, int nLongerZ, + bool bRelaxed ) : + nPatternLen( Impl_WLD_StringLen(cPattern) ), + aPatMem( nPatternLen + 1 ), + nArrayLen( nPatternLen + 1 ), + aDisMem( nArrayLen ) +{ + InitData( cPattern ); + CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed ); +} + +// CopyCTor +WLevDistance::WLevDistance( const WLevDistance& rWLD ) : + nPatternLen( rWLD.nPatternLen ), + aPatMem( nPatternLen + 1 ), + nArrayLen( nPatternLen + 1 ), + aDisMem( nArrayLen ), + nLimit( rWLD.nLimit ), + nRepP0( rWLD.nRepP0 ), + nInsQ0( rWLD.nInsQ0 ), + nDelR0( rWLD.nDelR0 ), + nStars( rWLD.nStars ), + bSplitCount( rWLD.bSplitCount ) +{ + cpPattern = aPatMem.GetcPtr(); + bpPatIsWild = aPatMem.GetbPtr(); + npDistance = aDisMem.GetPtr(); + sal_Int32 i; + for ( i=0; i<nPatternLen; i++ ) + { + cpPattern[i] = rWLD.cpPattern[i]; + bpPatIsWild[i] = rWLD.bpPatIsWild[i]; + } + cpPattern[i] = '\0'; +} + +// DTor +WLevDistance::~WLevDistance() +{ +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |