summaryrefslogtreecommitdiffstats
path: root/src/fts_fuzzy_match.cc
blob: 8ae7098b3885c34cc1a9b63a7b13b8042f27e337 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
// LICENSE
//
//   This software is dual-licensed to the public domain and under the following
//   license: you are granted a perpetual, irrevocable license to copy, modify,
//   publish, and distribute this file as you see fit.

#include <cstring>  // memcpy

#include "fts_fuzzy_match.hh"

#include <ctype.h>  // ::tolower, ::toupper

#include "config.h"

namespace fts {

// Forward declarations for "private" implementation
namespace fuzzy_internal {
static bool fuzzy_match_recursive(const char* pattern,
                                  const char* str,
                                  int& outScore,
                                  const char* strBegin,
                                  uint8_t const* srcMatches,
                                  uint8_t* newMatches,
                                  int maxMatches,
                                  int nextMatch,
                                  int& recursionCount,
                                  int recursionLimit);
}

// Public interface
bool
fuzzy_match_simple(char const* pattern, char const* str)
{
    while (*pattern != '\0' && *str != '\0') {
        if (tolower(*pattern) == tolower(*str))
            ++pattern;
        ++str;
    }

    return *pattern == '\0' ? true : false;
}

bool
fuzzy_match(char const* pattern, char const* str, int& outScore)
{
    uint8_t matches[256];
    return fuzzy_match(pattern, str, outScore, matches, sizeof(matches));
}

bool
fuzzy_match(char const* pattern,
            char const* str,
            int& outScore,
            uint8_t* matches,
            int maxMatches)
{
    int recursionCount = 0;
    int recursionLimit = 10;

    return fuzzy_internal::fuzzy_match_recursive(pattern,
                                                 str,
                                                 outScore,
                                                 str,
                                                 nullptr,
                                                 matches,
                                                 maxMatches,
                                                 0,
                                                 recursionCount,
                                                 recursionLimit);
}

// Private implementation
static bool
fuzzy_internal::fuzzy_match_recursive(const char* pattern,
                                      const char* str,
                                      int& outScore,
                                      const char* strBegin,
                                      uint8_t const* srcMatches,
                                      uint8_t* matches,
                                      int maxMatches,
                                      int nextMatch,
                                      int& recursionCount,
                                      int recursionLimit)
{
    // Count recursions
    ++recursionCount;
    if (recursionCount >= recursionLimit)
        return false;

    // Detect end of strings
    if (*pattern == '\0' || *str == '\0')
        return false;

    // Recursion params
    bool recursiveMatch = false;
    uint8_t bestRecursiveMatches[256];
    int bestRecursiveScore = 0;

    // Loop through pattern and str looking for a match
    bool first_match = true;
    while (*pattern != '\0' && *str != '\0') {
        // Found match
        if (tolower(*pattern) == tolower(*str)) {
            // Supplied matches buffer was too short
            if (nextMatch >= maxMatches)
                return false;

            // "Copy-on-Write" srcMatches into matches
            if (first_match && srcMatches) {
                memcpy(matches, srcMatches, nextMatch);
                first_match = false;
            }

            // Recursive call that "skips" this match
            uint8_t recursiveMatches[256];
            int recursiveScore;
            if (fuzzy_match_recursive(pattern,
                                      str + 1,
                                      recursiveScore,
                                      strBegin,
                                      matches,
                                      recursiveMatches,
                                      sizeof(recursiveMatches),
                                      nextMatch,
                                      recursionCount,
                                      recursionLimit))
            {
                // Pick best recursive score
                if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
                    memcpy(bestRecursiveMatches, recursiveMatches, 256);
                    bestRecursiveScore = recursiveScore;
                }
                recursiveMatch = true;
            }

            // Advance
            matches[nextMatch++] = (uint8_t) (str - strBegin);
            ++pattern;
        }
        ++str;
    }

    // Determine if full pattern was matched
    bool matched = *pattern == '\0' ? true : false;

    // Calculate score
    if (matched) {
        const int sequential_bonus = 15;  // bonus for adjacent matches
        const int separator_bonus
            = 30;  // bonus if match occurs after a separator
        const int camel_bonus
            = 30;  // bonus if match is uppercase and prev is lower
        const int first_letter_bonus
            = 15;  // bonus if the first letter is matched

        const int leading_letter_penalty
            = -5;  // penalty applied for every letter in str before the first
                   // match
        const int max_leading_letter_penalty
            = -15;  // maximum penalty for leading letters
        const int unmatched_letter_penalty
            = -1;  // penalty for every letter that doesn't matter

        // Iterate str to end
        while (*str != '\0')
            ++str;

        // Initialize score
        outScore = 100;

        // Apply leading letter penalty
        int penalty = leading_letter_penalty * matches[0];
        if (penalty < max_leading_letter_penalty)
            penalty = max_leading_letter_penalty;
        outScore += penalty;

        // Apply unmatched penalty
        int unmatched = (int) (str - strBegin) - nextMatch;
        outScore += unmatched_letter_penalty * unmatched;

        // Apply ordering bonuses
        for (int i = 0; i < nextMatch; ++i) {
            uint8_t currIdx = matches[i];

            if (i > 0) {
                uint8_t prevIdx = matches[i - 1];

                // Sequential
                if (currIdx == (prevIdx + 1))
                    outScore += sequential_bonus;
            }

            // Check for bonuses based on neighbor character value
            if (currIdx > 0) {
                // Camel case
                char neighbor = strBegin[currIdx - 1];
                char curr = strBegin[currIdx];
                if (::islower(neighbor) && ::isupper(curr))
                    outScore += camel_bonus;

                // Separator
                bool neighborSeparator = neighbor == '_' || neighbor == ' ';
                if (neighborSeparator)
                    outScore += separator_bonus;
            } else {
                // First letter
                outScore += first_letter_bonus;
            }
        }
    }

    // Return best result
    if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) {
        // Recursive score is better than "this"
        memcpy(matches, bestRecursiveMatches, maxMatches);
        outScore = bestRecursiveScore;
        return true;
    } else if (matched) {
        // "this" score is better than recursive
        return true;
    } else {
        // no match
        return false;
    }
}
}  // namespace fts