-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a new fuzzy matching algorithm. It should be better than the previous version and is similar to what most editors have (the implementation is inspired by the Sublime Text fuzzy matcher).
- Loading branch information
Showing
12 changed files
with
305 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,7 +33,7 @@ void addXML() { | |
|
||
}, | ||
"", | ||
{ "<%?xml" } | ||
{ "^<%?xml" } | ||
|
||
} ) | ||
.setAutoCloseXMLTags( true ); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
// LICENSE | ||
// | ||
// This software is dual-licensed to the public domain and under the following | ||
// license: you are granted a perpetual, irrevocable license to copy, modify, | ||
// publish, and distribute this file as you see fit. | ||
// | ||
// VERSION | ||
// 0.2.0 (2017-02-18) Scored matches perform exhaustive search for best score | ||
// 0.1.0 (2016-03-28) Initial release | ||
// | ||
// AUTHOR | ||
// Forrest Smith | ||
// | ||
// NOTES | ||
// Compiling | ||
// You MUST add '#define FTS_FUZZY_MATCH_IMPLEMENTATION' before including this header in ONE source file to create implementation. | ||
// | ||
// fuzzy_match_simple(...) | ||
// Returns true if each character in pattern is found sequentially within str | ||
// | ||
// fuzzy_match(...) | ||
// Returns true if pattern is found AND calculates a score. | ||
// Performs exhaustive search via recursion to find all possible matches and match with highest score. | ||
// Score values have no intrinsic meaning. The possible score range is not normalized and varies with the pattern. | ||
// Recursion is limited internally (default=10) to prevent degenerate cases (pattern="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") | ||
// Uses uint8_t for match indices. Therefore patterns are limited to 256 characters. | ||
// Score system should be tuned for YOUR use case. Words, sentences, file names, or method names all prefer different tuning. | ||
|
||
|
||
#ifndef FTS_FUZZY_MATCH_H | ||
#define FTS_FUZZY_MATCH_H | ||
|
||
|
||
#include <cstdint> // uint8_t | ||
#include <ctype.h> // ::tolower, ::toupper | ||
#include <cstring> // memcpy | ||
|
||
#include <cstdio> | ||
|
||
// Public interface | ||
namespace fts {

/// Case-insensitive subsequence test: true when every character of
/// `pattern` is found, in order, somewhere within `str`.
[[maybe_unused]] static bool fuzzy_match_simple(const char* pattern, const char* str);

/// Scored fuzzy match. Returns true when `pattern` matches `str` and writes
/// the score of the best match found to `outScore`.
[[maybe_unused]] static bool fuzzy_match(const char* pattern, const char* str, int& outScore);

/// Scored fuzzy match that additionally records the positions in `str` of the
/// matched characters in `matches`, a caller-supplied buffer holding
/// `maxMatches` entries.
[[maybe_unused]] static bool fuzzy_match(const char* pattern, const char* str, int& outScore,
                                         uint8_t* matches, int maxMatches);

} // namespace fts
|
||
|
||
#ifdef FTS_FUZZY_MATCH_IMPLEMENTATION | ||
namespace fts {

// Declarations and helpers for the "private" implementation
namespace fuzzy_internal {

    // Number of entries in every internal match-index buffer. Indices are
    // stored as uint8_t, so at most this many pattern characters are tracked.
    constexpr int max_match_count = 256;

    static bool fuzzy_match_recursive(const char * pattern, const char * str, int & outScore, const char * strBegin,
        uint8_t const * srcMatches, uint8_t * newMatches, int maxMatches, int nextMatch,
        int & recursionCount, int recursionLimit);

    // Case-insensitive comparison of two characters.
    // The <ctype.h> functions require a value representable as unsigned char
    // (or EOF); passing a negative plain char is undefined behaviour, hence
    // the casts.
    static bool chars_equal_ignore_case(char lhs, char rhs) {
        return ::tolower(static_cast<unsigned char>(lhs)) == ::tolower(static_cast<unsigned char>(rhs));
    }

    // Scores one complete match.
    //   strBegin   - start of the searched string
    //   strLength  - length of the searched string
    //   matches    - positions in the string of the matched characters
    //   matchCount - number of valid entries in `matches` (must be >= 1)
    // Returns the score; larger is better, the range is not normalized.
    static int score_match(const char * strBegin, int strLength, uint8_t const * matches, int matchCount) {
        const int sequential_bonus = 15;            // bonus for adjacent matches
        const int separator_bonus = 30;             // bonus if match occurs after a separator
        const int camel_bonus = 30;                 // bonus if match is uppercase and prev is lower
        const int first_letter_bonus = 15;          // bonus if the first letter is matched

        const int leading_letter_penalty = -5;      // penalty applied for every letter in str before the first match
        const int max_leading_letter_penalty = -15; // maximum penalty for leading letters
        const int unmatched_letter_penalty = -1;    // penalty for every letter that doesn't match

        int score = 100;

        // Leading letter penalty, capped so late matches are not punished without bound
        int penalty = leading_letter_penalty * matches[0];
        if (penalty < max_leading_letter_penalty)
            penalty = max_leading_letter_penalty;
        score += penalty;

        // Unmatched letter penalty
        score += unmatched_letter_penalty * (strLength - matchCount);

        // Ordering bonuses
        for (int i = 0; i < matchCount; ++i) {
            const int currIdx = matches[i];

            // Sequential: this match directly follows the previous one
            if (i > 0 && currIdx == matches[i - 1] + 1)
                score += sequential_bonus;

            if (currIdx > 0) {
                const unsigned char neighbor = static_cast<unsigned char>(strBegin[currIdx - 1]);
                const unsigned char curr = static_cast<unsigned char>(strBegin[currIdx]);

                // Camel case
                if (::islower(neighbor) && ::isupper(curr))
                    score += camel_bonus;

                // Separator
                if (neighbor == '_' || neighbor == ' ')
                    score += separator_bonus;
            }
            else {
                // First letter
                score += first_letter_bonus;
            }
        }

        return score;
    }

} // namespace fuzzy_internal

// Public interface

// Returns true if each character in pattern is found sequentially within str
// (case-insensitive). An empty pattern matches every string.
static bool fuzzy_match_simple(char const * pattern, char const * str) {
    while (*pattern != '\0' && *str != '\0') {
        if (fuzzy_internal::chars_equal_ignore_case(*pattern, *str))
            ++pattern;
        ++str;
    }

    return *pattern == '\0';
}

// Scored match that also reports the matched positions.
//   matches    - caller buffer receiving the position in str of each matched
//                pattern character
//   maxMatches - number of entries available in `matches`
// Returns true and sets outScore when the pattern matches; returns false
// (outScore unspecified) otherwise, including for an empty pattern or str.
// Positions are stored as uint8_t, so positions past 255 are truncated.
static bool fuzzy_match(char const * pattern, char const * str, int & outScore, uint8_t * matches, int maxMatches) {
    int recursionCount = 0;
    const int recursionLimit = 10;

    return fuzzy_internal::fuzzy_match_recursive(pattern, str, outScore, str, nullptr, matches, maxMatches, 0, recursionCount, recursionLimit);
}

// Scored match for callers that do not need the matched positions.
static bool fuzzy_match(char const * pattern, char const * str, int & outScore) {
    uint8_t matches[fuzzy_internal::max_match_count];
    return fuzzy_match(pattern, str, outScore, matches, fuzzy_internal::max_match_count);
}

// Private implementation
namespace fuzzy_internal {

    // Matches `pattern` against `str` greedily, and at every matching character
    // also recurses on the alternative that skips that character, keeping
    // whichever candidate scores highest.
    //   strBegin       - start of the full string (positions are relative to it)
    //   srcMatches     - positions already matched by the caller, or nullptr
    //   matches        - output buffer for this call's positions
    //   maxMatches     - entries available in `matches`
    //   nextMatch      - number of pattern characters matched so far
    //   recursionCount - shared call counter; the search gives up once it
    //                    reaches recursionLimit
    static bool fuzzy_match_recursive(const char * pattern, const char * str, int & outScore,
        const char * strBegin, uint8_t const * srcMatches, uint8_t * matches, int maxMatches,
        int nextMatch, int & recursionCount, int recursionLimit)
    {
        // Count recursions
        ++recursionCount;
        if (recursionCount >= recursionLimit)
            return false;

        // Detect end of strings
        if (*pattern == '\0' || *str == '\0')
            return false;

        // Best result among the "skip this match" alternatives
        bool recursiveMatch = false;
        uint8_t bestRecursiveMatches[max_match_count];
        int bestRecursiveScore = 0;

        // Loop through pattern and str looking for a match
        bool firstMatch = true;
        while (*pattern != '\0' && *str != '\0') {

            if (chars_equal_ignore_case(*pattern, *str)) {

                // Supplied matches buffer was too short
                if (nextMatch >= maxMatches)
                    return false;

                // Copy the caller's positions into our buffer lazily, on the first match only
                if (firstMatch && srcMatches) {
                    memcpy(matches, srcMatches, static_cast<size_t>(nextMatch));
                    firstMatch = false;
                }

                // Recursive call that "skips" this match
                uint8_t recursiveMatches[max_match_count];
                int recursiveScore = 0;
                if (fuzzy_match_recursive(pattern, str + 1, recursiveScore, strBegin, matches, recursiveMatches,
                        max_match_count, nextMatch, recursionCount, recursionLimit)) {

                    // Pick best recursive score
                    if (!recursiveMatch || recursiveScore > bestRecursiveScore) {
                        memcpy(bestRecursiveMatches, recursiveMatches, static_cast<size_t>(max_match_count));
                        bestRecursiveScore = recursiveScore;
                    }
                    recursiveMatch = true;
                }

                // Advance
                matches[nextMatch++] = (uint8_t)(str - strBegin);
                ++pattern;
            }
            ++str;
        }

        // Determine if full pattern was matched
        const bool matched = (*pattern == '\0');

        // Calculate score
        if (matched) {
            const char * strEnd = str;
            while (*strEnd != '\0')
                ++strEnd;

            outScore = score_match(strBegin, static_cast<int>(strEnd - strBegin), matches, nextMatch);
        }

        // Return best result
        if (recursiveMatch && (!matched || bestRecursiveScore > outScore)) {
            // Recursive score is better than "this". The source buffer holds only
            // max_match_count entries, so never copy more than that even when the
            // caller's buffer is larger.
            const int copyCount = maxMatches < max_match_count ? maxMatches : max_match_count;
            memcpy(matches, bestRecursiveMatches, static_cast<size_t>(copyCount));
            outScore = bestRecursiveScore;
            return true;
        }

        // Either "this" score is the best one, or there is no match at all
        return matched;
    }

} // namespace fuzzy_internal

} // namespace fts
|
||
#endif // FTS_FUZZY_MATCH_IMPLEMENTATION | ||
|
||
#endif // FTS_FUZZY_MATCH_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.