Skip to content

Commit

Permalink
Adds support for chaining tokenizers. Fixes #54
Browse files Browse the repository at this point in the history
  • Loading branch information
mayurmadnani committed Oct 14, 2021
1 parent 933978f commit c55ea07
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,11 @@ public static Stream<Token<String>> getNGramTokens(int size, Element element) {
} else {
throw new MatchException("Unsupported data type");
}
return Utils.getNGrams(elementValueStr, size)
.map(str -> new Token<String>(str, element));
return Utils.getNGrams(elementValueStr, size).map(str -> new Token<String>(str, element));

}

/**
 * Chains multiple tokenizer functions into a single tokenizer.
 *
 * <p>The returned function applies every supplied tokenizer to the given element, in the
 * order provided, and concatenates their token streams via {@link java.util.stream.Stream#flatMap}.
 *
 * @param tokenizers one or more tokenizer functions to apply; must not be null
 * @return a tokenizer function producing the concatenation of all tokenizers' output streams
 */
@SafeVarargs // safe: the varargs array is only read, never stored or exposed
public static Function<Element<String>, Stream<Token<String>>> chainTokenizers(Function<Element<String>, Stream<Token<String>>>... tokenizers) {
    return element -> Arrays.stream(tokenizers).flatMap(fun -> fun.apply(element));
}
}
32 changes: 32 additions & 0 deletions src/test/java/com/intuit/fuzzymatcher/domain/ElementTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,38 @@ public void itShouldSetTokenizerFunction() {

}

@Test
public void itShouldNotMatchPhoneticWordsWithCustomTokenizerFunction() {
    // "bold" and "bolt" share a Soundex code, so a pure phonetic tokenizer treats them as equal.
    List<String> names = Arrays.asList("bold", "bolt");

    List<Document> soundexDocs = getDocuments(names, TokenizerFunction.wordSoundexEncodeTokenizer());
    Map<Document, List<Match<Document>>> soundexResult = matchService.applyMatch(soundexDocs);
    Assert.assertEquals(2, soundexResult.size());
    Assert.assertEquals(1.0, soundexResult.get(soundexDocs.get(0)).get(0).getResult(), .01);

    // Chaining word, soundex, and trigram tokenizers dilutes the phonetic overlap
    // below the match threshold, so no matches are reported.
    List<Document> chainedDocs = getDocuments(names, TokenizerFunction.chainTokenizers(TokenizerFunction.wordTokenizer(), TokenizerFunction.wordSoundexEncodeTokenizer(), TokenizerFunction.triGramTokenizer()));
    Map<Document, List<Match<Document>>> chainedResult = matchService.applyMatch(chainedDocs);
    Assert.assertEquals(0, chainedResult.size());
}

@Test
public void itShouldMatchUnequalWordsWithCustomTokenizerFunction() {
    // "Mario" and "Marieo" are distinct words, so exact word tokenization finds no match.
    List<String> names = Arrays.asList("Mario", "Marieo");

    List<Document> documents1 = getDocuments(names, TokenizerFunction.wordTokenizer());

    Map<Document, List<Match<Document>>> result1 = matchService.applyMatch(documents1);
    Assert.assertEquals(0, result1.size());

    // Chaining soundex and trigram tokenizers surfaces the phonetic/substring similarity.
    List<Document> documents2 = getDocuments(names, TokenizerFunction.chainTokenizers(TokenizerFunction.wordSoundexEncodeTokenizer(), TokenizerFunction.triGramTokenizer()));

    Map<Document, List<Match<Document>>> result2 = matchService.applyMatch(documents2);
    Assert.assertEquals(2, result2.size());
    // Fix: query result2 with a key from documents2 — the original used documents1.get(0),
    // which only worked via incidental Document equality across the two lists.
    Assert.assertEquals(0.6, result2.get(documents2.get(0)).get(0).getResult(), .01);
}

private List<Document> getDocuments(List<String> names, Function tokenizerFunction) {
AtomicInteger counter = new AtomicInteger();
return names.stream().map(name -> {
Expand Down

0 comments on commit c55ea07

Please sign in to comment.