Skip to content

Commit

Permalink
Merge pull request #58 from mayurmadnani/chain-tokenizers
Browse files Browse the repository at this point in the history
Adds support for chaining tokenizers. Fixes #54
  • Loading branch information
manishobhatia authored Oct 14, 2021
2 parents 933978f + 99b0698 commit 0d7b9d8
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,11 @@ public static Stream<Token<String>> getNGramTokens(int size, Element element) {
} else {
throw new MatchException("Unsupported data type");
}
return Utils.getNGrams(elementValueStr, size)
.map(str -> new Token<String>(str, element));
return Utils.getNGrams(elementValueStr, size).map(str -> new Token<String>(str, element));

}

/**
 * Combines several tokenizer functions into a single tokenizer.
 *
 * <p>The returned function applies every supplied tokenizer to the same element, in the
 * order given, and concatenates the resulting token streams into one stream. When no
 * tokenizers are supplied the returned function produces an empty stream.
 *
 * <p>The varargs array is only read, never written to or handed out, so the method is
 * annotated {@code @SafeVarargs} to spare callers the unchecked generic-array warning.
 *
 * @param tokenizers the tokenizer functions to apply to each element
 * @return a function that yields the tokens of all {@code tokenizers} for an element
 */
@SafeVarargs
public static Function<Element<String>, Stream<Token<String>>> chainTokenizers(Function<Element<String>, Stream<Token<String>>>... tokenizers) {
    return element -> Arrays.stream(tokenizers).flatMap(tokenizer -> tokenizer.apply(element));
}
}
67 changes: 67 additions & 0 deletions src/test/java/com/intuit/fuzzymatcher/domain/ElementTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,73 @@ public void itShouldSetTokenizerFunction() {

}

/**
 * Checks "bold" / "bolt": with the Soundex tokenizer alone both documents get a match
 * scored 1.0 (within 0.01); with the word, Soundex and tri-gram tokenizers chained,
 * no matches are expected.
 */
@Test
public void itShouldNotMatchPhoneticWordsWithChainTokenizerFunction() {
    List<String> words = Arrays.asList("bold", "bolt");

    // Baseline run: Soundex tokenizer only.
    List<Document> soundexDocs = getDocuments(words, TokenizerFunction.wordSoundexEncodeTokenizer());
    Map<Document, List<Match<Document>>> soundexMatches = matchService.applyMatch(soundexDocs);
    Assert.assertEquals(2, soundexMatches.size());
    List<Match<Document>> firstDocMatches = soundexMatches.get(soundexDocs.get(0));
    Assert.assertEquals(1.0, firstDocMatches.get(0).getResult(), .01);

    // Chained run: word + Soundex + tri-gram tokens for every element.
    List<Document> chainedDocs = getDocuments(
            words,
            TokenizerFunction.chainTokenizers(
                    TokenizerFunction.wordTokenizer(),
                    TokenizerFunction.wordSoundexEncodeTokenizer(),
                    TokenizerFunction.triGramTokenizer()));
    Map<Document, List<Match<Document>>> chainedMatches = matchService.applyMatch(chainedDocs);
    Assert.assertEquals(0, chainedMatches.size());
}

/**
 * Checks "Caputo" / "Chabot": with the Soundex tokenizer alone both documents get a match
 * scored 1.0 (within 0.01); with the Soundex and tri-gram tokenizers chained, no matches
 * are expected.
 */
@Test
public void itShouldNotMatchPhoneticWordsWithChainTokenizerFunction2() {
    List<String> surnames = Arrays.asList("Caputo", "Chabot");

    // Baseline run: Soundex tokenizer only.
    List<Document> soundexDocs = getDocuments(surnames, TokenizerFunction.wordSoundexEncodeTokenizer());
    Map<Document, List<Match<Document>>> soundexMatches = matchService.applyMatch(soundexDocs);
    Assert.assertEquals(2, soundexMatches.size());
    List<Match<Document>> firstDocMatches = soundexMatches.get(soundexDocs.get(0));
    Assert.assertEquals(1.0, firstDocMatches.get(0).getResult(), .01);

    // Chained run: Soundex + tri-gram tokens for every element.
    List<Document> chainedDocs = getDocuments(
            surnames,
            TokenizerFunction.chainTokenizers(
                    TokenizerFunction.wordSoundexEncodeTokenizer(),
                    TokenizerFunction.triGramTokenizer()));
    Map<Document, List<Match<Document>>> chainedMatches = matchService.applyMatch(chainedDocs);
    Assert.assertEquals(0, chainedMatches.size());
}

/**
 * Checks "Mario" / "Marieo": the plain word tokenizer yields no matches, whereas chaining
 * the Soundex and tri-gram tokenizers matches both documents with a score of 0.6
 * (within 0.01).
 */
@Test
public void itShouldMatchUnequalWordsWithChainTokenizerFunction() {
    List<String> names = Arrays.asList("Mario", "Marieo");

    // Baseline run: whole-word tokens only.
    List<Document> documents1 = getDocuments(names, TokenizerFunction.wordTokenizer());

    Map<Document, List<Match<Document>>> result1 = matchService.applyMatch(documents1);
    Assert.assertEquals(0, result1.size());

    // Chained run: Soundex + tri-gram tokens for every element.
    List<Document> documents2 = getDocuments(names, TokenizerFunction
            .chainTokenizers(TokenizerFunction.wordSoundexEncodeTokenizer(), TokenizerFunction.triGramTokenizer()));

    Map<Document, List<Match<Document>>> result2 = matchService.applyMatch(documents2);
    Assert.assertEquals(2, result2.size());
    // Look up with a document from the run that produced result2, so the lookup does not
    // depend on Document equality across separately built document lists.
    Assert.assertEquals(0.6, result2.get(documents2.get(0)).get(0).getResult(), .01);
}

/**
 * Checks "Nikolau" / "Nikolaou": the plain word tokenizer yields no matches, whereas
 * chaining the word and tri-gram tokenizers matches both documents with a score of 0.58
 * (within 0.01).
 */
@Test
public void itShouldMatchUnequalWordsWithChainTokenizerFunction2() {
    List<String> names = Arrays.asList("Nikolau", "Nikolaou");

    // Baseline run: whole-word tokens only.
    List<Document> documents1 = getDocuments(names, TokenizerFunction.wordTokenizer());

    Map<Document, List<Match<Document>>> result1 = matchService.applyMatch(documents1);
    Assert.assertEquals(0, result1.size());

    // Chained run: word + tri-gram tokens for every element.
    List<Document> documents2 = getDocuments(names, TokenizerFunction.chainTokenizers(TokenizerFunction.wordTokenizer(), TokenizerFunction.triGramTokenizer()));

    Map<Document, List<Match<Document>>> result2 = matchService.applyMatch(documents2);
    Assert.assertEquals(2, result2.size());
    // Look up with a document from the run that produced result2, so the lookup does not
    // depend on Document equality across separately built document lists.
    Assert.assertEquals(0.58, result2.get(documents2.get(0)).get(0).getResult(), .01);
}

private List<Document> getDocuments(List<String> names, Function tokenizerFunction) {
AtomicInteger counter = new AtomicInteger();
return names.stream().map(name -> {
Expand Down

0 comments on commit 0d7b9d8

Please sign in to comment.