Skip to content

Commit

Permalink
intuit#35 fix DateMatch with NeighborhoodRange greater than 0.91 failing
Browse files Browse the repository at this point in the history
  • Loading branch information
agajurel committed Oct 2, 2020
1 parent d2ce6f6 commit 5439fae
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 7 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ Some commonly used are already available.
* __Match Type__ : Allows 2 types of matches, which can be applied to each `Element`
* _Equality_: Uses exact matches with token values.
* _Nearest Neighbor_: Finds tokens that are contained in the neighborhood range, that can be specified as a
probability (0.0 - 1.0) for each element. It defaults to 0.9
probability (0.0 - 1.0) for each element. It defaults to 0.99 for Dates and 0.9 otherwise.

* __Scoring__ : These are defined for `Element` and `Document` matches
* _Element scoring_: Uses a simple average, where for each element the matching token is divided by the total tokens.
Expand Down Expand Up @@ -160,7 +160,7 @@ _Note: Since each element is unique in the way it should match, if you need to m
* __TokenizerFunction__: Override The _TokenizerFunction_ function defined by Type
* __MatchType__: Override the MatchType defined by Type
* __NeighborhoodRange__: Relevant only for `NEAREST_NEIGHBORS` MatchType. Defines how close the `Value` should be to be considered a match.
Accepted values between 0.0 - 1.0 (defaults to 0.9)
Accepted values between 0.0 - 1.0 (defaults to 0.99 for Dates and 0.9 otherwise)

### Match Service
It supports 3 ways to match the documents
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ private class TokenRange {

private final Object lower;
private final Object higher;
private static final double DATE_SCALE_FACTOR = 1.1;
private static final double DATE_SCALE_FACTOR = 0.9;


TokenRange(Token token, double pct) {
Expand All @@ -104,8 +104,8 @@ private class TokenRange {
this.lower = getLower((Float) value, pct).floatValue();
this.higher = getHigher((Float) value, pct).floatValue();
} else if (value instanceof Date) {
this.lower = new Date(getLower(((Date) value).getTime(), pct * DATE_SCALE_FACTOR).longValue());
this.higher = new Date(getHigher(((Date) value).getTime(), pct * DATE_SCALE_FACTOR).longValue());
this.lower = new Date(getLower(((Date) value).getTime(), pct).longValue());
this.higher = new Date(getHigher(((Date) value).getTime(), pct).longValue());
} else {
throw new MatchException("Data Type not supported");
}
Expand Down
11 changes: 9 additions & 2 deletions src/main/java/com/intuit/fuzzymatcher/domain/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* <li>type - The ElementType for the value. This determines the functions applied at different steps of the match</li>
* <li>weight - Used in scoring function to increase the Document score for an Element. Default is 1.0 for all elements</li>
* <li>threshold - Value above which elements are considered a match, default 0.3</li>
* <li>neighborhoodRange - Relevant for NEAREST_NEIGHBORS MatchType. Defines how close should the value be, to be considered a match (default 0.9) </li>
* <li>neighborhoodRange - Relevant for NEAREST_NEIGHBORS MatchType. Defines how close the value should be to be considered a match (default 0.99 for dates, 0.9 otherwise) </li>
* <li>preProcessFunction - Function to pre-process the value. If this is not set, the function defined in ElementType is used </li>
* <li>tokenizerFunction - Function to break values into tokens. If this is not set, the function defined in ElementType is used </li>
* <li>matchType - MatchType used. If this is not set, the type defined in ElementType is used </li>
Expand Down Expand Up @@ -168,7 +168,7 @@ public static class Builder<T> {
private T value;
private double weight = 1.0;
private double threshold = 0.3;
private double neighborhoodRange = 0.9;
private Double neighborhoodRange;
private Function<T, T> preProcessFunction;
private MatchType matchType;

Expand Down Expand Up @@ -222,6 +222,13 @@ public Builder setMatchType(MatchType matchType) {


/**
 * Builds an {@code Element} from the values configured on this builder.
 *
 * <p>When no neighborhood range was set explicitly, a default is chosen from the
 * element type: 0.99 for {@code ElementType.DATE} and 0.9 for every other type.
 * The default is resolved into a local variable so the builder's own state is not
 * modified; a builder reused after its type changes therefore picks the default
 * that matches the new type instead of keeping a stale one.
 *
 * @return a new element carrying the configured (or defaulted) settings
 */
public Element createElement() {
    final double effectiveNeighborhoodRange;
    if (neighborhoodRange != null) {
        // Caller supplied an explicit range; use it as-is.
        effectiveNeighborhoodRange = neighborhoodRange;
    } else if (type == ElementType.DATE) {
        effectiveNeighborhoodRange = 0.99;
    } else {
        effectiveNeighborhoodRange = 0.9;
    }
    return new Element<T>(type, variance, value, weight, threshold, effectiveNeighborhoodRange, preProcessFunction, tokenizerFunction, matchType);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,14 @@ public void itShouldApplyMatchWithDate() {
Assert.assertEquals(2, result.size());
}

@Test
public void itShouldApplyMatchWithDateForHighNeighborhoodRange() {
    // A 0.999 neighborhood range corresponds to a window of roughly 18 days.
    final double highNeighborhoodRange = 0.999;
    List<Object> sampleDates = Arrays.asList(
            getDate("01/01/2020"),
            getDate("01/02/2020"),
            getDate("02/01/2019"));

    List<Document> documents = getTestDocuments(sampleDates, DATE, highNeighborhoodRange);
    Map<Document, List<Match<Document>>> matchesByDocument = matchService.applyMatch(documents);

    // Two documents are expected to appear as keys in the match result.
    Assert.assertEquals(2, matchesByDocument.size());
}


private List<Document> getTestDocuments(List<Object> values, ElementType elementType, Double neighborhoodRange) {
AtomicInteger ai = new AtomicInteger(0);
Expand Down

0 comments on commit 5439fae

Please sign in to comment.