forked from pelias/api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdedupe.js
107 lines (86 loc) · 3.96 KB
/
dedupe.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
const logger = require('pelias-logger').get('api');
const _ = require('lodash');
const isDifferent = require('../helper/diffPlaces').isDifferent;
const layerPreferences = require('../helper/diffPlaces').layerPreferences;
const canonical_sources = require('../helper/type_mapping').canonical_sources;
const field = require('../helper/fieldValue');
// build a short single-line label for a hit, used in the debug logs
// the label is made up of the default name, the zip and the _id (in that
// order), separated by single spaces; any part which is falsy is left out
const formatLog = (hit) => {
  const parts = [
    field.getStringValue(_.get(hit, 'name.default')),
    field.getStringValue(_.get(hit, 'address_parts.zip')),
    hit._id
  ];
  return parts.filter(Boolean).join(' ');
};
// middleware which removes duplicate records from $res.data
//
// every pair of results is compared using isDifferent(); when a pair is
// considered to be duplicates, isPreferred() decides which of the two is kept.
// the remaining results are truncated to $req.clean.size (when it is set) and
// written back to $res.data before next() is called.
//
// params:
//   req  - request; $req.clean must be a plain object, $req.clean.text is only
//          used for logging, $req.clean.lang.iso6393 is passed to isDifferent()
//   res  - response; $res.data must be a non-empty array for any work to be done
//   next - callback invoked once this middleware is done
function dedupeResults(req, res, next) {

  // do nothing if request data is invalid
  // note: _.get() is used so that a missing $req is a no-op rather than a TypeError
  if( !_.isPlainObject(_.get(req, 'clean')) ){ return next(); }

  // do nothing if result data is missing or empty
  const hits = _.get(res, 'data');
  if( !_.isArray(hits) || _.isEmpty(hits) ){ return next(); }

  // use the user agent language to improve deduplication
  const lang = _.get(req, 'clean.lang.iso6393');

  // maintain a set of inferior records
  const inferior = new Set();

  // maintain a set of superior records
  // note: this set maintains ordering of synonymous records
  // while also preventing duplicates.
  const superior = new Set();

  for (let i = 0; i < hits.length; i++) {
    for (let j = (i+1); j < hits.length; j++) {

      // ensure these two records are considered duplicates
      if (isDifferent(hits[i], hits[j], lang)) { continue; }

      // decide which of the two records was 'inferior'
      // note: $preference equals true when $j is preferred and vice versa
      const preference = isPreferred(hits[i], hits[j]);
      const winner = preference ? hits[j] : hits[i];
      const loser = preference ? hits[i] : hits[j];

      // note: the winner is added before $i itself (below), so a preferred
      // record takes the position of the record it replaces
      superior.add(winner);
      inferior.add(loser);

      // logging
      logger.debug('[dupe][replacing]', {
        query: req.clean.text,
        superior: formatLog(winner),
        inferior: formatLog(loser),
      });
    }

    // a record which lost a comparison is still added here, it is
    // filtered out below because it is also a member of $inferior
    superior.add(hits[i]);
  }

  // remove inferior records, return the remaining results
  const result = Array.from(superior).filter(v => !inferior.has(v));

  // note: when $size is undefined, slice() returns all remaining results
  const maxElements = _.get(req, 'clean.size');
  res.data = result.slice(0, maxElements);

  next();
}
// return true if the second argument represents a hit which is preferred
// to the hit in the first argument
//
// the rules below are evaluated in order, the first rule to reach a verdict wins:
//  1. postcode: a hit which has 'address_parts.zip' beats a hit which does not
//  2. source type: a hit from a source not listed in $canonical_sources beats
//     a hit from a source which is listed
//  3. layer: when the layers differ, the hit whose layer appears first in
//     $layerPreferences wins
//  4. source: pairwise preferences between specific sources
// when no rule reaches a verdict the existing hit is kept (returns false)
function isPreferred(existingHit, candidateHit) {

  // prefer a record with a postcode
  // https://github.com/pelias/api/issues/872
  const existingHasZip = _.has(existingHit, 'address_parts.zip');
  const candidateHasZip = _.has(candidateHit, 'address_parts.zip');
  if( existingHasZip !== candidateHasZip ){ return candidateHasZip; }

  // prefer non-canonical sources over canonical ones
  // note: this rule is applied in both directions (as with the postcode rule
  // above) so that a non-canonical existing hit cannot subsequently lose to a
  // canonical candidate via the layer rule, which would make the verdict
  // depend on the order of the arguments.
  const existingIsCanonical = _.includes(canonical_sources, existingHit.source);
  const candidateIsCanonical = _.includes(canonical_sources, candidateHit.source);
  if( existingIsCanonical !== candidateIsCanonical ){ return existingIsCanonical; }

  // prefer certain layers over others
  // note: $layerPreferences is scanned in order, whichever hit matches first wins
  if( existingHit.layer !== candidateHit.layer && _.isArray( layerPreferences ) ){
    for( const layer of layerPreferences ){
      if( existingHit.layer === layer ){ return false; }
      if( candidateHit.layer === layer ){ return true; }
    }
  }

  // prefer certain sources over others
  if( existingHit.source !== candidateHit.source ){
    switch( existingHit.source ){
      // WOF has bbox and is generally preferred
      case 'geonames':
        return candidateHit.source === 'whosonfirst' || candidateHit.source === 'openstreetmap';
      // addresses are generally better in OA
      case 'openstreetmap':
        return candidateHit.source === 'openaddresses';
      // venues are better in OSM than WOF
      case 'whosonfirst':
        return candidateHit.source === 'openstreetmap';
    }
  }

  // no preference, keep existing hit
  return false;
}
// middleware factory: calling the exported function returns the
// dedupeResults(req, res, next) middleware
function setup() {
  return dedupeResults;
}

module.exports = setup;