forked from pelias/api
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfidenceScore.js
273 lines (236 loc) · 8.36 KB
/
confidenceScore.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/**
*
* Basic confidence score should be computed and returned for each item in the results.
* The score should range between 0-1, and take into consideration as many factors as possible.
*
* Some factors to consider:
*
* - number of results from ES
* - score of item within the range of highest-lowest scores from ES (within the returned set)
* - linguistic match of query
* - detection (or specification) of query type. i.e. an address shouldn't match an admin address.
*/
const _ = require('lodash');
const stats = require('stats-lite');
const logger = require('pelias-logger').get('api');
const field = require('../helper/fieldValue');
// Module-level switch read by computeConfidenceScore: when true, a hit's
// `_score` relative to the mean/stdev of the result set contributes to its
// confidence. Defaults to true until setup() is given a config that says otherwise.
let RELATIVE_SCORES = true;

/**
 * Configure the module and return the scoring function.
 *
 * When a config object is supplied, its own `relativeScores` property (if any)
 * becomes the new value of RELATIVE_SCORES; a config without that property
 * resets the switch to `true`. A nil config leaves the current value untouched.
 *
 * @param {object} [peliasConfig] optional config; only `relativeScores` is read
 * @returns {function} computeScores
 */
function setup(peliasConfig) {
  if (!_.isNil(peliasConfig)) {
    // _.has does an own-property check without invoking a method on the config
    // object itself, so null-prototype configs (or ones shadowing
    // `hasOwnProperty`) no longer throw.
    RELATIVE_SCORES = _.has(peliasConfig, 'relativeScores') ? peliasConfig.relativeScores : true;
  }
  return computeScores;
}
/**
 * Attach a confidence value to every item in `res.data`.
 *
 * Hands control straight to `next` (returning its result) when `req.clean`,
 * `res`, `res.data` or `res.meta` is undefined, or when the query type is
 * anything other than `search_pelias_parser`.
 *
 * @param {object} req request; `req.clean` is forwarded to the per-hit scorer
 * @param {object} res response carrying `data` (hits) and `meta` (`query_type`, `scores`)
 * @param {function} next continuation callback
 */
function computeScores(req, res, next) {
  const incomplete = _.isUndefined(req.clean) || _.isUndefined(res) ||
    _.isUndefined(res.data) || _.isUndefined(res.meta);

  if (incomplete || res.meta.query_type !== 'search_pelias_parser') {
    return next();
  }

  // distribution of scores across the whole result set
  const allScores = res.meta.scores;
  const stdev = computeStandardDeviation(allScores);
  const mean = stats.mean(allScores);

  // score each hit against that distribution and the request text
  res.data = res.data.map((hit) => computeConfidenceScore(req, mean, stdev, hit));

  next();
}
/**
 * Work out how confident we are that a single hit is the right answer and
 * store the result on `hit.confidence`.
 *
 * A deal breaker pins the confidence at 0.5. Otherwise the confidence is the
 * average of the name, query-type and address checks, plus the two
 * score-distribution checks when RELATIVE_SCORES is enabled, rounded to
 * three decimal places.
 *
 * @param {object} req
 * @param {number} mean
 * @param {number} stdev
 * @param {object} hit mutated in place
 * @returns {object} the same `hit`, now carrying `confidence`
 */
function computeConfidenceScore(req, mean, stdev, hit) {
  if (checkForDealBreakers(req, hit)) {
    hit.confidence = 0.5;
    return hit;
  }

  const partials = [];

  if (RELATIVE_SCORES) {
    partials.push(checkDistanceFromMean(hit._score, mean, stdev));
    partials.push(computeZScore(hit._score, mean, stdev));
  }

  partials.push(checkName(req.clean.text, req.clean.parsed_text, hit));
  partials.push(checkQueryType(req.clean.parsed_text, hit));
  partials.push(checkAddress(req.clean.parsed_text, hit));

  // TODO: look at categories and location

  const total = partials.reduce((sum, part) => sum + part, 0);
  const average = total / partials.length;

  hit.confidence = Number(average.toFixed(3));
  return hit;
}
/**
 * Check for clearly mismatching properties in a result.
 * State (compared with the first `parent.region_a` entry) and postal code
 * (compared with `address_parts.zip`) are checked when both the parsed query
 * and the hit provide them.
 *
 * @param {object} req request whose `clean.parsed_text` holds the parsed query
 * @param {object} hit
 * @returns {boolean} true when a deal breaker was found, false otherwise
 */
function checkForDealBreakers(req, hit) {
  const parsedText = req.clean.parsed_text;

  // nothing parsed (undefined or null) means nothing to contradict
  if (_.isNil(parsedText)) {
    return false;
  }

  if (!_.isNil(parsedText.state) && !_.isNil(hit.parent) &&
      hit.parent.region_a && parsedText.state !== hit.parent.region_a[0]) {
    logger.debug('[confidence][deal-breaker]: state !== region_a');
    return true;
  }

  if (!_.isNil(parsedText.postalcode) && !_.isNil(hit.address_parts) &&
      parsedText.postalcode !== hit.address_parts.zip) {
    return true;
  }

  // always hand back a real boolean rather than falling off the end
  return false;
}
/**
 * Flag whether a result's score stands out from the rest of the set:
 * 1 when it lies more than one standard deviation above the mean, else 0.
 *
 * @param {number} score
 * @param {number} mean
 * @param {number} stdev
 * @returns {number} 1 or 0
 */
function checkDistanceFromMean(score, mean, stdev) {
  const distance = score - mean;
  if (distance > stdev) {
    return 1;
  }
  return 0;
}
/**
 * Compare the query against the result's default name, ignoring case.
 * The parsed name is tried first (when there is one), then the raw text.
 *
 * @param {string} text
 * @param {object|undefined} parsed_text
 * @param {object} hit
 * @returns {number} 1 on an exact (case-insensitive) match, otherwise 0.7
 */
function checkName(text, parsed_text, hit) {
  // evaluated lazily so each comparison reads the name exactly when it needs it
  const defaultName = () => field.getStringValue(hit.name.default).toLowerCase();

  // the parsed name is the cleaner property, so it gets the first shot
  const hasParsedName = !_.isNil(parsed_text) && !_.isNil(parsed_text.name);
  if (hasParsedName && defaultName() === parsed_text.name.toLowerCase()) {
    return 1;
  }

  // otherwise fall back to the text exactly as the user supplied it
  if (defaultName() === text.toLowerCase()) {
    return 1;
  }

  // no match: a lenient score, since an exact name hit was a longshot anyway
  return 0.7;
}
/**
 * When the parsed query carries a house number, make sure the result has one too.
 *
 * @param {object|undefined} text parsed query text
 * @param {object} hit
 * @returns {number} 0 when a house number was asked for but the hit has
 *   `address_parts` undefined or lacks `address_parts.number`; 1 otherwise
 */
function checkQueryType(text, hit) {
  const wantsHousenumber = !_.isNil(text) && !_.isNil(text.housenumber);
  if (!wantsHousenumber) {
    return 1;
  }

  const parts = hit.address_parts;

  // no address parts at all
  if (_.isUndefined(parts)) {
    return 0;
  }

  // a null address_parts is not treated as a mismatch
  if (_.isNil(parts)) {
    return 1;
  }

  return _.isUndefined(parts.number) ? 0 : 1;
}
/**
 * Determine the quality of the property match between the parsed query and a hit.
 *
 * "Missing" means strictly `undefined`; "present" means neither `undefined`
 * nor `null`.
 * NOTE(review): a `null` on either side is therefore neither missing nor
 * present and falls through to the 0.7 default. checkAddress passes `null`
 * for absent hit values, so confirm whether that is intended before changing it.
 *
 * @param {string|number|undefined|null} textProp value from the parsed query
 * @param {string|number|undefined|null} hitProp value from the result
 * @param {boolean} [expectEnriched] whether the result is expected to supply
 *   this value even when the query did not; `false`/omitted means not expected
 * @returns {number} 0, 0.5, 0.7 or 1
 */
function propMatch(textProp, hitProp, expectEnriched) {
  const textMissing = _.isUndefined(textProp);
  const hitMissing = _.isUndefined(hitProp);
  const textPresent = !_.isNil(textProp);
  const hitPresent = !_.isNil(hitProp);

  // honour the boolean as documented: an explicit `false` must NOT count as
  // "enrichment expected" (a plain non-nil check would treat it as true)
  const enrichmentExpected = Boolean(expectEnriched);

  // both missing: BAD if the result should have been enriched, GOOD otherwise
  if (textMissing && hitMissing) {
    return enrichmentExpected ? 0 : 1;
  }

  // text has it, result doesn't => BAD
  if (textPresent && hitMissing) {
    return 0;
  }

  // text missing, result has it: GOOD when enrichment is expected, else 50/50
  if (textMissing && hitPresent) {
    return enrichmentExpected ? 1 : 0.5;
  }

  // both present, values match (case-insensitive string comparison) => GREAT
  if (textPresent && hitPresent &&
      textProp.toString().toLowerCase() === hitProp.toString().toLowerCase()) {
    return 1;
  }

  // ¯\_(ツ)_/¯
  return 0.7;
}
/**
 * Score how well the address components of the parsed query line up with a hit.
 *
 * Only runs for address queries (both house number and street parsed);
 * anything else scores 1. For address queries the result is the average of
 * five propMatch comparisons: house number, street, postal code, state and country.
 *
 * @param {object} text
 * @param {string|number} [text.housenumber]
 * @param {string} [text.street]
 * @param {string} [text.postalcode]
 * @param {string} [text.state]
 * @param {string} [text.country]
 * @param {object} hit
 * @param {object} [hit.address_parts]
 * @param {string|number} [hit.address_parts.number]
 * @param {string} [hit.address_parts.street]
 * @param {string|number} [hit.address_parts.zip]
 * @param {Array} [hit.parent.region_a]
 * @param {Array} [hit.parent.country_a]
 * @returns {number}
 */
function checkAddress(text, hit) {
  const isAddressQuery = !_.isNil(text) && !_.isNil(text.housenumber) && !_.isNil(text.street);
  if (!isAddressQuery) {
    return 1;
  }

  // absent hit values are handed to propMatch as null
  const parts = hit.address_parts;
  const parent = hit.parent;
  const region = (parent && parent.region_a) ? parent.region_a[0] : null;
  const country = (parent && parent.country_a) ? parent.country_a[0] : null;

  const partials = [
    propMatch(text.housenumber, (parts ? parts.number : null), false),
    propMatch(text.street, (parts ? parts.street : null), false),
    propMatch(text.postalcode, (parts ? parts.zip : null), true),
    propMatch(text.state, region, true),
    propMatch(text.country, country, true)
  ];

  const total = partials.reduce((sum, part) => sum + part, 0);
  return total / partials.length;
}
/**
 * Turn a score into a normalized z-score contribution in the range 0-1.
 *
 * z-scores have an effective range of -3.00 to +3.00.
 * An average z-score is ZERO.
 * A negative z-score indicates that the item/element is below
 * average and a positive z-score means that the item/element
 * is above average. When teachers say they are going to "curve"
 * the test, they do this by computing z-scores for the students' test scores.
 *
 * @param {number} score
 * @param {number} mean
 * @param {number} stdev
 * @returns {number} 0 when stdev is below 0.01, otherwise a value within [0, 1]
 */
function computeZScore(score, mean, stdev) {
  // a (near-)zero spread makes the z-score meaningless
  if (stdev < 0.01) {
    return 0;
  }

  const zScore = (score - mean) / stdev;

  // because the effective range of z-scores is -3.00 to +3.00
  // add 10 to ensure a positive value, and then divide by 10+3+3
  // to further normalize to %-like result
  const normalized = (zScore + 10) / 16;

  // an extreme outlier can push the z-score past the nominal range
  // (above 6 or below -10), so clamp to keep the contribution within 0-1
  return Math.min(1, Math.max(0, normalized));
}
/**
 * Standard deviation of the given scores, as reported by `stats.stdev`,
 * with anything below 0.01 flattened to exactly 0.
 *
 * @param {Array} scores
 * @returns {number}
 */
function computeStandardDeviation(scores) {
  const deviation = stats.stdev(scores);

  // a tiny spread is treated as no spread at all
  if (deviation < 0.01) {
    return 0;
  }
  return deviation;
}
// Export the setup factory: calling it (optionally with a config object)
// returns the computeScores (req, res, next) function.
module.exports = setup;