Skip to content

Commit

Permalink
refactor synonyms to be easier to modify, change some names for token…
Browse files Browse the repository at this point in the history
… filters to be more consistent
  • Loading branch information
missinglink committed Mar 21, 2018
1 parent 8bb01fa commit 1b40e4d
Show file tree
Hide file tree
Showing 11 changed files with 453 additions and 533 deletions.
20 changes: 20 additions & 0 deletions .jshintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"node": true,
"curly": true,
"eqeqeq": true,
"esversion": 6,
"freeze": true,
"immed": true,
"indent": 2,
"latedef": false,
"newcap": true,
"noarg": true,
"noempty": true,
"nonbsp": true,
"nonew": true,
"plusplus": false,
"undef": true,
"unused": false,
"maxparams": 4,
"maxdepth": 4
}
74 changes: 33 additions & 41 deletions settings.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
'use strict';

var fs = require('fs');
var path = require('path');
var merge = require('lodash.merge');
var peliasConfig = require('pelias-config');
var punctuation = require('./punctuation');
var street_suffix = require('./street_suffix');
var synonymFile = require('./synonyms/parser');

// load synonyms from disk
// the directory is resolved against this module's location (__dirname) rather
// than the process working directory, so loading works no matter where the
// process was started from (the same way require('./synonyms/parser') resolves).
// every '*.txt' file found becomes one entry in the map, keyed by its filename
// without the '.txt' extension; the value is whatever the parser returns for it.
var synonymsDir = path.join( __dirname, 'synonyms' );
var synonyms = fs.readdirSync( synonymsDir )
  .sort()
  .filter( f => /\.txt$/.test( f ) )
  .reduce(( acc, cur ) => {
    // strip only the trailing extension, matching the filter above
    acc[ cur.replace( /\.txt$/, '' ) ] = synonymFile(
      path.join( synonymsDir, cur )
    );
    return acc;
  }, {});

// run the generated pelias config through the validator as soon as this module is loaded
// NOTE(review): presumably validate() throws on an invalid config — confirm in ./configValidation
require('./configValidation').validate(peliasConfig.generate());

Expand Down Expand Up @@ -102,8 +115,8 @@ function generate(){
"icu_folding",
"trim",
"ampersand",
"street_synonym",
"direction_synonym",
"street_suffix_contractions",
"directionals",
"unique",
"notnull"
]
Expand Down Expand Up @@ -139,9 +152,9 @@ function generate(){
"lowercase",
"icu_folding",
"remove_duplicate_spaces",
].concat( street_suffix.synonyms.map( function( synonym ){
].concat( synonyms.street_suffix_contractions.map( function( synonym ){
return "keyword_street_suffix_" + synonym.split(' ')[0];
})).concat( street_suffix.direction_synonyms.map( function( synonym ){
})).concat( synonyms.directionals.map( function( synonym ){
return "keyword_compass_" + synonym.split(' ')[0];
})).concat([
"remove_ordinals",
Expand All @@ -150,10 +163,6 @@ function generate(){
}
},
"filter" : {
"ampersand" :{
"type": "synonym",
"synonyms": [ "and => &" ]
},
"notnull" :{
"type" : "length",
"min" : 1
Expand All @@ -173,30 +182,6 @@ function generate(){
"pattern" : "^(0*)",
"replacement" : ""
},
"address_stop": {
"type": "stop",
"stopwords": street_suffix.terms
},
"street_synonym": {
"type": "synonym",
"synonyms": street_suffix.synonyms
},
"partial_token_address_suffix_expansion": {
"type": "synonym",
"synonyms": street_suffix.partial_token_safe_expansions
},
"full_token_address_suffix_expansion": {
"type": "synonym",
"synonyms": street_suffix.full_token_safe_expansions
},
"direction_synonym": {
"type": "synonym",
"synonyms": street_suffix.direction_synonyms
},
"direction_synonym_contraction_keep_original": {
"type": "synonym",
"synonyms": street_suffix.direction_synonyms_keep_original
},
"remove_ordinals" : {
"type" : "pattern_replace",
"pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))",
Expand Down Expand Up @@ -276,6 +261,13 @@ function generate(){
}
};

for( var key in synonyms ){
settings.analysis.filter[ key ] = {
"type": "synonym",
"synonyms": synonyms[ key ]
};
}

// dynamically create filters which can replace text *inside* a token.
// we are not able to re-use the synonym functionality in elasticsearch
// because it only matches whole tokens, not strings *within* tokens.
Expand All @@ -284,30 +276,30 @@ function generate(){

// street suffix filters (replace text inside tokens)
// based off synonym list
street_suffix.synonyms.forEach( function( synonym ){
synonyms.street_suffix_contractions.forEach( function( synonym ){
var split = synonym.split(' ');
settings.analysis.filter[ "keyword_street_suffix_" + split[0] ] = {
"type": "pattern_replace",
"pattern": " " + split[0],
"replacement": " " + split[2]
}
};
});

// compass prefix filters (replace text inside tokens)
// based off direction_synonyms list
street_suffix.direction_synonyms.forEach( function( synonym ){
// based off directionals list
synonyms.directionals.forEach( function( synonym ){
var split = synonym.split(' ');
settings.analysis.filter[ "keyword_compass_" + split[0] ] = {
"type": "pattern_replace",
"pattern": split[0],
"replacement": split[2]
}
};
});

// Merge settings from pelias/config
if( 'object' == typeof config &&
'object' == typeof config.elasticsearch &&
'object' == typeof config.elasticsearch.settings ){
if( 'object' === typeof config &&
'object' === typeof config.elasticsearch &&
'object' === typeof config.elasticsearch.settings ){
return merge({}, settings, config.elasticsearch.settings);
}

Expand Down
Loading

0 comments on commit 1b40e4d

Please sign in to comment.