Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MappingCharFilter #1107

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.lucene.analysis.pattern;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.IOException;

/**
 * Token filter that runs a regular expression over the text of every token
 * and substitutes the matched region(s) with a replacement string.
 *
 * <p>
 * <b>Note:</b> depending on the pattern and the incoming tokens, the
 * rewritten token text may be the empty string; such tokens are still
 * returned to the caller.
 * </p>
 *
 * @see Pattern
 */
public final class PatternReplaceFilter extends TokenFilter {
    private final Pattern p;
    private final String replacement;
    private final boolean all;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    /** Matcher created once over the term attribute and re-armed for every token. */
    private final Matcher m;

    /**
     * Creates a filter that replaces either the first match or every match
     * inside each token.
     *
     * @param in          the TokenStream to process
     * @param p           the pattern to apply to each token
     * @param replacement the replacement expression; {@code null} is treated
     *                    as the empty string. This is not a literal string:
     *                    '$' and '\' have special meaning.
     * @param all         {@code true} to replace all matches, {@code false}
     *                    to replace only the first one
     * @see Matcher#quoteReplacement
     */
    public PatternReplaceFilter(TokenStream in, Pattern p, String replacement, boolean all) {
        super(in);
        this.p = p;
        if (replacement == null) {
            this.replacement = "";
        } else {
            this.replacement = replacement;
        }
        this.all = all;
        this.m = p.matcher(termAtt);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (input.incrementToken()) {
            // Re-arm the matcher so it scans the text of the token just produced.
            m.reset();
            if (m.find()) {
                // replaceAll/replaceFirst reset the matcher themselves, so the
                // find() above only serves as a cheap "is there any match" probe.
                final String rewritten;
                if (all) {
                    rewritten = m.replaceAll(replacement);
                } else {
                    rewritten = m.replaceFirst(replacement);
                }
                termAtt.setEmpty().append(rewritten);
            }
            return true;
        }
        return false;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -366,13 +366,18 @@ private static class DefaultProcessor extends AnalysisBinderProcessor {
}

private static class ExtendedProcessor extends AnalysisBinderProcessor {
/**
 * Registers {@link MappingCharFilterFactory} with the given bindings under
 * the char filter name "mapping".
 */
@Override
public void processCharFilters(CharFiltersBindings charFiltersBindings) {
    charFiltersBindings.processCharFilter("mapping", MappingCharFilterFactory.class);
}

@Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("stemmer", StemmerTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("word_delimiter", WordDelimiterTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);

tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("phonetic", PhoneticTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("dictionary_decompounder", DictionaryCompoundWordTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("hypennation_decompounder", HyphenationCompoundWordTokenFilterFactory.class);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;

import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Char filter factory that builds a {@link MappingCharFilter} from a list of
 * {@code source => target} rules read through the "mappings" setting.
 */
@AnalysisSettingsRequired
public class MappingCharFilterFactory extends AbstractCharFilterFactory {

    // source => target
    private static final Pattern rulePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

    private final NormalizeCharMap normMap;

    /**
     * Loads the mapping rules and compiles them into a normalize char map.
     *
     * @throws ElasticSearchIllegalArgumentException if the word list lookup
     *         for "mappings" yields no rules (reported as neither `mappings`
     *         nor `mappings_path` being configured)
     * @throws RuntimeException if a rule is not of the form
     *         {@code source => target} or contains a malformed escape
     */
    @Inject public MappingCharFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);

        List<String> rules = Analysis.getWordList(env, settings, "mappings");
        if (rules == null) {
            throw new ElasticSearchIllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
        }

        normMap = new NormalizeCharMap();
        parseRules(rules, normMap);
    }

    /**
     * Wraps the given stream in a {@link MappingCharFilter} backed by the
     * rules parsed at construction time.
     */
    @Override public CharStream create(CharStream tokenStream) {
        return new MappingCharFilter(normMap, tokenStream);
    }

    /**
     * Parses a list of MappingCharFilter style rules into a normalize char map.
     *
     * @param rules rules of the form {@code source => target}; both sides are
     *              trimmed and then unescaped
     * @param map   the map that receives one entry per rule
     * @throws RuntimeException if a rule does not match the expected form
     */
    private static void parseRules(List<String> rules, NormalizeCharMap map) {
        for (String rule : rules) {
            Matcher m = rulePattern.matcher(rule);
            if (!m.find()) {
                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
            }
            String lhs = parseString(m.group(1).trim());
            String rhs = parseString(m.group(2).trim());
            map.add(lhs, rhs);
        }
    }

    /**
     * Resolves backslash escapes in one side of a rule.
     *
     * <p>Recognised escapes are {@code \n}, {@code \t}, {@code \r},
     * {@code \b}, {@code \f} and {@code \}{@code uXXXX} (four hex digits).
     * Any other escaped character, including a backslash, stands for itself.
     *
     * <p>The result is accumulated in a local, growable buffer so that input
     * of any length is accepted; a fixed-size shared array would overflow on
     * long rules.
     *
     * @param s the raw text of one rule side
     * @return the text with all escapes resolved
     * @throws RuntimeException if the text ends in the middle of an escape or
     *         a unicode escape is not valid hexadecimal
     */
    private static String parseString(String s) {
        int len = s.length();
        StringBuilder out = new StringBuilder(len);
        int readPos = 0;
        while (readPos < len) {
            char c = s.charAt(readPos++);
            if (c == '\\') {
                if (readPos >= len) {
                    throw new RuntimeException("Invalid escaped char in [" + s + "]");
                }
                c = s.charAt(readPos++);
                switch (c) {
                    case 'n':
                        c = '\n';
                        break;
                    case 't':
                        c = '\t';
                        break;
                    case 'r':
                        c = '\r';
                        break;
                    case 'b':
                        c = '\b';
                        break;
                    case 'f':
                        c = '\f';
                        break;
                    case 'u':
                        // Four hex digits must follow the 'u'.
                        if (readPos + 3 >= len) {
                            throw new RuntimeException("Invalid escaped char in [" + s + "]");
                        }
                        try {
                            c = (char) Integer.parseInt(s.substring(readPos, readPos + 4), 16);
                        } catch (NumberFormatException e) {
                            // Report the offending rule text instead of a bare number-format error.
                            throw new RuntimeException("Invalid escaped char in [" + s + "]", e);
                        }
                        readPos += 4;
                        break;
                    default:
                        // Escaped backslash or any other character: keep it literally.
                        break;
                }
            }
            out.append(c);
        }
        return out.toString();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;

/**
 * Token filter factory for the 'pattern_replace' type: creates
 * {@link PatternReplaceFilter} instances from the "pattern", "flags",
 * "replacement" and "all" settings.
 */
public class PatternReplaceTokenFilterFactory extends AbstractTokenFilterFactory {

    private final Pattern pattern;
    private final String replacement;
    private final boolean all;

    /**
     * Reads and validates the filter settings.
     *
     * <ul>
     * <li>"pattern" is mandatory;</li>
     * <li>"flags" is passed to the regex compiler as-is;</li>
     * <li>"replacement" defaults to the empty string;</li>
     * <li>"all" defaults to {@code true}.</li>
     * </ul>
     *
     * @throws ElasticSearchIllegalArgumentException if "pattern" is not set
     */
    @Inject public PatternReplaceTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);

        // Read with a null default: a "" default would make the check below
        // unreachable and silently compile an empty regex, which matches at
        // every position of every token.
        String sPattern = settings.get("pattern", null);
        if (sPattern == null) {
            throw new ElasticSearchIllegalArgumentException("pattern is missing for [" + name + "] token filter of type 'pattern_replace'");
        }
        this.pattern = Regex.compile(sPattern, settings.get("flags"));

        // An unset replacement means "replace with nothing".
        this.replacement = settings.get("replacement", "");

        this.all = settings.getAsBoolean("all", true);
    }

    /**
     * Wraps the given stream in a {@link PatternReplaceFilter} configured
     * with this factory's pattern, replacement and "all" flag.
     */
    @Override public TokenStream create(TokenStream tokenStream) {
        return new PatternReplaceFilter(tokenStream, pattern, replacement, all);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ private void testSimpleConfiguration(Settings settings) {
// html = (HtmlStripCharFilterFactory) custom2.charFilters()[1];
// assertThat(html.readAheadLimit(), equalTo(1024));

// verify characters mapping
analyzer = analysisService.analyzer("custom5").analyzer();
assertThat(analyzer, instanceOf(CustomAnalyzer.class));
CustomAnalyzer custom5 = (CustomAnalyzer) analyzer;
assertThat(custom5.tokenFilters()[0], instanceOf(MappingCharFilterFactory.class));

// verify aliases
analyzer = analysisService.analyzer("alias1").analyzer();
assertThat(analyzer, instanceOf(StandardAnalyzer.class));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
"type" : "html_strip",
"escaped_tags" : ["xxx", "yyy"],
"read_ahead" : 1024
},
"my_mapping" : {
"type" : "mapping",
"mappings" : ["ph=>f", "qu=>q"]
}
},
"filter" : {
Expand Down Expand Up @@ -57,6 +61,10 @@
"tokenizer" : "standard",
"filter" : ["my"]
},
"custom5" : {
"tokenizer" : "standard",
"char_filter" : ["my_mapping"]
},
"czechAnalyzerWithStemmer" : {
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "stop", "czech_stem"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ index :
type : html_strip
escaped_tags : [xxx, yyy]
read_ahead : 1024
my_mapping :
type : mapping
mappings : [ph=>f, qu=>q]
filter :
stop :
type : stop
Expand Down Expand Up @@ -41,6 +44,9 @@ index :
custom4 :
tokenizer : standard
filter : [my]
custom5 :
tokenizer : standard
char_filter : [my_mapping]
czechAnalyzerWithStemmer :
tokenizer : standard
filter : [standard, lowercase, stop, czech_stem]
Expand Down