Skip to content

Commit

Permalink
Remove nGram and edgeNGram token filter names (#38911)
Browse files Browse the repository at this point in the history
In #30209 we deprecated the camel case `nGram` filter name in favour of `ngram` and
did the same for `edgeNGram` and `edge_ngram`. Using these names has been deprecated
since 6.4 and is issuing deprecation warnings since then.
I think we can remove these filters in 8.0. In a backport of this PR I would change what was a
dreprecation warning from 6.4. to an error starting with new indices created in 7.0.
  • Loading branch information
Christoph Büscher authored Feb 15, 2019
1 parent 8ee9657 commit 7bb2da1
Show file tree
Hide file tree
Showing 10 changed files with 43 additions and 149 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[[analysis-edgengram-tokenfilter]]
=== Edge NGram Token Filter

A token filter of type `edgeNGram`.
A token filter of type `edge_ngram`.

The following are settings that can be set for a `edgeNGram` token
The following are settings that can be set for a `edge_ngram` token
filter type:

[cols="<,<",options="header",]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[[analysis-ngram-tokenfilter]]
=== NGram Token Filter

A token filter of type `nGram`.
A token filter of type `ngram`.

The following are settings that can be set for a `nGram` token filter
The following are settings that can be set for a `ngram` token filter
type:

[cols="<,<",options="header",]
Expand Down
6 changes: 5 additions & 1 deletion docs/reference/migration/migrate_8_0.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,8 @@ your application to {es} 8.0.

See also <<release-highlights>> and <<es-release-notes>>.

coming[8.0.0]
coming[8.0.0]

* <<breaking_80_mappings_changes>>

include::migrate_8_0/mappings.asciidoc[]
10 changes: 10 additions & 0 deletions docs/reference/migration/migrate_8_0/mappings.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[float]
[[breaking_80_mappings_changes]]
=== Mapping changes

[float]
==== The `nGram` and `edgeNGram` token filter names have been removed

The `nGram` and `edgeNGram` token filter names that have been deprecated since
version 6.4 have been removed. Both token filters should be used by their
alternative names `ngram` and `edge_ngram` instead.
Original file line number Diff line number Diff line change
Expand Up @@ -414,14 +414,6 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input ->
new EdgeNGramTokenFilter(input, 1)));
filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, false, (reader, version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_6_4_0)) {
deprecationLogger.deprecatedAndMaybeLog("edgeNGram_deprecation",
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [edge_ngram] instead.");
}
return new EdgeNGramTokenFilter(reader, 1);
}));
filters.add(PreConfiguredTokenFilter.singleton("elision", true,
input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
Expand All @@ -438,14 +430,6 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, false, (reader, version) -> {
if (version.onOrAfter(org.elasticsearch.Version.V_6_4_0)) {
deprecationLogger.deprecatedAndMaybeLog("nGram_deprecation",
"The [nGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [ngram] instead.");
}
return new NGramTokenFilter(reader, 1, 2, false);
}));
filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,6 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
filters.put("delimited_payload", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class);
filters.put("dutch_stem", SnowballPorterFilterFactory.class);
filters.put("edge_ngram", null);
filters.put("edgeNGram", null);
filters.put("elision", null);
filters.put("french_stem", SnowballPorterFilterFactory.class);
filters.put("german_stem", null);
Expand All @@ -197,7 +196,6 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
filters.put("length", null);
filters.put("limit", LimitTokenCountFilterFactory.class);
filters.put("ngram", null);
filters.put("nGram", null);
filters.put("persian_normalization", null);
filters.put("porter_stem", null);
filters.put("reverse", ReverseStringFilterFactory.class);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,107 +20,21 @@
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;
import java.io.StringReader;
import java.util.Map;

public class CommonAnalysisPluginTests extends ESTestCase {

/**
* Check that the deprecated name "nGram" issues a deprecation warning for indices created since 6.3.0
*/
public void testNGramDeprecationWarning() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, Version.CURRENT))
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertNotNull(tokenFilterFactory.create(tokenizer));
assertWarnings(
"The [nGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [ngram] instead.");
}
}

/**
* Check that the deprecated name "nGram" does NOT issues a deprecation warning for indices created before 6.4.0
*/
public void testNGramNoDeprecationWarningPre6_4() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, Version.V_6_3_0))
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertNotNull(tokenFilterFactory.create(tokenizer));
}
}

/**
* Check that the deprecated name "edgeNGram" issues a deprecation warning for indices created since 6.3.0
*/
public void testEdgeNGramDeprecationWarning() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, Version.CURRENT))
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertNotNull(tokenFilterFactory.create(tokenizer));
assertWarnings(
"The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
+ "Please change the filter name to [edge_ngram] instead.");
}
}

/**
* Check that the deprecated name "edgeNGram" does NOT issues a deprecation warning for indices created before 6.4.0
*/
public void testEdgeNGramNoDeprecationWarningPre6_4() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
.put(IndexMetaData.SETTING_VERSION_CREATED,
VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, Version.V_6_3_0))
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("foo bar"));
assertNotNull(tokenFilterFactory.create(tokenizer));
}
}


/**
* Check that the deprecated analyzer name "standard_html_strip" throws exception for indices created since 7.0.0
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public void testNgramHighlightingWithBrokenPositions() throws IOException {
.put("analysis.tokenizer.autocomplete.max_gram", 20)
.put("analysis.tokenizer.autocomplete.min_gram", 1)
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
.put("analysis.tokenizer.autocomplete.type", "nGram")
.put("analysis.tokenizer.autocomplete.type", "ngram")
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.putList("analysis.filter.wordDelimiter.type_table",
"& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,38 +23,6 @@
- match: { detail.tokenizer.tokens.0.token: Foo Bar! }

---
"nGram":
- do:
indices.analyze:
body:
text: good
explain: true
tokenizer:
type: nGram
min_gram: 2
max_gram: 2
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: go }
- match: { detail.tokenizer.tokens.1.token: oo }
- match: { detail.tokenizer.tokens.2.token: od }

---
"nGram_exception":
- skip:
version: " - 6.99.99"
reason: only starting from version 7.x this throws an error
- do:
catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./
indices.analyze:
body:
text: good
explain: true
tokenizer:
type: nGram
min_gram: 2
max_gram: 4
---
"simple_pattern":
- do:
indices.analyze:
Expand Down Expand Up @@ -133,7 +101,7 @@
text: "foobar"
explain: true
tokenizer:
type: nGram
type: ngram
min_gram: 3
max_gram: 3
- length: { detail.tokenizer.tokens: 4 }
Expand Down Expand Up @@ -162,15 +130,31 @@
body:
text: "foo"
explain: true
tokenizer: nGram
tokenizer: ngram
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: nGram }
- match: { detail.tokenizer.name: ngram }
- match: { detail.tokenizer.tokens.0.token: f }
- match: { detail.tokenizer.tokens.1.token: fo }
- match: { detail.tokenizer.tokens.2.token: o }
- match: { detail.tokenizer.tokens.3.token: oo }
- match: { detail.tokenizer.tokens.4.token: o }

---
"ngram_exception":
- skip:
version: " - 6.99.99"
reason: only starting from version 7.x this throws an error
- do:
catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./
indices.analyze:
body:
text: good
explain: true
tokenizer:
type: ngram
min_gram: 2
max_gram: 4

---
"edge_ngram":
- do:
Expand All @@ -194,7 +178,7 @@
text: "foo"
explain: true
tokenizer:
type: edgeNGram
type: edge_ngram
min_gram: 1
max_gram: 3
- length: { detail.tokenizer.tokens: 3 }
Expand All @@ -219,9 +203,9 @@
body:
text: "foo"
explain: true
tokenizer: edgeNGram
tokenizer: edge_ngram
- length: { detail.tokenizer.tokens: 2 }
- match: { detail.tokenizer.name: edgeNGram }
- match: { detail.tokenizer.name: edge_ngram }
- match: { detail.tokenizer.tokens.0.token: f }
- match: { detail.tokenizer.tokens.1.token: fo }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
analysis:
tokenizer:
trigram:
type: nGram
type: ngram
min_gram: 3
max_gram: 3
filter:
Expand Down

0 comments on commit 7bb2da1

Please sign in to comment.