Replace TokenizerFactory with Supplier #32063

Merged
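For context: this change removes the analysis-specific TokenizerFactory interface in favor of java.util.function.Supplier<Tokenizer>. Concretely, every tokenizer factory now overrides get() instead of create(), and the AbstractTokenizerFactory super-constructor no longer takes the tokenizer name. The before/after sketch below is reconstructed from the KeywordTokenizerFactory hunk further down (license header and imports omitted; the super-constructor signatures are inferred from the call sites in this diff):

    // Before this PR: the factory exposes create() via the bespoke
    // TokenizerFactory interface.
    public class KeywordTokenizerFactory extends AbstractTokenizerFactory {
        private final int bufferSize;

        KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
            super(indexSettings, name, settings);  // name was forwarded to the base class
            bufferSize = settings.getAsInt("buffer_size", 256);
        }

        @Override
        public Tokenizer create() {                // TokenizerFactory method
            return new KeywordTokenizer(bufferSize);
        }
    }

    // After this PR: the same factory satisfies java.util.function.Supplier<Tokenizer>.
    public class KeywordTokenizerFactory extends AbstractTokenizerFactory {
        private final int bufferSize;

        KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
            super(indexSettings, settings);        // name is no longer forwarded
            bufferSize = settings.getAsInt("buffer_size", 256);
        }

        @Override
        public Tokenizer get() {                   // Supplier<Tokenizer> method
            return new KeywordTokenizer(bufferSize);
        }
    }
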
CharGroupTokenizerFactory.java
@@ -39,7 +39,7 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
     private boolean tokenizeOnSymbol = false;

     public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);

         for (final String c : settings.getAsList("tokenize_on_chars")) {
             if (c == null || c.length() == 0) {
@@ -109,7 +109,7 @@ private char parseEscapedChar(final String s) {
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new CharTokenizer() {
             @Override
             protected boolean isTokenChar(int c) {

ClassicTokenizerFactory.java
@@ -35,12 +35,12 @@ public class ClassicTokenizerFactory extends AbstractTokenizerFactory {
     private final int maxTokenLength;

     ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         ClassicTokenizer tokenizer = new ClassicTokenizer();
         tokenizer.setMaxTokenLength(maxTokenLength);
         return tokenizer;

CommonAnalysisPlugin.java
@@ -24,6 +24,7 @@
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicAnalyzer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
@@ -122,7 +123,6 @@
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
@@ -134,6 +134,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
+import java.util.function.Supplier;

 import static org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings;

@@ -262,8 +263,8 @@ public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
     }

     @Override
-    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
-        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
+    public Map<String, AnalysisProvider<Supplier<Tokenizer>>> getTokenizers() {
+        Map<String, AnalysisProvider<Supplier<Tokenizer>>> tokenizers = new TreeMap<>();
         tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
         tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
         tokenizers.put("thai", ThaiTokenizerFactory::new);

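The getTokenizers() signature change above is the plugin-facing part of the refactor: tokenizer names now map to AnalysisProvider<Supplier<Tokenizer>> rather than AnalysisProvider<TokenizerFactory>. As a sketch, a third-party analysis plugin would now register a tokenizer roughly like this (MyTokenizerFactory is a hypothetical class that, like the factories in this PR, extends AbstractTokenizerFactory and overrides get(); the AnalysisProvider functional shape is inferred from the constructor references above):

    import java.util.Collections;
    import java.util.Map;
    import java.util.function.Supplier;

    import org.apache.lucene.analysis.Tokenizer;
    import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
        @Override
        public Map<String, AnalysisProvider<Supplier<Tokenizer>>> getTokenizers() {
            // The four-argument constructor reference matches the
            // (IndexSettings, Environment, String, Settings) shape used above.
            return Collections.singletonMap("my_tokenizer", MyTokenizerFactory::new);
        }
    }
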
EdgeNGramTokenizerFactory.java
@@ -36,14 +36,14 @@ public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
     private final CharMatcher matcher;

     EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
         this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
         this.matcher = parseTokenChars(settings.getAsList("token_chars"));
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         if (matcher == null) {
             return new EdgeNGramTokenizer(minGram, maxGram);
         } else {

KeywordTokenizerFactory.java
@@ -31,12 +31,12 @@ public class KeywordTokenizerFactory extends AbstractTokenizerFactory {
     private final int bufferSize;

     KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
         bufferSize = settings.getAsInt("buffer_size", 256);
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new KeywordTokenizer(bufferSize);
     }
 }

LetterTokenizerFactory.java
@@ -29,11 +29,11 @@
 public class LetterTokenizerFactory extends AbstractTokenizerFactory {

     LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new LetterTokenizer();
     }
 }

LowerCaseTokenizerFactory.java
@@ -30,11 +30,11 @@
 public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent {

     LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new LowerCaseTokenizer();
     }

NGramTokenizerFactory.java
@@ -85,7 +85,7 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
     }

     NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
         int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
         this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
@@ -105,7 +105,7 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         if (matcher == null) {
             return new NGramTokenizer(minGram, maxGram);
         } else {

PathHierarchyTokenizerFactory.java
@@ -37,7 +37,7 @@ public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
     private final boolean reverse;

     PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
         bufferSize = settings.getAsInt("buffer_size", 1024);
         String delimiter = settings.get("delimiter");
         if (delimiter == null) {
@@ -61,7 +61,7 @@ public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         if (reverse) {
             return new ReversePathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
         }

PatternTokenizerFactory.java
@@ -35,7 +35,7 @@ public class PatternTokenizerFactory extends AbstractTokenizerFactory {
     private final int group;

     PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);

         String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
         if (sPattern == null) {
@@ -47,7 +47,7 @@ public class PatternTokenizerFactory extends AbstractTokenizerFactory {
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new PatternTokenizer(pattern, group);
     }
 }

SimplePatternSplitTokenizerFactory.java
@@ -31,13 +31,13 @@ public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory
     private final String pattern;

     public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);

         pattern = settings.get("pattern", "");
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new SimplePatternSplitTokenizer(pattern);
     }
 }

SimplePatternTokenizerFactory.java
@@ -31,13 +31,13 @@ public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {
     private final String pattern;

     public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);

         pattern = settings.get("pattern", "");
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new SimplePatternTokenizer(pattern);
     }
 }

ThaiTokenizerFactory.java
@@ -32,11 +32,11 @@
 public class ThaiTokenizerFactory extends AbstractTokenizerFactory {

     ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new ThaiTokenizer();
     }
 }

UAX29URLEmailTokenizerFactory.java
@@ -32,12 +32,12 @@ public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
     private final int maxTokenLength;

     UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer();
         tokenizer.setMaxTokenLength(maxTokenLength);
         return tokenizer;

WhitespaceTokenizerFactory.java
@@ -34,12 +34,12 @@ public class WhitespaceTokenizerFactory extends AbstractTokenizerFactory {
     private Integer maxTokenLength;

     WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
+        super(indexSettings, settings);
         maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }

     @Override
-    public Tokenizer create() {
+    public Tokenizer get() {
         return new WhitespaceTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, maxTokenLength);
     }
 }

CharGroupTokenizerFactoryTests.java
@@ -43,7 +43,7 @@ public void testParseTokenChars() {
             new String[] { "commas" },
             new String[] { "a", "b", "c", "\\$" })) {
             final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
-            expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
+            expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).get());
         }

         for (String[] conf : Arrays.asList(
@@ -56,7 +56,7 @@ public void testParseTokenChars() {
             new String[] { "\\r" },
             new String[] { "f", "o", "o", "symbol" })) {
             final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
-            new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
+            new CharGroupTokenizerFactory(indexProperties, null, name, settings).get();
             // no exception
         }
     }
@@ -67,7 +67,7 @@ public void testTokenization() throws IOException {
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
         final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
         Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
-            null, name, settings).create();
+            null, name, settings).get();
         tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
         assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
     }

NGramTokenizerFactoryTests.java
@@ -51,7 +51,7 @@ public void testParseTokenChars() {
             final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
                 .put("token_chars", tokenChars).build();
             try {
-                new NGramTokenizerFactory(indexProperties, null, name, settings).create();
+                new NGramTokenizerFactory(indexProperties, null, name, settings).get();
                 fail();
             } catch (IllegalArgumentException expected) {
                 // OK
@@ -62,7 +62,7 @@ public void testParseTokenChars() {
                 .put("token_chars", tokenChars).build();
             indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);

-            new NGramTokenizerFactory(indexProperties, null, name, settings).create();
+            new NGramTokenizerFactory(indexProperties, null, name, settings).get();
             // no exception
         }
     }
@@ -75,7 +75,7 @@ public void testNoTokenChars() throws IOException {
         final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4)
             .putList("token_chars", new String[0]).build();
         Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
-            .create();
+            .get();
         tokenizer.setReader(new StringReader("1.34"));
         assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"});
     }
@@ -88,13 +88,13 @@ public void testPreTokenization() throws IOException {

         Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
             .put("token_chars", "letter,digit").build();
         Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
-            .create();
+            .get();
         tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
         assertTokenStreamContents(tokenizer,
             new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
         settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
             .put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
-        tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
+        tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).get();
         tokenizer.setReader(new StringReader(" a!$ 9"));
         assertTokenStreamContents(tokenizer,
             new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
@@ -107,14 +107,14 @@ public void testPreTokenizationEdge() throws IOException {
         final Settings indexSettings = newAnalysisSettingsBuilder().build();
         Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
         Tokenizer tokenizer =
-            new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
+            new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).get();
         tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
         assertTokenStreamContents(tokenizer,
             new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
         settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3)
             .put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
         tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
-            .create();
+            .get();
         tokenizer.setReader(new StringReader(" a!$ 9"));
         assertTokenStreamContents(tokenizer,
             new String[] {" a", " a!"});
@@ -163,7 +163,7 @@ public void testMaxNGramDiffException() throws Exception{

         final Settings settings = newAnalysisSettingsBuilder().put("min_gram", min_gram).put("max_gram", max_gram).build();
         IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () ->
-            new NGramTokenizerFactory(indexProperties, null, name, settings).create());
+            new NGramTokenizerFactory(indexProperties, null, name, settings).get());
         assertEquals(
             "The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
                 + maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["