Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move language analyzers from server to analysis-common module. #31300

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
Expand All @@ -52,17 +53,27 @@
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
Expand All @@ -79,19 +90,26 @@
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.th.ThaiTokenizer;
import org.apache.lucene.analysis.tr.ApostropheFilter;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.logging.Loggers;
Expand Down Expand Up @@ -130,6 +148,8 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
analyzers.put("standard_html_strip", StandardHtmlStripAnalyzerProvider::new);
analyzers.put("pattern", PatternAnalyzerProvider::new);
analyzers.put("snowball", SnowballAnalyzerProvider::new);

// Language analyzers:
analyzers.put("arabic", ArabicAnalyzerProvider::new);
analyzers.put("armenian", ArmenianAnalyzerProvider::new);
analyzers.put("basque", BasqueAnalyzerProvider::new);
Expand All @@ -147,6 +167,24 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
analyzers.put("french", FrenchAnalyzerProvider::new);
analyzers.put("galician", GalicianAnalyzerProvider::new);
analyzers.put("german", GermanAnalyzerProvider::new);
analyzers.put("greek", GreekAnalyzerProvider::new);
analyzers.put("hindi", HindiAnalyzerProvider::new);
analyzers.put("hungarian", HungarianAnalyzerProvider::new);
analyzers.put("indonesian", IndonesianAnalyzerProvider::new);
analyzers.put("irish", IrishAnalyzerProvider::new);
analyzers.put("italian", ItalianAnalyzerProvider::new);
analyzers.put("latvian", LatvianAnalyzerProvider::new);
analyzers.put("lithuanian", LithuanianAnalyzerProvider::new);
analyzers.put("norwegian", NorwegianAnalyzerProvider::new);
analyzers.put("persian", PersianAnalyzerProvider::new);
analyzers.put("portuguese", PortugueseAnalyzerProvider::new);
analyzers.put("romanian", RomanianAnalyzerProvider::new);
analyzers.put("russian", RussianAnalyzerProvider::new);
analyzers.put("sorani", SoraniAnalyzerProvider::new);
analyzers.put("spanish", SpanishAnalyzerProvider::new);
analyzers.put("swedish", SwedishAnalyzerProvider::new);
analyzers.put("turkish", TurkishAnalyzerProvider::new);
analyzers.put("thai", ThaiAnalyzerProvider::new);
return analyzers;
}

Expand Down Expand Up @@ -247,13 +285,15 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
@Override
public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactories() {
List<PreBuiltAnalyzerProviderFactory> analyzers = new ArrayList<>();
analyzers.add(new PreBuiltAnalyzerProviderFactory("standard_html_strip", CachingStrategy.LUCENE,
analyzers.add(new PreBuiltAnalyzerProviderFactory("standard_html_strip", CachingStrategy.ELASTICSEARCH,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean to change this one?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, see previous comment.

() -> new StandardHtmlStripAnalyzer(CharArraySet.EMPTY_SET)));
analyzers.add(new PreBuiltAnalyzerProviderFactory("pattern", CachingStrategy.ELASTICSEARCH,
() -> new PatternAnalyzer(Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true,
CharArraySet.EMPTY_SET)));
analyzers.add(new PreBuiltAnalyzerProviderFactory("snowball", CachingStrategy.LUCENE,
() -> new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS_SET)));

// Language analyzers:
analyzers.add(new PreBuiltAnalyzerProviderFactory("arabic", CachingStrategy.LUCENE, ArabicAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("armenian", CachingStrategy.LUCENE, ArmenianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("basque", CachingStrategy.LUCENE, BasqueAnalyzer::new));
Expand All @@ -262,7 +302,7 @@ public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactorie
analyzers.add(new PreBuiltAnalyzerProviderFactory("bulgarian", CachingStrategy.LUCENE, BulgarianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("catalan", CachingStrategy.LUCENE, CatalanAnalyzer::new));
// chinese analyzer: only for old indices, best effort
analyzers.add(new PreBuiltAnalyzerProviderFactory("chinese", CachingStrategy.LUCENE, StandardAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("chinese", CachingStrategy.ONE, StandardAnalyzer::new));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean to change this one?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I made a copy paste error. If you take a look at the history of the PreBuiltAnalyzers.java file, you will see that this was the cache strategy used. Sorry, I should have mentioned this specifically in this pr.

analyzers.add(new PreBuiltAnalyzerProviderFactory("cjk", CachingStrategy.LUCENE, CJKAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("czech", CachingStrategy.LUCENE, CzechAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new));
Expand All @@ -272,6 +312,24 @@ public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactorie
analyzers.add(new PreBuiltAnalyzerProviderFactory("french", CachingStrategy.LUCENE, FrenchAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("galician", CachingStrategy.LUCENE, GalicianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("german", CachingStrategy.LUCENE, GermanAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("greek", CachingStrategy.LUCENE, GreekAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("hindi", CachingStrategy.LUCENE, HindiAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("hungarian", CachingStrategy.LUCENE, HungarianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("indonesian", CachingStrategy.LUCENE, IndonesianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("irish", CachingStrategy.LUCENE, IrishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("italian", CachingStrategy.LUCENE, ItalianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("latvian", CachingStrategy.LUCENE, LatvianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("lithuanian", CachingStrategy.LUCENE, LithuanianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("norwegian", CachingStrategy.LUCENE, NorwegianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("persian", CachingStrategy.LUCENE, PersianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("portuguese", CachingStrategy.LUCENE, PortugueseAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("romanian", CachingStrategy.LUCENE, RomanianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("russian", CachingStrategy.LUCENE, RussianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("sorani", CachingStrategy.LUCENE, SoraniAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("spanish", CachingStrategy.LUCENE, SpanishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("swedish", CachingStrategy.LUCENE, SwedishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("turkish", CachingStrategy.LUCENE, TurkishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("thai", CachingStrategy.LUCENE, ThaiAnalyzer::new));
return analyzers;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,20 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class GreekAnalyzerProvider extends AbstractIndexAnalyzerProvider<GreekAnalyzer> {

private final GreekAnalyzer analyzer;

public GreekAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
GreekAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new GreekAnalyzer(
Analysis.parseStopWords(env, settings, GreekAnalyzer.getDefaultStopSet()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class HindiAnalyzerProvider extends AbstractIndexAnalyzerProvider<HindiAnalyzer> {

private final HindiAnalyzer analyzer;

public HindiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
HindiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new HindiAnalyzer(
Analysis.parseStopWords(env, settings, HindiAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class HungarianAnalyzerProvider extends AbstractIndexAnalyzerProvider<HungarianAnalyzer> {

private final HungarianAnalyzer analyzer;

public HungarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
HungarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new HungarianAnalyzer(
Analysis.parseStopWords(env, settings, HungarianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class IndonesianAnalyzerProvider extends AbstractIndexAnalyzerProvider<IndonesianAnalyzer> {

private final IndonesianAnalyzer analyzer;

public IndonesianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
IndonesianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new IndonesianAnalyzer(
Analysis.parseStopWords(env, settings, IndonesianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

/**
* Provider for {@link IrishAnalyzer}
Expand All @@ -32,7 +34,7 @@ public class IrishAnalyzerProvider extends AbstractIndexAnalyzerProvider<IrishAn

private final IrishAnalyzer analyzer;

public IrishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
IrishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new IrishAnalyzer(
Analysis.parseStopWords(env, settings, IrishAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class ItalianAnalyzerProvider extends AbstractIndexAnalyzerProvider<ItalianAnalyzer> {

private final ItalianAnalyzer analyzer;

public ItalianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
ItalianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new ItalianAnalyzer(
Analysis.parseStopWords(env, settings, ItalianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class LatvianAnalyzerProvider extends AbstractIndexAnalyzerProvider<LatvianAnalyzer> {

private final LatvianAnalyzer analyzer;

public LatvianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
LatvianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new LatvianAnalyzer(
Analysis.parseStopWords(env, settings, LatvianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

/**
* Provider for {@link LithuanianAnalyzer}
Expand All @@ -32,7 +34,7 @@ public class LithuanianAnalyzerProvider extends AbstractIndexAnalyzerProvider<Li

private final LithuanianAnalyzer analyzer;

public LithuanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
LithuanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new LithuanianAnalyzer(
Analysis.parseStopWords(env, settings, LithuanianAnalyzer.getDefaultStopSet()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,21 @@
* under the License.
*/

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

public class NorwegianAnalyzerProvider extends AbstractIndexAnalyzerProvider<NorwegianAnalyzer> {

private final NorwegianAnalyzer analyzer;

public NorwegianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
NorwegianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new NorwegianAnalyzer(
Analysis.parseStopWords(env, settings, NorwegianAnalyzer.getDefaultStopSet()),
Expand Down
Loading