Skip to content

Commit

Permalink
Better plural stemmer than minimal_english (opensearch-project#4738)
Browse files Browse the repository at this point in the history
Drops the trailing "e" in taxes, dresses, watches, dishes, etc., which otherwise
causes mismatches between plural and singular forms.

Signed-off-by: Nicholas Walter Knize <nknize@apache.org>

Co-authored-by: Mark Harwood <markharwood@gmail.com>
Co-authored-by: Nicholas Walter Knize <nknize@apache.org>
(cherry picked from commit c92846d)
  • Loading branch information
nknize committed Oct 19, 2022
1 parent 45fb2e0 commit 8fe8518
Show file tree
Hide file tree
Showing 4 changed files with 262 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- [Segment Replication] Update replicas to commit SegmentInfos instead of relying on segments_N from primary shards ([#4450](https://github.com/opensearch-project/OpenSearch/pull/4450))
- [Segment Replication] Adding check to make sure checkpoint is not processed when a shard's shard routing is primary ([#4716](https://github.com/opensearch-project/OpenSearch/pull/4716))
- Disable merge on refresh in DiskThresholdDeciderIT ([#4828](https://github.com/opensearch-project/OpenSearch/pull/4828))
- Better plural stemmer than minimal_english ([#4738](https://github.com/opensearch-project/OpenSearch/pull/4738))

### Security

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

import java.io.IOException;

/**
 * Token filter that reduces English plural terms to their singular form by
 * delegating to {@link EnglishPluralStemmer}. Tokens whose
 * {@link KeywordAttribute} is set are passed through unchanged.
 */
public final class EnglishPluralStemFilter extends TokenFilter {
    private final EnglishPluralStemmer stemmer = new EnglishPluralStemmer();
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

    /**
     * Wraps the given stream.
     *
     * @param input the upstream token stream whose terms are stemmed
     */
    public EnglishPluralStemFilter(TokenStream input) {
        super(input);
    }

    /**
     * Advances the wrapped stream and, unless the token is marked as a keyword,
     * stems the term in place and shortens it to the stemmed length.
     *
     * @return {@code true} if a token was produced, {@code false} once the input is exhausted
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        if (keywordAttr.isKeyword()) {
            // Keyword tokens must not be modified.
            return true;
        }
        termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
        return true;
    }

    /**
     * Plural stemmer for English based on the {@link EnglishMinimalStemFilter}
     * <p>
     * Beyond dropping a trailing "s", this stemmer also removes the dangling "e"
     * for the following suffixes:
     * <ul>
     * <li>xes - "boxes" becomes "box" (four-letter words such as "axes" keep the e)</li>
     * <li>sses - "dresses" becomes "dress"</li>
     * <li>shes - "dishes" becomes "dish"</li>
     * <li>ches - "watches" becomes "watch", except for the entries of
     * {@link #chesExceptions} which keep the e ("caches" becomes "cache")</li>
     * <li>oes - "tomatoes" becomes "tomato", except for the entries of
     * {@link #oesExceptions} which keep the e ("shoes" becomes "shoe")</li>
     * </ul>
     * See https://github.com/elastic/elasticsearch/issues/42892
     * <p>
     * The remaining "es" handling is:
     * <ul>
     * <li>ies-&gt;y only on words longer than four characters, so that "spies"
     * becomes "spy" while "ties" becomes "tie"</li>
     * <li>every other "es" ending only loses the "s", so "bees" becomes "bee"</li>
     * </ul>
     * Terms ending in "us" or "ss", and terms shorter than three characters, are
     * returned unchanged.
     */
    public static class EnglishPluralStemmer {

        // Words ending in oes that retain the e when stemmed
        public static final char[][] oesExceptions = { "shoes".toCharArray(), "canoes".toCharArray(), "oboes".toCharArray() };
        // Words ending in ches that retain the e when stemmed
        public static final char[][] chesExceptions = {
            "cliches".toCharArray(),
            "avalanches".toCharArray(),
            "mustaches".toCharArray(),
            "moustaches".toCharArray(),
            "quiches".toCharArray(),
            "headaches".toCharArray(),
            "heartaches".toCharArray(),
            "porsches".toCharArray(),
            "tranches".toCharArray(),
            "caches".toCharArray() };

        /**
         * Stems the term held in {@code s[0..len)}.
         * <p>
         * The buffer may be modified in place: the ies-&gt;y rule overwrites the
         * "i" with a "y".
         *
         * @param s   buffer holding the term
         * @param len number of valid characters in {@code s}
         * @return the length of the stemmed term
         */
        public int stem(char[] s, int len) {
            // Too short to carry a plural suffix, or not ending in "s" at all.
            if (len < 3 || s[len - 1] != 's') {
                return len;
            }

            final char penultimate = s[len - 2];
            if (penultimate == 'u' || penultimate == 's') {
                // "...us" and "...ss" are left alone.
                return len;
            }
            if (penultimate == 'e') {
                return stemEsSuffix(s, len);
            }
            // Plain plural: drop the trailing "s".
            return len - 1;
        }

        /**
         * Applies the rules for a term ending in "es". The caller guarantees
         * {@code len >= 3}, so {@code s[len - 3]} is always readable.
         *
         * @return the stemmed length; {@code len - 1} when no special rule applies
         */
        private static int stemEsSuffix(char[] s, int len) {
            final char beforeEs = s[len - 3];

            // ies -> y, only on strings longer than 4 so that "spies" -> "spy"
            // while "pies" -> "pie".
            if (len > 4 && beforeEs == 'i') {
                s[len - 3] = 'y';
                return len - 2;
            }
            // xes, requiring more than one leading character:
            // "boxes" -> "box" but "axes" -> "axe".
            if (len > 4 && beforeEs == 'x') {
                return len - 2;
            }
            // oes: strip "es" unless the term is a listed exception.
            if (len > 3 && beforeEs == 'o') {
                return isException(s, len, oesExceptions) ? len - 1 : len - 2;
            }
            if (len > 4) {
                final char twoBeforeEs = s[len - 4];
                // shes / sses
                if (twoBeforeEs == 's' && (beforeEs == 'h' || beforeEs == 's')) {
                    return len - 2;
                }
                // ches: strip "es" unless the term is a listed exception.
                if (twoBeforeEs == 'c' && beforeEs == 'h') {
                    return isException(s, len, chesExceptions) ? len - 1 : len - 2;
                }
            }
            // No dangling-e rule matched: only remove the "s".
            return len - 1;
        }

        /**
         * Reports whether the term in {@code s[0..len)} lines up with any entry of
         * {@code exceptionsList} when both are compared from their last character
         * backwards.
         * <p>
         * Only the overlapping tail is compared. A term longer than an entry
         * matches when it ends with that entry ("horseshoes" vs "shoes"), and a
         * term shorter than an entry matches when the entry ends with the term
         * ("hoes" vs "shoes").
         */
        private static boolean isException(char[] s, int len, char[][] exceptionsList) {
            for (char[] exception : exceptionsList) {
                final int overlap = Math.min(exception.length, len);
                int fromEnd = 1;
                while (fromEnd <= overlap && exception[exception.length - fromEnd] == s[len - fromEnd]) {
                    fromEnd++;
                }
                if (fromEnd > overlap) {
                    // Every overlapping character agreed.
                    return true;
                }
            }
            return false;
        }
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ public TokenStream create(TokenStream tokenStream) {
return new SnowballFilter(tokenStream, new EnglishStemmer());
} else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) {
return new EnglishMinimalStemFilter(tokenStream);
} else if ("plural_english".equalsIgnoreCase(language) || "pluralEnglish".equalsIgnoreCase(language)) {
return new EnglishPluralStemFilter(tokenStream);
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
return new EnglishPossessiveFilter(tokenStream);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,83 @@ public void testPorter2FilterFactory() throws IOException {
}
}

/**
 * Builds a "stemmer" token filter configured with the "plural_english" language
 * for a random index version, checks that it creates an
 * {@link EnglishPluralStemFilter}, and verifies the stems produced for a set of
 * plural and singular words.
 */
public void testEnglishPluralFilter() throws IOException {
    // { input, expected stem } pairs, checked in this order on every iteration.
    final String[][] expectations = {
        // Check old EnglishMinimalStemmer ("S" stemmer) logic
        { "phones", "phone" },
        { "horses", "horse" },
        { "cameras", "camera" },

        // The original s stemmer gives up on stemming oes words because English has no fixed rule for the stem
        // (see https://howtospell.co.uk/making-O-words-plural )
        // This stemmer removes the es but retains e for a small number of exceptions
        { "mosquitoes", "mosquito" },
        { "heroes", "hero" },
        // oes exceptions that retain the e.
        { "shoes", "shoe" },
        { "horseshoes", "horseshoe" },
        { "canoes", "canoe" },
        { "oboes", "oboe" },

        // Check improved EnglishPluralStemFilter logic
        // sses
        { "dresses", "dress" },
        { "possess", "possess" },
        { "possesses", "possess" },
        // xes
        { "boxes", "box" },
        { "axes", "axe" },
        // shes
        { "dishes", "dish" },
        { "washes", "wash" },
        // ees
        { "employees", "employee" },
        { "bees", "bee" },
        // tch
        { "watches", "watch" },
        { "itches", "itch" },
        // ies->y but only for length >4
        { "spies", "spy" },
        { "ties", "tie" },
        { "lies", "lie" },
        { "pies", "pie" },
        { "dies", "die" },

        // ches, including the exceptions that retain the e
        { "lunches", "lunch" },
        { "avalanches", "avalanche" },
        { "headaches", "headache" },
        { "caches", "cache" },
        { "beaches", "beach" },
        { "britches", "britch" },
        { "cockroaches", "cockroach" },
        { "cliches", "cliche" },
        { "quiches", "quiche" } };

    final int iterations = scaledRandomIntBetween(20, 100);
    for (int iteration = 0; iteration < iterations; iteration++) {
        final Version createdVersion = VersionUtils.randomVersion(random());
        final Settings indexSettings = Settings.builder()
            .put("index.analysis.filter.my_plurals.type", "stemmer")
            .put("index.analysis.filter.my_plurals.language", "plural_english")
            .put("index.analysis.analyzer.my_plurals.tokenizer", "whitespace")
            .put("index.analysis.analyzer.my_plurals.filter", "my_plurals")
            .put(SETTING_VERSION_CREATED, createdVersion)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();

        final OpenSearchTestCase.TestAnalysis testAnalysis = AnalysisTestsHelper.createTestAnalysisFromSettings(indexSettings, PLUGIN);

        // The configured filter must come from the stemmer factory ...
        final TokenFilterFactory pluralsFactory = testAnalysis.tokenFilter.get("my_plurals");
        assertThat(pluralsFactory, instanceOf(StemmerTokenFilterFactory.class));

        // ... and that factory must build the plural stem filter.
        final Tokenizer whitespace = new WhitespaceTokenizer();
        whitespace.setReader(new StringReader("dresses"));
        final TokenStream filtered = pluralsFactory.create(whitespace);
        final NamedAnalyzer analyzer = testAnalysis.indexAnalyzers.get("my_plurals");
        assertThat(filtered, instanceOf(EnglishPluralStemFilter.class));

        for (String[] expectation : expectations) {
            assertAnalyzesTo(analyzer, expectation[0], new String[] { expectation[1] });
        }
    }
}

public void testMultipleLanguagesThrowsException() throws IOException {
Version v = VersionUtils.randomVersion(random());
Settings settings = Settings.builder()
Expand Down

0 comments on commit 8fe8518

Please sign in to comment.