Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve include/exclude clause list speed and scalability #11188

Closed
wants to merge 2 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.aggregations.support.ValuesSource.Bytes.WithOrdinals;

import java.io.IOException;
import java.util.HashSet;
Expand Down Expand Up @@ -80,33 +82,65 @@ private void addReject(long val) {
}

// Only used for the 'map' execution mode (ie. scripts)
public static class StringFilter {
public abstract static class StringFilter {
public abstract boolean accept(BytesRef value);
}

static class AutomatonBackedStringFilter extends StringFilter {

private final ByteRunAutomaton runAutomaton;

private StringFilter(Automaton automaton) {
private AutomatonBackedStringFilter(Automaton automaton) {
this.runAutomaton = new ByteRunAutomaton(automaton);
}

/**
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
*/
@Override
public boolean accept(BytesRef value) {
return runAutomaton.run(value.bytes, value.offset, value.length);
}
}

public static class OrdinalsFilter {
static class TermListBackedStringFilter extends StringFilter {

private final Set<BytesRef> valids;
private final Set<BytesRef> invalids;

public TermListBackedStringFilter(Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
this.valids = includeValues;
this.invalids = excludeValues;
}

/**
* Returns whether the given value is accepted based on the
* {@code include} & {@code exclude} sets.
*/
@Override
public boolean accept(BytesRef value) {
return ((valids == null) || (valids.contains(value))) && ((invalids == null) || (!invalids.contains(value)));
}
}

public static abstract class OrdinalsFilter {
public abstract LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException;

}

static class AutomatonBackedOrdinalsFilter extends OrdinalsFilter {

private final CompiledAutomaton compiled;

private OrdinalsFilter(Automaton automaton) {
private AutomatonBackedOrdinalsFilter(Automaton automaton) {
this.compiled = new CompiledAutomaton(automaton);
}

/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*
*/
@Override
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
Expand All @@ -120,6 +154,43 @@ public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, Values
}

}

static class TermListBackedOrdinalsFilter extends OrdinalsFilter {

private final SortedSet<BytesRef> includeValues;
private final SortedSet<BytesRef> excludeValues;

public TermListBackedOrdinalsFilter(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
this.includeValues = includeValues;
this.excludeValues = excludeValues;
}

@Override
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, WithOrdinals valueSource) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
if(includeValues!=null){
for (BytesRef term : includeValues) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.set(ord);
}
}
} else {
// default to all terms being acceptable
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
}
if (excludeValues != null) {
for (BytesRef term : excludeValues) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.clear(ord);
}
}
}
return acceptedGlobalOrdinals;
}

}

private final RegExp include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
Expand Down Expand Up @@ -325,11 +396,23 @@ private Automaton toAutomaton() {
}

public StringFilter convertToStringFilter() {
return new StringFilter(toAutomaton());
try {
return new AutomatonBackedStringFilter(toAutomaton());
} catch (TooComplexToDeterminizeException e) {
// Hard to pre-empt when this exception will occur due to the number
// terms and possible overlap of contents
// so we let it try and fall-back to Set based lookups if too
// complex.
return new TermListBackedStringFilter(includeValues, excludeValues);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering that maybe we should directly use TermListBackedStringFilter in that case: I wouldn't expect automatons to speed things up a lot and having TermListBackedStringFilter only in this catch block makes it hard to test.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even it we remove that logic, it would be good though to keep the explanation that we don't use an automaton for the term list case in order not to blow up the maximum number of states.

}

public OrdinalsFilter convertToOrdinalsFilter() {
return new OrdinalsFilter(toAutomaton());

if (isRegexBased()) {
return new AutomatonBackedOrdinalsFilter(toAutomaton());
}
return new TermListBackedOrdinalsFilter(includeValues, excludeValues);
}

public LongFilter convertToLongFilter() {
Expand Down