-
Notifications
You must be signed in to change notification settings - Fork 24.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bring back sparse_vector mapping #98996
Changes from 9 commits
2a55359
a262af9
44c9a9b
8910e1b
ab8f9ed
d915d64
19a0d1f
6178103
8bbe283
223bd3c
90d74e4
6bfad1b
015de93
c4126d1
959b542
6b818a3
5a66ee6
bfcd7aa
0f718ee
647136a
48350a9
523fda9
a63790a
997ed30
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
--- | ||
"Indexing and searching sparse vectors": | ||
|
||
- skip: | ||
version: " - 8.10.99" | ||
reason: "sparse_vector field type reintroduced in 8.11" | ||
|
||
- do: | ||
indices.create: | ||
index: test | ||
body: | ||
settings: | ||
number_of_replicas: 0 | ||
mappings: | ||
properties: | ||
text: | ||
type: text | ||
ml.tokens: | ||
type: sparse_vector | ||
|
||
- match: { acknowledged: true } | ||
|
||
- do: | ||
index: | ||
index: test | ||
id: "1" | ||
body: | ||
text: "running is good for you" | ||
ml: | ||
tokens: | ||
running: 2.4097164 | ||
good: 2.170997 | ||
run: 2.052153 | ||
race: 1.4575411 | ||
for: 1.1908325 | ||
runner: 1.1803857 | ||
exercise: 1.1652642 | ||
you: 0.9654308 | ||
training: 0.94999343 | ||
sports: 0.93650943 | ||
fitness: 0.83129317 | ||
best: 0.820365 | ||
bad: 0.7385934 | ||
health: 0.7098149 | ||
marathon: 0.61555296 | ||
gym: 0.5652374 | ||
|
||
- match: { result: "created" } | ||
|
||
- do: | ||
index: | ||
index: test | ||
id: "2" | ||
body: | ||
text: "walking is a healthy exercise" | ||
ml: | ||
tokens: | ||
walking: 2.4797723 | ||
exercise: 2.074234 | ||
healthy: 1.971596 | ||
walk: 1.6458614 | ||
health: 1.5291847 | ||
walker: 1.4736869 | ||
activity: 1.0793462 | ||
good: 1.0597849 | ||
fitness: 0.91855437 | ||
training: 0.86342937 | ||
movement: 0.7657065 | ||
normal: 0.6694081 | ||
foot: 0.5892523 | ||
physical: 0.4926789 | ||
|
||
- match: { result: "created" } | ||
|
||
- do: | ||
indices.refresh: { } | ||
|
||
- do: | ||
search: | ||
index: test | ||
body: | ||
query: | ||
bool: | ||
should: | ||
- term: | ||
ml.tokens: | ||
value: "walk" | ||
boost: 1.9790847 | ||
- term: | ||
ml.tokens: | ||
value: "walking" | ||
boost: 1.7092685 | ||
- term: | ||
ml.tokens: | ||
value: "exercise" | ||
boost: 0.84076905 | ||
|
||
- match: { hits.total.value: 2 } | ||
- match: { hits.hits.0._id: "2" } | ||
- match: { hits.hits.1._id: "1" } | ||
|
||
--- | ||
"Sparse vector in 7.x": | ||
- skip: | ||
features: warnings | ||
version: "8.0.0 - " | ||
reason: "sparse_vector field type supported in 7.x" | ||
- do: | ||
warnings: | ||
- The [sparse_vector] field type is deprecated and will be removed in 8.0. | ||
indices.create: | ||
index: test | ||
body: | ||
settings: | ||
number_of_replicas: 0 | ||
mappings: | ||
properties: | ||
text: | ||
type: text | ||
ml.tokens: | ||
type: sparse_vector | ||
|
||
- match: { acknowledged: true } | ||
|
||
|
||
--- | ||
"Sparse vector in 8.x": | ||
- skip: | ||
version: " - 7.99.99, 8.11.0 - " | ||
reason: "sparse_vector field type not supported in 8.x until 8.11.0" | ||
- do: | ||
catch: /The \[sparse_vector\] field type is not supported from 8.0 to 8.10 versions./ | ||
indices.create: | ||
index: test | ||
body: | ||
settings: | ||
number_of_replicas: 0 | ||
mappings: | ||
properties: | ||
text: | ||
type: text | ||
ml.tokens: | ||
type: sparse_vector |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is copied and pasted from |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,41 +8,49 @@ | |
|
||
package org.elasticsearch.index.mapper.vectors; | ||
|
||
import org.apache.lucene.document.FeatureField; | ||
import org.apache.lucene.search.Query; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.elasticsearch.common.logging.DeprecationCategory; | ||
import org.elasticsearch.common.logging.DeprecationLogger; | ||
import org.elasticsearch.common.lucene.Lucene; | ||
import org.elasticsearch.index.IndexVersion; | ||
import org.elasticsearch.index.analysis.NamedAnalyzer; | ||
import org.elasticsearch.index.fielddata.FieldDataContext; | ||
import org.elasticsearch.index.fielddata.IndexFieldData; | ||
import org.elasticsearch.index.mapper.DocumentParserContext; | ||
import org.elasticsearch.index.mapper.FieldMapper; | ||
import org.elasticsearch.index.mapper.MappedFieldType; | ||
import org.elasticsearch.index.mapper.MapperBuilderContext; | ||
import org.elasticsearch.index.mapper.SourceValueFetcher; | ||
import org.elasticsearch.index.mapper.TextSearchInfo; | ||
import org.elasticsearch.index.mapper.ValueFetcher; | ||
import org.elasticsearch.index.query.SearchExecutionContext; | ||
import org.elasticsearch.search.DocValueFormat; | ||
import org.elasticsearch.xcontent.XContentParser.Token; | ||
|
||
import java.time.ZoneId; | ||
import java.io.IOException; | ||
import java.util.Map; | ||
|
||
import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST; | ||
|
||
/** | ||
* A {@link FieldMapper} for indexing a sparse vector of floats. | ||
* | ||
* @deprecated The sparse_vector type was deprecated in 7.x and removed in 8.0. This mapper | ||
* definition only exists so that 7.x indices can be read without error. | ||
* | ||
* TODO: remove in 9.0. | ||
* A {@link FieldMapper} that exposes Lucene's {@link FeatureField} as a sparse | ||
* vector of features. | ||
*/ | ||
@Deprecated | ||
public class SparseVectorFieldMapper extends FieldMapper { | ||
private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(SparseVectorFieldMapper.class); | ||
static final String ERROR_MESSAGE = "The [sparse_vector] field type is no longer supported."; | ||
static final String ERROR_MESSAGE_7X = "The [sparse_vector] field type is no longer supported. Old 7.x indices are allowed to " | ||
+ "contain [sparse_vector] fields, but they cannot be indexed or searched."; | ||
|
||
public static final String CONTENT_TYPE = "sparse_vector"; | ||
|
||
static final String ERROR_MESSAGE_7X = "[sparse_vector] field type in old 7.x indices is allowed to " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpicky question: Should we use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I copied / pasted from the previous |
||
+ "contain [sparse_vector] fields, but they cannot be indexed or searched."; | ||
static final String ERROR_MESSAGE_8X = "The [sparse_vector] field type is not supported from 8.0 to 8.10 versions."; | ||
|
||
private static SparseVectorFieldType ft(FieldMapper in) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems that it is never used. Should we remove it? |
||
return ((SparseVectorFieldMapper) in).fieldType(); | ||
} | ||
|
||
public static class Builder extends FieldMapper.Builder { | ||
|
||
final Parameter<Map<String, String>> meta = Parameter.metaParam(); | ||
private final Parameter<Map<String, String>> meta = Parameter.metaParam(); | ||
|
||
public Builder(String name) { | ||
super(name); | ||
|
@@ -65,18 +73,19 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) { | |
} | ||
|
||
public static final TypeParser PARSER = new TypeParser((n, c) -> { | ||
if (c.indexVersionCreated().onOrAfter(IndexVersion.V_8_0_0)) { | ||
throw new IllegalArgumentException(ERROR_MESSAGE); | ||
} else { | ||
if (c.indexVersionCreated().before(IndexVersion.V_8_0_0)) { | ||
mayya-sharipova marked this conversation as resolved.
Show resolved
Hide resolved
|
||
deprecationLogger.warn(DeprecationCategory.MAPPINGS, "sparse_vector", ERROR_MESSAGE_7X); | ||
return new Builder(n); | ||
} else if (c.indexVersionCreated().before(IndexVersion.V_8_11_0)) { | ||
mayya-sharipova marked this conversation as resolved.
Show resolved
Hide resolved
|
||
throw new IllegalArgumentException(ERROR_MESSAGE_8X); | ||
} | ||
}); | ||
|
||
return new Builder(n); | ||
}, notInMultiFields(CONTENT_TYPE)); | ||
|
||
public static final class SparseVectorFieldType extends MappedFieldType { | ||
|
||
public SparseVectorFieldType(String name, Map<String, String> meta) { | ||
super(name, false, false, false, TextSearchInfo.NONE, meta); | ||
super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta); | ||
} | ||
|
||
@Override | ||
|
@@ -85,28 +94,45 @@ public String typeName() { | |
} | ||
|
||
@Override | ||
public DocValueFormat docValueFormat(String format, ZoneId timeZone) { | ||
throw new UnsupportedOperationException(ERROR_MESSAGE_7X); | ||
public Query existsQuery(SearchExecutionContext context) { | ||
throw new IllegalArgumentException("[sparse_vector] fields do not support [exists] queries"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why did we decide to move from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not really a change in the exception, as the method is different. Keep in mind that I copied / pasted |
||
} | ||
|
||
@Override | ||
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) { | ||
throw new UnsupportedOperationException(ERROR_MESSAGE_7X); | ||
public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) { | ||
throw new IllegalArgumentException("[sparse_vector] fields do not support sorting, scripting or aggregating"); | ||
} | ||
|
||
@Override | ||
public Query existsQuery(SearchExecutionContext context) { | ||
throw new UnsupportedOperationException(ERROR_MESSAGE_7X); | ||
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) { | ||
return SourceValueFetcher.identity(name(), context, format); | ||
} | ||
|
||
@Override | ||
public Query termQuery(Object value, SearchExecutionContext context) { | ||
throw new UnsupportedOperationException(ERROR_MESSAGE_7X); | ||
return FeatureField.newLinearQuery(name(), indexedValueForSearch(value), DEFAULT_BOOST); | ||
} | ||
|
||
private static String indexedValueForSearch(Object value) { | ||
if (value instanceof BytesRef) { | ||
return ((BytesRef) value).utf8ToString(); | ||
} | ||
return value.toString(); | ||
} | ||
} | ||
|
||
private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, MultiFields multiFields, CopyTo copyTo) { | ||
super(simpleName, mappedFieldType, multiFields, copyTo); | ||
super(simpleName, mappedFieldType, multiFields, copyTo, false, null); | ||
} | ||
|
||
@Override | ||
public Map<String, NamedAnalyzer> indexAnalyzers() { | ||
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER); | ||
} | ||
|
||
@Override | ||
public FieldMapper.Builder getMergeBuilder() { | ||
return new Builder(simpleName()).init(this); | ||
} | ||
|
||
@Override | ||
|
@@ -115,22 +141,73 @@ public SparseVectorFieldType fieldType() { | |
} | ||
|
||
@Override | ||
public void parse(DocumentParserContext context) { | ||
throw new UnsupportedOperationException(ERROR_MESSAGE_7X); | ||
protected boolean supportsParsingObject() { | ||
return true; | ||
} | ||
|
||
@Override | ||
public void parse(DocumentParserContext context) throws IOException { | ||
|
||
// No support for indexing / searching 7.x sparse_vector field types | ||
if (context.indexSettings().getIndexVersionCreated().before(IndexVersion.V_8_0_0)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here - checking that we're not operating on indices created before 8.0. Return the same errors that the previous implementation had. |
||
throw new UnsupportedOperationException(ERROR_MESSAGE_7X); | ||
} else if (context.indexSettings().getIndexVersionCreated().before(IndexVersion.V_8_11_0)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here instead of |
||
throw new UnsupportedOperationException(ERROR_MESSAGE_8X); | ||
} | ||
|
||
if (context.parser().currentToken() != Token.START_OBJECT) { | ||
throw new IllegalArgumentException( | ||
"[sparse_vector] fields must be json objects, expected a START_OBJECT but got: " + context.parser().currentToken() | ||
); | ||
} | ||
|
||
String feature = null; | ||
try { | ||
// make sure that we don't expand dots in field names while parsing | ||
context.path().setWithinLeafObject(true); | ||
for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) { | ||
if (token == Token.FIELD_NAME) { | ||
feature = context.parser().currentName(); | ||
if (feature.contains(".")) { | ||
throw new IllegalArgumentException( | ||
"[sparse_vector] fields do not support dots in feature names but found [" + feature + "]" | ||
); | ||
} | ||
} else if (token == Token.VALUE_NULL) { | ||
// ignore feature, this is consistent with numeric fields | ||
} else if (token == Token.VALUE_NUMBER || token == Token.VALUE_STRING) { | ||
final String key = name() + "." + feature; | ||
float value = context.parser().floatValue(true); | ||
if (context.doc().getByKey(key) != null) { | ||
throw new IllegalArgumentException( | ||
"[sparse_vector] fields do not support indexing multiple values for the same " | ||
+ "rank feature [" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of "rank feature", I think we can just say "feature" |
||
+ key | ||
+ "] in the same document" | ||
); | ||
} | ||
context.doc().addWithKey(key, new FeatureField(name(), feature, value)); | ||
} else { | ||
throw new IllegalArgumentException( | ||
"[sparse_vector] fields take hashes that map a feature to a strictly positive " | ||
+ "float, but got unexpected token " | ||
+ token | ||
); | ||
} | ||
} | ||
} finally { | ||
context.path().setWithinLeafObject(false); | ||
} | ||
} | ||
|
||
@Override | ||
protected void parseCreateField(DocumentParserContext context) { | ||
throw new IllegalStateException("parse is implemented directly"); | ||
throw new AssertionError("parse is implemented directly"); | ||
} | ||
|
||
@Override | ||
protected String contentType() { | ||
return CONTENT_TYPE; | ||
} | ||
|
||
@Override | ||
public FieldMapper.Builder getMergeBuilder() { | ||
return new Builder(simpleName()).init(this); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a test that checks indexing and performs a simulated text_expansion query.