Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce NativeEngineKNNVectorsFormat as a KNNVectorsFormat for Native engines #1855

Merged
merged 1 commit into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import lombok.Getter;
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.RamUsageEstimator;

import java.util.HashMap;
import java.util.Map;

/**
* NativeEngineVectorFieldsWriter is a class that will be used to accumulate all the vectors during ingestion before
* lucene does a flush. This class ensures that KNNVectorWriter is free from generics and this class can encapsulate
* all the details related to vectors types and docIds.
*
* @param <T> float[] or byte[]
*/
class NativeEngineFieldVectorsWriter<T> extends KnnFieldVectorsWriter<T> {
navneet1v marked this conversation as resolved.
Show resolved Hide resolved
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(NativeEngineFieldVectorsWriter.class);
private final FieldInfo fieldInfo;
/**
* We are using a map here instead of list, because for sampler interface for quantization we have to advance the iterator
navneet1v marked this conversation as resolved.
Show resolved Hide resolved
* to a specific docId, there a list cannot be useful because a docId != index of the vector in the list. Similar
* thing is true when we have vector field in child document. There doc Ids will not be consistent. Hence, we need to
* use the map here.
*/
@Getter
private final Map<Integer, T> vectors;
private int lastDocID = -1;
@Getter
private final DocsWithFieldSet docsWithField;
private final InfoStream infoStream;

static NativeEngineFieldVectorsWriter<?> create(final FieldInfo fieldInfo, final InfoStream infoStream) {
switch (fieldInfo.getVectorEncoding()) {
case FLOAT32:
return new NativeEngineFieldVectorsWriter<float[]>(fieldInfo, infoStream);
case BYTE:
return new NativeEngineFieldVectorsWriter<byte[]>(fieldInfo, infoStream);
}
throw new IllegalStateException("Unsupported Vector encoding : " + fieldInfo.getVectorEncoding());
}

NativeEngineFieldVectorsWriter(final FieldInfo fieldInfo, final InfoStream infoStream) {
navneet1v marked this conversation as resolved.
Show resolved Hide resolved
this.fieldInfo = fieldInfo;
this.infoStream = infoStream;
vectors = new HashMap<>();
this.docsWithField = new DocsWithFieldSet();
}

/**
* Add new docID with its vector value to the given field for indexing. Doc IDs must be added in
* increasing order.
*
* @param docID int
* @param vectorValue T
*/
@Override
public void addValue(int docID, T vectorValue) {
if (docID == lastDocID) {
throw new IllegalArgumentException(
"[NativeEngineKNNVectorWriter]VectorValuesField \""
+ fieldInfo.name
+ "\" appears more than once in this document (only one value is allowed per field)"
);
}
assert docID > lastDocID;
vectors.put(docID, vectorValue);
docsWithField.add(docID);
lastDocID = docID;
}

/**
* Used to copy values being indexed to internal storage.
*
* @param vectorValue an array containing the vector value to add
* @return a copy of the value; a new array
*/
@Override
public T copyValue(T vectorValue) {
throw new UnsupportedOperationException("NativeEngineVectorFieldsWriter doesn't support copyValue operation");
}

/**
* Return the memory usage of this object in bytes. Negative values are illegal.
*/
@Override
public long ramBytesUsed() {
return SHALLOW_SIZE + docsWithField.ramBytesUsed() + (long) this.vectors.size() * (long) (RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ RamUsageEstimator.NUM_BYTES_ARRAY_HEADER) + (long) this.vectors.size() * RamUsageEstimator.shallowSizeOfInstance(
Integer.class
) + (long) vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

import java.io.IOException;

/**
* This is a Vector format that will be used for Native engines like Faiss and Nmslib for reading and writing vector
* related data structures.
*/
public class NativeEngines990KnnVectorsFormat extends KnnVectorsFormat {
/** The format for storing, reading, merging vectors on disk */
private static FlatVectorsFormat flatVectorsFormat;
private static final String FORMAT_NAME = "NativeEngines99KnnVectorsFormat";

public NativeEngines990KnnVectorsFormat() {
super(FORMAT_NAME);
flatVectorsFormat = new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer());
}

public NativeEngines990KnnVectorsFormat(final FlatVectorsFormat lucene99FlatVectorsFormat) {
super(FORMAT_NAME);
flatVectorsFormat = lucene99FlatVectorsFormat;
}

/**
* Returns a {@link KnnVectorsWriter} to write the vectors to the index.
*
* @param state {@link SegmentWriteState}
*/
@Override
public KnnVectorsWriter fieldsWriter(final SegmentWriteState state) throws IOException {
return new NativeEngines990KnnVectorsWriter(state, flatVectorsFormat.fieldsWriter(state));
}

/**
* Returns a {@link KnnVectorsReader} to read the vectors from the index.
*
* @param state {@link SegmentReadState}
*/
@Override
public KnnVectorsReader fieldsReader(final SegmentReadState state) throws IOException {
return new NativeEngines990KnnVectorsReader(state, flatVectorsFormat.fieldsReader(state));
}

@Override
public String toString() {
return "NativeEngines99KnnVectorsFormat(name=" + this.getClass().getSimpleName() + ", flatVectorsFormat=" + flatVectorsFormat + ")";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;

import java.io.IOException;

/**
* Vectors reader class for reading the flat vectors for native engines. The class provides methods for iterating
* over the vectors and retrieving their values.
*/
public class NativeEngines990KnnVectorsReader extends KnnVectorsReader {

private final FlatVectorsReader flatVectorsReader;

public NativeEngines990KnnVectorsReader(final SegmentReadState state, final FlatVectorsReader flatVectorsReader) {
this.flatVectorsReader = flatVectorsReader;
}

/**
* Checks consistency of this reader.
*
* <p>Note that this may be costly in terms of I/O, e.g. may involve computing a checksum value
* against large data files.
*
*/
@Override
public void checkIntegrity() throws IOException {
flatVectorsReader.checkIntegrity();
}

/**
* Returns the {@link FloatVectorValues} for the given {@code field}. The behavior is undefined if
* the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is
* never {@code null}.
*
* @param field {@link String}
*/
@Override
public FloatVectorValues getFloatVectorValues(final String field) throws IOException {
return flatVectorsReader.getFloatVectorValues(field);
}

/**
* Returns the {@link ByteVectorValues} for the given {@code field}. The behavior is undefined if
* the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is
* never {@code null}.
*
* @param field {@link String}
*/
@Override
public ByteVectorValues getByteVectorValues(final String field) throws IOException {
return flatVectorsReader.getByteVectorValues(field);
}

/**
* Return the k nearest neighbor documents as determined by comparison of their vector values for
* this field, to the given vector, by the field's similarity function. The score of each document
* is derived from the vector similarity in a way that ensures scores are positive and that a
* larger score corresponds to a higher ranking.
*
* <p>The search is allowed to be approximate, meaning the results are not guaranteed to be the
* true k closest neighbors. For large values of k (for example when k is close to the total
* number of documents), the search may also retrieve fewer than k documents.
*
* <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in
* order of their similarity to the query vector (decreasing scores). The {@link TotalHits}
* contains the number of documents visited during the search. If the search stopped early because
* it hit {@code visitedLimit}, it is indicated through the relation {@code
* TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}.
*
* <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link
* FieldInfo}. The return value is never {@code null}.
*
* @param field the vector field to search
* @param target the vector-valued query
* @param knnCollector a KnnResults collector and relevant settings for gathering vector results
* @param acceptDocs {@link Bits} that represents the allowed documents to match, or {@code null}
* if they are all allowed to match.
*/
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
throw new UnsupportedOperationException("Search functionality using codec is not supported with Native Engine Reader");
navneet1v marked this conversation as resolved.
Show resolved Hide resolved
}

/**
navneet1v marked this conversation as resolved.
Show resolved Hide resolved
* Return the k nearest neighbor documents as determined by comparison of their vector values for
* this field, to the given vector, by the field's similarity function. The score of each document
* is derived from the vector similarity in a way that ensures scores are positive and that a
* larger score corresponds to a higher ranking.
*
* <p>The search is allowed to be approximate, meaning the results are not guaranteed to be the
* true k closest neighbors. For large values of k (for example when k is close to the total
* number of documents), the search may also retrieve fewer than k documents.
*
* <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in
* order of their similarity to the query vector (decreasing scores). The {@link TotalHits}
* contains the number of documents visited during the search. If the search stopped early because
* it hit {@code visitedLimit}, it is indicated through the relation {@code
* TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}.
*
* <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link
* FieldInfo}. The return value is never {@code null}.
*
* @param field the vector field to search
* @param target the vector-valued query
* @param knnCollector a KnnResults collector and relevant settings for gathering vector results
* @param acceptDocs {@link Bits} that represents the allowed documents to match, or {@code null}
* if they are all allowed to match.
*/
@Override
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
throw new UnsupportedOperationException("Search functionality using codec is not supported with Native Engine Reader");
}

/**
* Closes this stream and releases any system resources associated
* with it. If the stream is already closed then invoking this
* method has no effect.
*
* <p> As noted in {@link AutoCloseable#close()}, cases where the
* close may fail require careful attention. It is strongly advised
* to relinquish the underlying resources and to internally
* <em>mark</em> the {@code Closeable} as closed, prior to throwing
* the {@code IOException}.
*
* @throws IOException if an I/O error occurs
*/
@Override
public void close() throws IOException {
IOUtils.close(flatVectorsReader);
jmazanec15 marked this conversation as resolved.
Show resolved Hide resolved
}

/**
* Return the memory usage of this object in bytes. Negative values are illegal.
*/
@Override
public long ramBytesUsed() {
return flatVectorsReader.ramBytesUsed();
}
}
Loading
Loading