Skip to content

Commit

Permalink
Introduced NativeEngineKNNVectorsFormat as a KNNVectorsFormat for Nat…
Browse files Browse the repository at this point in the history
…ive engines

Signed-off-by: Navneet Verma <navneev@amazon.com>
  • Loading branch information
navneet1v committed Jul 18, 2024
1 parent 881364f commit f3894f0
Show file tree
Hide file tree
Showing 9 changed files with 794 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import lombok.Getter;
import org.apache.lucene.codecs.KnnFieldVectorsWriter;
import org.apache.lucene.index.DocsWithFieldSet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.RamUsageEstimator;

import java.util.HashMap;
import java.util.Map;

/**
* NativeEngineVectorFieldsWriter is a class that will be used to accumulate all the vectors during ingestion before
* lucene does a flush. This class ensures that KNNVectorWriter is free from generics and this class can encapsulate
* all the details related to vectors types and docIds.
*
* @param <T> float[] or byte[]
*/
class NativeEngineVectorFieldsWriter<T> extends KnnFieldVectorsWriter<T> {
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(NativeEngineVectorFieldsWriter.class);
private final FieldInfo fieldInfo;
@Getter
private final Map<Integer, T> vectors;
private int lastDocID = -1;

@Getter
private final DocsWithFieldSet docsWithField;
private final InfoStream infoStream;

static NativeEngineVectorFieldsWriter<?> create(final FieldInfo fieldInfo, final InfoStream infoStream) {
switch (fieldInfo.getVectorEncoding()) {
case FLOAT32:
return new NativeEngineVectorFieldsWriter<float[]>(fieldInfo, infoStream);
case BYTE:
return new NativeEngineVectorFieldsWriter<byte[]>(fieldInfo, infoStream);
}
throw new IllegalStateException("Unsupported Vector encoding : " + fieldInfo.getVectorEncoding());
}

NativeEngineVectorFieldsWriter(final FieldInfo fieldInfo, final InfoStream infoStream) {
this.fieldInfo = fieldInfo;
this.infoStream = infoStream;
vectors = new HashMap<>();
this.docsWithField = new DocsWithFieldSet();
}

/**
* Add new docID with its vector value to the given field for indexing. Doc IDs must be added in
* increasing order.
*
* @param docID int
* @param vectorValue T
*/
@Override
public void addValue(int docID, T vectorValue) {
if (docID == lastDocID) {
throw new IllegalArgumentException(
"[NativeEngineKNNVectorWriter]VectorValuesField \""
+ fieldInfo.name
+ "\" appears more than once in this document (only one value is allowed per field)"
);
}
assert docID > lastDocID;
vectors.put(docID, vectorValue);
docsWithField.add(docID);
lastDocID = docID;
}

/**
* Used to copy values being indexed to internal storage.
*
* @param vectorValue an array containing the vector value to add
* @return a copy of the value; a new array
*/
@Override
public T copyValue(T vectorValue) {
throw new UnsupportedOperationException("NativeEngineVectorFieldsWriter doesn't support copyValue operation");
}

/**
* Return the memory usage of this object in bytes. Negative values are illegal.
*/
@Override
public long ramBytesUsed() {
return SHALLOW_SIZE + docsWithField.ramBytesUsed() + (long) this.vectors.size() * (long) (RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ RamUsageEstimator.NUM_BYTES_ARRAY_HEADER) + (long) this.vectors.size() * RamUsageEstimator.shallowSizeOfInstance(
Integer.class
) + (long) vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

import java.io.IOException;

/**
* This is a Vector format that will be used for Native engines like Faiss and Nmslib for reading and writing vector
* related data structures.
*/
public class NativeEngines990KnnVectorsFormat extends KnnVectorsFormat {
/** The format for storing, reading, merging vectors on disk */
private static FlatVectorsFormat flatVectorsFormat;
private static final String FORMAT_NAME = "NativeEngines99KnnVectorsFormat";

public NativeEngines990KnnVectorsFormat() {
super(FORMAT_NAME);
flatVectorsFormat = new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer());
}

public NativeEngines990KnnVectorsFormat(final Lucene99FlatVectorsFormat lucene99FlatVectorsFormat) {
super(FORMAT_NAME);
flatVectorsFormat = lucene99FlatVectorsFormat;
}

/**
* Returns a {@link KnnVectorsWriter} to write the vectors to the index.
*
* @param state {@link SegmentWriteState}
*/
@Override
public KnnVectorsWriter fieldsWriter(final SegmentWriteState state) throws IOException {
return new NativeEngines990KnnVectorsWriter(state, flatVectorsFormat.fieldsWriter(state));
}

/**
* Returns a {@link KnnVectorsReader} to read the vectors from the index.
*
* @param state {@link SegmentReadState}
*/
@Override
public KnnVectorsReader fieldsReader(final SegmentReadState state) throws IOException {
return new NativeEngines990KnnVectorsReader(state, flatVectorsFormat.fieldsReader(state));
}

@Override
public String toString() {
return "NativeEngines99KnnVectorsFormat(name=NativeEngines99KnnVectorsFormat, flatVectorsFormat=" + flatVectorsFormat + ")";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/

package org.opensearch.knn.index.codec.KNN990Codec;

import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;

import java.io.IOException;

/**
* Vectors reader class for reading the flat vectors for native engines. The class provides methods for iterating
* over the vectors and retrieving their values.
*/
public class NativeEngines990KnnVectorsReader extends KnnVectorsReader {

private final FlatVectorsReader flatVectorsReader;

public NativeEngines990KnnVectorsReader(final SegmentReadState state, final FlatVectorsReader flatVectorsReader) {
this.flatVectorsReader = flatVectorsReader;
}

/**
* Checks consistency of this reader.
*
* <p>Note that this may be costly in terms of I/O, e.g. may involve computing a checksum value
* against large data files.
*
*/
@Override
public void checkIntegrity() throws IOException {
flatVectorsReader.checkIntegrity();
}

/**
* Returns the {@link FloatVectorValues} for the given {@code field}. The behavior is undefined if
* the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is
* never {@code null}.
*
* @param field {@link String}
*/
@Override
public FloatVectorValues getFloatVectorValues(final String field) throws IOException {
return flatVectorsReader.getFloatVectorValues(field);
}

/**
* Returns the {@link ByteVectorValues} for the given {@code field}. The behavior is undefined if
* the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is
* never {@code null}.
*
* @param field {@link String}
*/
@Override
public ByteVectorValues getByteVectorValues(final String field) throws IOException {
return flatVectorsReader.getByteVectorValues(field);
}

/**
* Return the k nearest neighbor documents as determined by comparison of their vector values for
* this field, to the given vector, by the field's similarity function. The score of each document
* is derived from the vector similarity in a way that ensures scores are positive and that a
* larger score corresponds to a higher ranking.
*
* <p>The search is allowed to be approximate, meaning the results are not guaranteed to be the
* true k closest neighbors. For large values of k (for example when k is close to the total
* number of documents), the search may also retrieve fewer than k documents.
*
* <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in
* order of their similarity to the query vector (decreasing scores). The {@link TotalHits}
* contains the number of documents visited during the search. If the search stopped early because
* it hit {@code visitedLimit}, it is indicated through the relation {@code
* TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}.
*
* <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link
* FieldInfo}. The return value is never {@code null}.
*
* @param field the vector field to search
* @param target the vector-valued query
* @param knnCollector a KnnResults collector and relevant settings for gathering vector results
* @param acceptDocs {@link Bits} that represents the allowed documents to match, or {@code null}
* if they are all allowed to match.
*/
@Override
public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
throw new UnsupportedOperationException("Search functionality using codec is not supported with Native Engine Reader");
}

/**
* Return the k nearest neighbor documents as determined by comparison of their vector values for
* this field, to the given vector, by the field's similarity function. The score of each document
* is derived from the vector similarity in a way that ensures scores are positive and that a
* larger score corresponds to a higher ranking.
*
* <p>The search is allowed to be approximate, meaning the results are not guaranteed to be the
* true k closest neighbors. For large values of k (for example when k is close to the total
* number of documents), the search may also retrieve fewer than k documents.
*
* <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in
* order of their similarity to the query vector (decreasing scores). The {@link TotalHits}
* contains the number of documents visited during the search. If the search stopped early because
* it hit {@code visitedLimit}, it is indicated through the relation {@code
* TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}.
*
* <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link
* FieldInfo}. The return value is never {@code null}.
*
* @param field the vector field to search
* @param target the vector-valued query
* @param knnCollector a KnnResults collector and relevant settings for gathering vector results
* @param acceptDocs {@link Bits} that represents the allowed documents to match, or {@code null}
* if they are all allowed to match.
*/
@Override
public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
throw new UnsupportedOperationException("Search functionality using codec is not supported with Native Engine Reader");
}

/**
* Closes this stream and releases any system resources associated
* with it. If the stream is already closed then invoking this
* method has no effect.
*
* <p> As noted in {@link AutoCloseable#close()}, cases where the
* close may fail require careful attention. It is strongly advised
* to relinquish the underlying resources and to internally
* <em>mark</em> the {@code Closeable} as closed, prior to throwing
* the {@code IOException}.
*
* @throws IOException if an I/O error occurs
*/
@Override
public void close() throws IOException {
IOUtils.close(flatVectorsReader);
}

/**
* Return the memory usage of this object in bytes. Negative values are illegal.
*/
@Override
public long ramBytesUsed() {
return flatVectorsReader.ramBytesUsed();
}
}
Loading

0 comments on commit f3894f0

Please sign in to comment.