opensearch-project · navneet1v · Jul 24, 2024 · Jul 16, 2024
@@ -0,0 +1,107 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ *
+ * Modifications Copyright OpenSearch Contributors. See
+ * GitHub history for details.
+ */
+
+package org.opensearch.knn.index.codec.KNN990Codec;
+
+import lombok.Getter;
+import org.apache.lucene.codecs.KnnFieldVectorsWriter;
+import org.apache.lucene.index.DocsWithFieldSet;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.util.InfoStream;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * NativeEngineVectorFieldsWriter is a class that will be used to accumulate all the vectors during ingestion before
+ * lucene does a flush. This class ensures that KNNVectorWriter is free from generics and this class can encapsulate
+ * all the details related to vectors types and docIds.
+ *
+ * @param <T> float[] or byte[]
+ */
+class NativeEngineFieldVectorsWriter<T> extends KnnFieldVectorsWriter<T> {
+    private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(NativeEngineFieldVectorsWriter.class);
+    private final FieldInfo fieldInfo;
+    /**
+     * We are using a map here instead of list, because for sampler interface for quantization we have to advance the iterator
+     * to a specific docId, there a list cannot be useful because a docId != index of the vector in the list. Similar
+     * thing is true when we have vector field in child document. There doc Ids will not be consistent. Hence, we need to
+     * use the map here.
+     */
+    @Getter
+    private final Map<Integer, T> vectors;
+    private int lastDocID = -1;
+    @Getter
+    private final DocsWithFieldSet docsWithField;
+    private final InfoStream infoStream;
+
+    static NativeEngineFieldVectorsWriter<?> create(final FieldInfo fieldInfo, final InfoStream infoStream) {
+        switch (fieldInfo.getVectorEncoding()) {
+            case FLOAT32:
+                return new NativeEngineFieldVectorsWriter<float[]>(fieldInfo, infoStream);
+            case BYTE:
+                return new NativeEngineFieldVectorsWriter<byte[]>(fieldInfo, infoStream);
+        }
+        throw new IllegalStateException("Unsupported Vector encoding : " + fieldInfo.getVectorEncoding());
+    }
+
+    NativeEngineFieldVectorsWriter(final FieldInfo fieldInfo, final InfoStream infoStream) {
+        this.fieldInfo = fieldInfo;
+        this.infoStream = infoStream;
+        vectors = new HashMap<>();
+        this.docsWithField = new DocsWithFieldSet();
+    }
+
+    /**
+     * Add new docID with its vector value to the given field for indexing. Doc IDs must be added in
+     * increasing order.
+     *
+     * @param docID int
+     * @param vectorValue T
+     */
+    @Override
+    public void addValue(int docID, T vectorValue) {
+        if (docID == lastDocID) {
+            throw new IllegalArgumentException(
+                "[NativeEngineKNNVectorWriter]VectorValuesField \""
+                    + fieldInfo.name
+                    + "\" appears more than once in this document (only one value is allowed per field)"
+            );
+        }
+        assert docID > lastDocID;
+        vectors.put(docID, vectorValue);
+        docsWithField.add(docID);
+        lastDocID = docID;
+    }
+
+    /**
+     * Used to copy values being indexed to internal storage.
+     *
+     * @param vectorValue an array containing the vector value to add
+     * @return a copy of the value; a new array
+     */
+    @Override
+    public T copyValue(T vectorValue) {
+        throw new UnsupportedOperationException("NativeEngineVectorFieldsWriter doesn't support copyValue operation");
+    }
+
+    /**
+     * Return the memory usage of this object in bytes. Negative values are illegal.
+     */
+    @Override
+    public long ramBytesUsed() {
+        return SHALLOW_SIZE + docsWithField.ramBytesUsed() + (long) this.vectors.size() * (long) (RamUsageEstimator.NUM_BYTES_OBJECT_REF
+            + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER) + (long) this.vectors.size() * RamUsageEstimator.shallowSizeOfInstance(
+                Integer.class
+            ) + (long) vectors.size() * fieldInfo.getVectorDimension() * fieldInfo.getVectorEncoding().byteSize;
+    }
+}
@@ -0,0 +1,68 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ *
+ * Modifications Copyright OpenSearch Contributors. See
+ * GitHub history for details.
+ */
+
+package org.opensearch.knn.index.codec.KNN990Codec;
+
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
+import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+import java.io.IOException;
+
+/**
+ * This is a Vector format that will be used for Native engines like Faiss and Nmslib for reading and writing vector
+ * related data structures.
+ */
+public class NativeEngines990KnnVectorsFormat extends KnnVectorsFormat {
+    /** The format for storing, reading, merging vectors on disk */
+    private static FlatVectorsFormat flatVectorsFormat;
+    private static final String FORMAT_NAME = "NativeEngines99KnnVectorsFormat";
+
+    public NativeEngines990KnnVectorsFormat() {
+        super(FORMAT_NAME);
+        flatVectorsFormat = new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer());
+    }
+
+    public NativeEngines990KnnVectorsFormat(final FlatVectorsFormat lucene99FlatVectorsFormat) {
+        super(FORMAT_NAME);
+        flatVectorsFormat = lucene99FlatVectorsFormat;
+    }
+
+    /**
+     * Returns a {@link KnnVectorsWriter} to write the vectors to the index.
+     *
+     * @param state {@link SegmentWriteState}
+     */
+    @Override
+    public KnnVectorsWriter fieldsWriter(final SegmentWriteState state) throws IOException {
+        return new NativeEngines990KnnVectorsWriter(state, flatVectorsFormat.fieldsWriter(state));
+    }
+
+    /**
+     * Returns a {@link KnnVectorsReader} to read the vectors from the index.
+     *
+     * @param state {@link SegmentReadState}
+     */
+    @Override
+    public KnnVectorsReader fieldsReader(final SegmentReadState state) throws IOException {
+        return new NativeEngines990KnnVectorsReader(state, flatVectorsFormat.fieldsReader(state));
+    }
+
+    @Override
+    public String toString() {
+        return "NativeEngines99KnnVectorsFormat(name=" + this.getClass().getSimpleName() + ", flatVectorsFormat=" + flatVectorsFormat + ")";
+    }
+}
@@ -0,0 +1,162 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ *
+ * Modifications Copyright OpenSearch Contributors. See
+ * GitHub history for details.
+ */
+
+package org.opensearch.knn.index.codec.KNN990Codec;
+
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.search.KnnCollector;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TotalHits;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.IOUtils;
+
+import java.io.IOException;
+
+/**
+ * Vectors reader class for reading the flat vectors for native engines. The class provides methods for iterating
+ * over the vectors and retrieving their values.
+ */
+public class NativeEngines990KnnVectorsReader extends KnnVectorsReader {
+
+    private final FlatVectorsReader flatVectorsReader;
+
+    public NativeEngines990KnnVectorsReader(final SegmentReadState state, final FlatVectorsReader flatVectorsReader) {
+        this.flatVectorsReader = flatVectorsReader;
+    }
+
+    /**
+     * Checks consistency of this reader.
+     *
+     * <p>Note that this may be costly in terms of I/O, e.g. may involve computing a checksum value
+     * against large data files.
+     *
+     */
+    @Override
+    public void checkIntegrity() throws IOException {
+        flatVectorsReader.checkIntegrity();
+    }
+
+    /**
+     * Returns the {@link FloatVectorValues} for the given {@code field}. The behavior is undefined if
+     * the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is
+     * never {@code null}.
+     *
+     * @param field {@link String}
+     */
+    @Override
+    public FloatVectorValues getFloatVectorValues(final String field) throws IOException {
+        return flatVectorsReader.getFloatVectorValues(field);
+    }
+
+    /**
+     * Returns the {@link ByteVectorValues} for the given {@code field}. The behavior is undefined if
+     * the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is
+     * never {@code null}.
+     *
+     * @param field {@link String}
+     */
+    @Override
+    public ByteVectorValues getByteVectorValues(final String field) throws IOException {
+        return flatVectorsReader.getByteVectorValues(field);
+    }
+
+    /**
+     * Return the k nearest neighbor documents as determined by comparison of their vector values for
+     * this field, to the given vector, by the field's similarity function. The score of each document
+     * is derived from the vector similarity in a way that ensures scores are positive and that a
+     * larger score corresponds to a higher ranking.
+     *
+     * <p>The search is allowed to be approximate, meaning the results are not guaranteed to be the
+     * true k closest neighbors. For large values of k (for example when k is close to the total
+     * number of documents), the search may also retrieve fewer than k documents.
+     *
+     * <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in
+     * order of their similarity to the query vector (decreasing scores). The {@link TotalHits}
+     * contains the number of documents visited during the search. If the search stopped early because
+     * it hit {@code visitedLimit}, it is indicated through the relation {@code
+     * TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}.
+     *
+     * <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link
+     * FieldInfo}. The return value is never {@code null}.
+     *
+     * @param field        the vector field to search
+     * @param target       the vector-valued query
+     * @param knnCollector a KnnResults collector and relevant settings for gathering vector results
+     * @param acceptDocs   {@link Bits} that represents the allowed documents to match, or {@code null}
+     *                     if they are all allowed to match.
+     */
+    @Override
+    public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
+        throw new UnsupportedOperationException("Search functionality using codec is not supported with Native Engine Reader");
+    }
+
+    /**
+     * Return the k nearest neighbor documents as determined by comparison of their vector values for
+     * this field, to the given vector, by the field's similarity function. The score of each document
+     * is derived from the vector similarity in a way that ensures scores are positive and that a
+     * larger score corresponds to a higher ranking.
+     *
+     * <p>The search is allowed to be approximate, meaning the results are not guaranteed to be the
+     * true k closest neighbors. For large values of k (for example when k is close to the total
+     * number of documents), the search may also retrieve fewer than k documents.
+     *
+     * <p>The returned {@link TopDocs} will contain a {@link ScoreDoc} for each nearest neighbor, in
+     * order of their similarity to the query vector (decreasing scores). The {@link TotalHits}
+     * contains the number of documents visited during the search. If the search stopped early because
+     * it hit {@code visitedLimit}, it is indicated through the relation {@code
+     * TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO}.
+     *
+     * <p>The behavior is undefined if the given field doesn't have KNN vectors enabled on its {@link
+     * FieldInfo}. The return value is never {@code null}.
+     *
+     * @param field        the vector field to search
+     * @param target       the vector-valued query
+     * @param knnCollector a KnnResults collector and relevant settings for gathering vector results
+     * @param acceptDocs   {@link Bits} that represents the allowed documents to match, or {@code null}
+     *                     if they are all allowed to match.
+     */
+    @Override
+    public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) throws IOException {
+        throw new UnsupportedOperationException("Search functionality using codec is not supported with Native Engine Reader");
+    }
+
+    /**
+     * Closes this stream and releases any system resources associated
+     * with it. If the stream is already closed then invoking this
+     * method has no effect.
+     *
+     * <p> As noted in {@link AutoCloseable#close()}, cases where the
+     * close may fail require careful attention. It is strongly advised
+     * to relinquish the underlying resources and to internally
+     * <em>mark</em> the {@code Closeable} as closed, prior to throwing
+     * the {@code IOException}.
+     *
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public void close() throws IOException {
+        IOUtils.close(flatVectorsReader);
+    }
+
+    /**
+     * Return the memory usage of this object in bytes. Negative values are illegal.
+     */
+    @Override
+    public long ramBytesUsed() {
+        return flatVectorsReader.ramBytesUsed();
+    }
+}