Skip to content

Commit

Permalink
feat: FAISS in OpenSearch: Support HNSW for dot product and l2 (#3029)
Browse files Browse the repository at this point in the history
* support faiss hnsw

* blacken

* update docs

* improve similarity check

* add tests

* update schema

* set ef_search param correctly

* Apply suggestions from code review

Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>

* regenerate docs

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
Co-authored-by: Agnieszka Marzec <97166305+agnieszka-m@users.noreply.github.com>
  • Loading branch information
3 people authored Aug 24, 2022
1 parent 9b1b030 commit 92046ce
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 3 deletions.
4 changes: 3 additions & 1 deletion docs/_src/api/api/document_store.md
Original file line number Diff line number Diff line change
Expand Up @@ -1463,7 +1463,7 @@ class OpenSearchDocumentStore(BaseElasticsearchDocumentStore)
#### OpenSearchDocumentStore.\_\_init\_\_

```python
def __init__(scheme: str = "https", username: str = "admin", password: str = "admin", host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", ca_certs: Optional[str] = None, verify_certs: bool = False, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False)
def __init__(scheme: str = "https", username: str = "admin", password: str = "admin", host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", ca_certs: Optional[str] = None, verify_certs: bool = False, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False, knn_engine: str = "nmslib")
```

Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
Expand Down Expand Up @@ -1535,6 +1535,8 @@ More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/ana
- `synonym_type`: The type of synonym filter to use. Possible values: "synonym" or "synonym_graph". Defaults to "synonym".
Use it to handle synonyms, including multi-word synonyms, correctly during the analysis process.
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
- `knn_engine`: The engine that OpenSearch's k-NN plugin uses for the nearest neighbor search. Possible values: "nmslib" or "faiss". Defaults to "nmslib".
If you set it to "faiss", `similarity` must be "dot_product" or "l2".
For more information, see [k-NN Index](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/).

<a id="opensearch.OpenSearchDocumentStore.query_by_embedding"></a>

Expand Down
20 changes: 18 additions & 2 deletions haystack/document_stores/opensearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def __init__(
synonyms: Optional[List] = None,
synonym_type: str = "synonym",
use_system_proxy: bool = False,
knn_engine: str = "nmslib",
):
"""
Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
Expand Down Expand Up @@ -130,6 +131,8 @@ def __init__(
:param synonym_type: The type of synonym filter to use. Possible values: "synonym" or "synonym_graph". Defaults to "synonym".
Use it to handle synonyms, including multi-word synonyms, correctly during the analysis process.
More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html
:param knn_engine: The engine that OpenSearch's k-NN plugin uses for the nearest neighbor search. Possible values: "nmslib" or "faiss". Defaults to "nmslib".
If you set it to "faiss", `similarity` must be "dot_product" or "l2".
For more information, see [k-NN Index](https://opensearch.org/docs/latest/search-plugins/knn/knn-index/).
"""
# These parameters aren't used by Opensearch at the moment but could be in the future, see
# https://github.com/opensearch-project/security/issues/1504. Let's not deprecate them for
Expand Down Expand Up @@ -165,6 +168,15 @@ def __init__(
f"Make sure an Opensearch instance is running at `{host}` and that it has finished booting (can take > 30s)."
)

if knn_engine not in {"nmslib", "faiss"}:
raise ValueError(f"knn_engine must be either 'nmslib' or 'faiss' but was {knn_engine}")

if knn_engine == "faiss" and similarity not in {"dot_product", "l2"}:
raise ValueError(
f"knn_engine=`faiss` was set to similarity {similarity}. Currently, we only support 'dot_product' and 'l2' similarities. Set the similarity to one of the supported values."
)

self.knn_engine = knn_engine
self.embeddings_field_supports_similarity = False
self.similarity_to_space_type = {"cosine": "cosinesimil", "dot_product": "innerproduct", "l2": "l2"}
self.space_type_to_similarity = {v: k for k, v in self.similarity_to_space_type.items()}
Expand Down Expand Up @@ -443,7 +455,7 @@ def _create_document_index(self, index_name: str, headers: Optional[Dict[str, st
f"e.g. `OpenSearchDocumentStore(index='my_new_{self.similarity}_index', similarity='{self.similarity}')`."
)

# Adjust global ef_search setting. If not set, default is 512.
# Adjust global ef_search setting (nmslib only). If not set, default is 512.
ef_search = index_settings.get("knn.algo_param", {"ef_search": 512}).get("ef_search", 512)
if self.index_type == "hnsw" and ef_search != 20:
body = {"knn.algo_param.ef_search": 20}
Expand Down Expand Up @@ -486,6 +498,7 @@ def _create_document_index(self, index_name: str, headers: Optional[Dict[str, st

if self.embedding_field:
index_definition["settings"]["index"] = {"knn": True}
# global ef_search setting affects only nmslib, for faiss it is set in the field mapping
if self.index_type == "hnsw":
index_definition["settings"]["index"]["knn.algo_param.ef_search"] = 20
index_definition["mappings"]["properties"][self.embedding_field] = self._get_embedding_field_mapping(
Expand All @@ -505,14 +518,17 @@ def _create_document_index(self, index_name: str, headers: Optional[Dict[str, st

def _get_embedding_field_mapping(self, similarity: str):
space_type = self.similarity_to_space_type[similarity]
method: dict = {"space_type": space_type, "name": "hnsw", "engine": "nmslib"}
method: dict = {"space_type": space_type, "name": "hnsw", "engine": self.knn_engine}

if self.index_type == "flat":
# use default parameters from https://opensearch.org/docs/1.2/search-plugins/knn/knn-index/
# we need to set them explicitly as aws managed instances starting from version 1.2 do not support empty parameters
method["parameters"] = {"ef_construction": 512, "m": 16}
elif self.index_type == "hnsw":
method["parameters"] = {"ef_construction": 80, "m": 64}
# for nmslib this is a global index setting
if self.knn_engine == "faiss":
method["parameters"]["ef_search"] = 20
else:
logger.error("Please set index_type to either 'flat' or 'hnsw'")

Expand Down
5 changes: 5 additions & 0 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -1661,6 +1661,11 @@
"title": "Use System Proxy",
"default": false,
"type": "boolean"
},
"knn_engine": {
"title": "Knn Engine",
"default": "nmslib",
"type": "string"
}
},
"additionalProperties": false,
Expand Down
49 changes: 49 additions & 0 deletions test/document_stores/test_opensearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ def labels(self, documents):
def test___init__(self):
    """Smoke test: constructing a store with index creation enabled must not raise."""
    OpenSearchDocumentStore(
        index="default_index",
        port=9201,
        create_index=True,
    )

@pytest.mark.integration
def test___init___faiss(self):
    """Smoke test: constructing a store with the faiss k-NN engine must not raise."""
    OpenSearchDocumentStore(
        index="faiss_index",
        port=9201,
        create_index=True,
        knn_engine="faiss",
    )

@pytest.mark.integration
def test_write_documents(self, ds, documents):
ds.write_documents(documents)
Expand Down Expand Up @@ -599,6 +603,35 @@ def test__create_document_index_no_index_no_mapping_with_embedding_field(self, m
}
assert mocked_document_store.embeddings_field_supports_similarity is True

@pytest.mark.unit
def test__create_document_index_no_index_no_mapping_faiss(self, mocked_document_store):
    """Check the index body sent to the client when the engine is faiss and no index exists."""
    # Simulate a missing index so that the create call is issued.
    mocked_document_store.client.indices.exists.return_value = False
    mocked_document_store.knn_engine = "faiss"

    mocked_document_store._create_document_index(self.index_name)

    # Only the keyword arguments of the create call are of interest.
    _args, create_kwargs = mocked_document_store.client.indices.create.call_args

    expected_embedding_mapping = {
        "type": "knn_vector",
        "dimension": 768,
        "method": {
            "space_type": "innerproduct",
            "name": "hnsw",
            "engine": "faiss",
            "parameters": {"ef_construction": 512, "m": 16},
        },
    }
    expected_body = {
        "mappings": {
            "dynamic_templates": [
                {"strings": {"mapping": {"type": "keyword"}, "match_mapping_type": "string", "path_match": "*"}}
            ],
            "properties": {
                "content": {"type": "text"},
                "embedding": expected_embedding_mapping,
                "name": {"type": "keyword"},
            },
        },
        "settings": {
            "analysis": {"analyzer": {"default": {"type": "standard"}}},
            "index": {"knn": True},
        },
    }
    assert create_kwargs["body"] == expected_body

@pytest.mark.unit
def test__create_document_index_client_failure(self, mocked_document_store):
mocked_document_store.client.indices.exists.return_value = False
Expand Down Expand Up @@ -637,6 +670,22 @@ def test__get_embedding_field_mapping_hnsw(self, mocked_document_store):
},
}

@pytest.mark.unit
def test__get_embedding_field_mapping_hnsw_faiss(self, mocked_document_store):
    """Check the embedding field mapping for hnsw + faiss, including the ef_search method parameter."""
    mocked_document_store.index_type = "hnsw"
    mocked_document_store.knn_engine = "faiss"

    field_mapping = mocked_document_store._get_embedding_field_mapping("dot_product")

    expected_method = {
        "engine": "faiss",
        "name": "hnsw",
        "space_type": "innerproduct",
        "parameters": {"ef_construction": 80, "m": 64, "ef_search": 20},
    }
    assert field_mapping == {
        "type": "knn_vector",
        "dimension": 768,
        "method": expected_method,
    }

@pytest.mark.unit
def test__get_embedding_field_mapping_wrong(self, mocked_document_store, caplog):
mocked_document_store.index_type = "foo"
Expand Down

0 comments on commit 92046ce

Please sign in to comment.