Tests for the `normalize_embeddings` setting: adds LanguageBind to the vectorise
round-trip test and checks the L2 norm of stored embeddings on indexes created
with and without embedding normalisation.

diff --git a/tests/s2_inference/test_encoding.py b/tests/s2_inference/test_encoding.py
index 4843bc85b..53562a0f9 100644
--- a/tests/s2_inference/test_encoding.py
+++ b/tests/s2_inference/test_encoding.py
@@ -17,7 +17,6 @@
 _load_model = functools.partial(og_load_model, calling_func = "unit_test")
 
 
-
 class TestEncoding(unittest.TestCase):
 
     def setUp(self) -> None:
@@ -38,7 +37,9 @@ def test_vectorize(self):
 
         names_snowflake = ["hf/snowflake-arctic-embed-m", "hf/snowflake-arctic-embed-m-v1.5"]
 
-        names = names + names_e5 + names_bge + names_snowflake
+        language_bind_models = ["LanguageBind/Video_V1.5_FT"]
+
+        names = names + names_e5 + names_bge + names_snowflake + language_bind_models
 
         sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
         device = 'cpu'
@@ -49,15 +50,20 @@ def test_vectorize(self):
             model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
 
             for sentence in sentences:
-                for normalize_embeddings in [True, False]:
-                    output_v = vectorise(name, sentence, model_properties, device,
-                                         normalize_embeddings=normalize_embeddings)
-
-                    assert _check_output_type(output_v)
-
-                    output_m = model.encode(sentence, normalize=normalize_embeddings)
-
-                    assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+                assert _check_output_type(output_v)
+                output_m = model.encode(sentence, normalize=True)
+                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+                for vector in output_v:  # normalised vectors must have unit L2 norm
+                    assert abs(np.linalg.norm(np.array(vector)) - 1) < 1e-5
+
+                output_v_unnormalised = vectorise(name, sentence, model_properties, device, normalize_embeddings=False)
+                assert _check_output_type(output_v_unnormalised)  # check the unnormalised output, not output_v again
+                output_m_unnormalised = model.encode(sentence, normalize=False)
+                assert abs(torch.FloatTensor(output_v_unnormalised) - torch.FloatTensor(output_m_unnormalised)).sum() < eps
+
+                for vector in output_v_unnormalised:  # raw vectors must NOT have unit L2 norm
+                    assert abs(np.linalg.norm(np.array(vector)) - 1) > 1e-5
 
             clear_loaded_models()
 
diff --git a/tests/tensor_search/integ_tests/test_add_documents_combined.py b/tests/tensor_search/integ_tests/test_add_documents_combined.py
index 9d80e0fb1..e50f9b4db 100644
--- a/tests/tensor_search/integ_tests/test_add_documents_combined.py
+++ b/tests/tensor_search/integ_tests/test_add_documents_combined.py
@@ -2,6 +2,8 @@
 import uuid
 from unittest import mock
 from unittest.mock import patch
+
+import numpy as np
 import pytest
 import torch
 
@@ -9,6 +11,8 @@
 import PIL
 import requests
 import torch
+from sentence_transformers.util import normalize_embeddings  # NOTE(review): not referenced by this patch - confirm it is needed
+from sklearn.metrics.pairwise import distance_metrics  # NOTE(review): not referenced by this patch - confirm it is needed
 from torch import Tensor
 from urllib3.exceptions import ProtocolError
 import unittest.mock
@@ -49,6 +53,33 @@ def setUpClass(cls) -> None:
             tensor_fields=["image_field_1", "text_field_1", "multimodal_field"]
         )
 
+        structured_image_index_request_unnormalized = cls.structured_marqo_index_request(
+            name="structured_image_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
+            fields=[
+                FieldRequest(name="image_field_1", type=FieldType.ImagePointer),
+                FieldRequest(name="text_field_1", type=FieldType.Text,
+                             features=[FieldFeature.Filter, FieldFeature.LexicalSearch]),
+            ],
+            model=Model(name="open_clip/ViT-B-32/laion2b_s34b_b79k"),
+            tensor_fields=["image_field_1", "text_field_1"],
+            normalize_embeddings=False,
+            distance_metric=DistanceMetric.DotProduct
+        )
+
+        structured_text_index_request_unnormalized = cls.structured_marqo_index_request(
+            name="structured_text_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
+            fields=[
+                FieldRequest(
+                    name="text_field_1", type=FieldType.Text,
+                    features=[FieldFeature.Filter, FieldFeature.LexicalSearch]
+                ),
+            ],
+            model=Model(name="hf/e5-base-v2"),
+            tensor_fields=["text_field_1"],
+            normalize_embeddings=False,
+            distance_metric=DistanceMetric.DotProduct
+        )
+
         structured_languagebind_index_request = cls.structured_marqo_index_request(
             name="my-multimodal-index" + str(uuid.uuid4()).replace('-', ''),
             fields=[
@@ -94,17 +125,41 @@ def setUpClass(cls) -> None:
             treat_urls_and_pointers_as_media=True
         )
 
+        unstructured_image_index_request_unnormalized = cls.unstructured_marqo_index_request(  # NOTE(review): no treat_urls_and_pointers_as_* flag is passed - confirm image URLs are embedded as images
+            name="unstructured_image_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
+            model=Model(name="open_clip/ViT-B-32/laion2b_s34b_b79k"),
+            normalize_embeddings=False,
+            distance_metric=DistanceMetric.DotProduct
+        )
+
+        unstructured_text_index_request_unnormalized = cls.unstructured_marqo_index_request(
+            name="unstructured_text_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
+            model=Model(name="hf/e5-base-v2"),
+            normalize_embeddings=False,
+            distance_metric=DistanceMetric.DotProduct
+        )
+
         cls.indexes = cls.create_indexes([
             structured_image_index_request,
             structured_languagebind_index_request,
+            structured_image_index_request_unnormalized,
+            structured_text_index_request_unnormalized,
+
             unstructured_image_index_request,
-            unstructured_languagebind_index_request
+            unstructured_languagebind_index_request,
+            unstructured_image_index_request_unnormalized,
+            unstructured_text_index_request_unnormalized
         ])
 
         cls.structured_marqo_index_name = structured_image_index_request.name
         cls.structured_languagebind_index_name = structured_languagebind_index_request.name
+        cls.structured_image_index_unnormalized_name = structured_image_index_request_unnormalized.name
+        cls.structured_text_index_unnormalized_name = structured_text_index_request_unnormalized.name
+
         cls.unstructured_marqo_index_name = unstructured_image_index_request.name
         cls.unstructured_languagebind_index_name = unstructured_languagebind_index_request.name
+        cls.unstructured_image_index_unnormalized_name = unstructured_image_index_request_unnormalized.name
+        cls.unstructured_text_index_unnormalized_name = unstructured_text_index_request_unnormalized.name
 
     def setUp(self) -> None:
         super().setUp()
@@ -807,4 +862,104 @@ def test_no_extension_image_url_infer_modality(self):
         """this test ensures that the image url with no extension is correctly inferred as an image"""
         image_url_no_extension = "https://il.redbubble.net/catalogue/image/by-rb-work/157037551/simple-preview"
         modality = infer_modality(image_url_no_extension)
-        self.assertEqual(modality, streaming_media_processor.Modality.IMAGE)
\ No newline at end of file
+        self.assertEqual(modality, streaming_media_processor.Modality.IMAGE)
+
+    def test_imageIndexEmbeddingsUnnormalised(self):
+        """Stored image embeddings must not have unit L2 norm when the index sets normalize_embeddings=False."""
+        documents = [
+            {
+                "image_field_1": TestImageUrls.HIPPO_REALISTIC.value,
+                "_id": "1"
+            }
+        ]
+        for index_name in [self.unstructured_image_index_unnormalized_name, self.structured_image_index_unnormalized_name]:
+            tensor_fields = ["image_field_1"] if index_name == self.unstructured_image_index_unnormalized_name \
+                else None
+            with self.subTest(index_name):
+                res = tensor_search.add_documents(
+                    self.config,
+                    add_docs_params=AddDocsParams(
+                        docs=documents,
+                        index_name=index_name,
+                        tensor_fields=tensor_fields
+                    )
+                )
+                for item in res.dict(exclude_none=True, by_alias=True)['items']:
+                    self.assertEqual(200, item['status'])
+
+                get_res = tensor_search.get_documents_by_ids(
+                    config=self.config, index_name=index_name,
+                    document_ids=["1"],
+                    show_vectors=True
+                ).dict(exclude_none=True, by_alias=True)
+
+                embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
+                norm = np.linalg.norm(np.array(embeddings))
+                self.assertNotAlmostEqual(1.0, norm, delta=1e-5, msg=f"Embedding norm is {norm}")  # two-sided: norm may be above or below 1
+
+    def test_imageIndexEmbeddingsNormalised(self):
+        """Stored image embeddings must have unit L2 norm when the index normalises embeddings."""
+
+        documents = [
+            {
+                "image_field_1": TestImageUrls.HIPPO_REALISTIC.value,
+                "_id": "1"
+            }
+        ]
+        for index_name in [self.unstructured_marqo_index_name, self.structured_marqo_index_name]:
+            tensor_fields = ["image_field_1"] if index_name == self.unstructured_marqo_index_name \
+                else None
+            with self.subTest(index_name):
+                res = tensor_search.add_documents(
+                    self.config,
+                    add_docs_params=AddDocsParams(
+                        docs=documents,
+                        index_name=index_name,
+                        tensor_fields=tensor_fields
+                    )
+                )
+                for item in res.dict(exclude_none=True, by_alias=True)['items']:
+                    self.assertEqual(200, item['status'])
+
+                get_res = tensor_search.get_documents_by_ids(
+                    config=self.config, index_name=index_name,
+                    document_ids=["1"],
+                    show_vectors=True
+                ).dict(exclude_none=True, by_alias=True)
+
+                embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
+                norm = np.linalg.norm(np.array(embeddings))
+                self.assertAlmostEqual(1.0, norm, delta=1e-5, msg=f"Embedding norm is {norm}")  # two-sided: a norm well below 1 must fail too
+
+    def test_textIndexEmbeddingsUnnormalized(self):
+        """Stored text embeddings must not have unit L2 norm when the index sets normalize_embeddings=False."""
+        documents = [
+            {
+                "text_field_1": "This is a test text",
+                "_id": "1"
+            }
+        ]
+        for index_name in [self.unstructured_text_index_unnormalized_name, self.structured_text_index_unnormalized_name]:
+            tensor_fields = ["text_field_1"] if index_name == self.unstructured_text_index_unnormalized_name \
+                else None
+            with self.subTest(index_name):
+                res = tensor_search.add_documents(
+                    self.config,
+                    add_docs_params=AddDocsParams(
+                        docs=documents,
+                        index_name=index_name,
+                        tensor_fields=tensor_fields
+                    )
+                )
+                for item in res.dict(exclude_none=True, by_alias=True)['items']:
+                    self.assertEqual(200, item['status'])
+
+                get_res = tensor_search.get_documents_by_ids(
+                    config=self.config, index_name=index_name,
+                    document_ids=["1"],
+                    show_vectors=True
+                ).dict(exclude_none=True, by_alias=True)
+
+                embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
+                norm = np.linalg.norm(np.array(embeddings))
+                self.assertNotAlmostEqual(1.0, norm, delta=1e-5, msg=f"Embedding norm is {norm}")  # two-sided: norm may be above or below 1
\ No newline at end of file