Skip to content

Commit

Permalink
Merge li's PR into this
Browse files Browse the repository at this point in the history
  • Loading branch information
wanliAlex committed Oct 9, 2024
2 parents cee2d13 + a417145 commit 1624d20
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 13 deletions.
28 changes: 17 additions & 11 deletions tests/s2_inference/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

_load_model = functools.partial(og_load_model, calling_func = "unit_test")


class TestEncoding(unittest.TestCase):

def setUp(self) -> None:
Expand All @@ -38,7 +37,9 @@ def test_vectorize(self):

names_snowflake = ["hf/snowflake-arctic-embed-m", "hf/snowflake-arctic-embed-m-v1.5"]

names = names + names_e5 + names_bge + names_snowflake
language_bind_models = ["LanguageBind/Video_V1.5_FT"]

names = names + names_e5 + names_bge + names_snowflake + language_bind_models

sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
device = 'cpu'
Expand All @@ -49,15 +50,20 @@ def test_vectorize(self):
model = _load_model(model_properties['name'], model_properties=model_properties, device=device)

for sentence in sentences:
for normalize_embeddings in [True, False]:
output_v = vectorise(name, sentence, model_properties, device,
normalize_embeddings=normalize_embeddings)

assert _check_output_type(output_v)

output_m = model.encode(sentence, normalize=normalize_embeddings)

assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
assert _check_output_type(output_v)
output_m = model.encode(sentence, normalize=True)
assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
for vector in output_v:
assert abs(torch.linalg.norm(np.array(vector)) - 1) < 1e-5

output_v_unnormalised = vectorise(name, sentence, model_properties, device, normalize_embeddings=False)
assert _check_output_type(output_v)
output_m_unnormalised = model.encode(sentence, normalize=False)
assert abs(torch.FloatTensor(output_v_unnormalised) - torch.FloatTensor(output_m_unnormalised)).sum() < eps

for vector in output_v_unnormalised:
assert abs(torch.linalg.norm(np.array(vector)) - 1) > 1e-5

clear_loaded_models()

Expand Down
159 changes: 157 additions & 2 deletions tests/tensor_search/integ_tests/test_add_documents_combined.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,17 @@
import uuid
from unittest import mock
from unittest.mock import patch

import numpy as np
import pytest
import torch


import PIL
import requests
import torch
from sentence_transformers.util import normalize_embeddings
from sklearn.metrics.pairwise import distance_metrics
from torch import Tensor
from urllib3.exceptions import ProtocolError
import unittest.mock
Expand Down Expand Up @@ -49,6 +53,33 @@ def setUpClass(cls) -> None:
tensor_fields=["image_field_1", "text_field_1", "multimodal_field"]
)

structured_image_index_request_unnormalized = cls.structured_marqo_index_request(
name="structured_image_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
fields=[
FieldRequest(name="image_field_1", type=FieldType.ImagePointer),
FieldRequest(name="text_field_1", type=FieldType.Text,
features=[FieldFeature.Filter, FieldFeature.LexicalSearch]),
],
model=Model(name="open_clip/ViT-B-32/laion2b_s34b_b79k"),
tensor_fields=["image_field_1", "text_field_1"],
normalize_embeddings=False,
distance_metric=DistanceMetric.DotProduct
)

structured_text_index_request_unnormalized = cls.structured_marqo_index_request(
name="structured_image_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
fields=[
FieldRequest(
name="text_field_1", type=FieldType.Text,
features=[FieldFeature.Filter, FieldFeature.LexicalSearch]
),
],
model=Model(name="hf/e5-base-v2"),
tensor_fields=["text_field_1"],
normalize_embeddings=False,
distance_metric=DistanceMetric.DotProduct
)

structured_languagebind_index_request = cls.structured_marqo_index_request(
name="my-multimodal-index" + str(uuid.uuid4()).replace('-', ''),
fields=[
Expand Down Expand Up @@ -94,17 +125,41 @@ def setUpClass(cls) -> None:
treat_urls_and_pointers_as_media=True
)

unstructured_image_index_request_unnormalized = cls.unstructured_marqo_index_request(
name="unstructured_image_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
model=Model(name="open_clip/ViT-B-32/laion2b_s34b_b79k"),
normalize_embeddings=False,
distance_metric=DistanceMetric.DotProduct
)

unstructured_text_index_request_unnormalized = cls.unstructured_marqo_index_request(
name="unstructured_image_index_unnormalised" + str(uuid.uuid4()).replace('-', ''),
model=Model(name="hf/e5-base-v2"),
normalize_embeddings=False,
distance_metric=DistanceMetric.DotProduct
)

cls.indexes = cls.create_indexes([
structured_image_index_request,
structured_languagebind_index_request,
structured_image_index_request_unnormalized,
structured_text_index_request_unnormalized,

unstructured_image_index_request,
unstructured_languagebind_index_request
unstructured_languagebind_index_request,
unstructured_image_index_request_unnormalized,
unstructured_text_index_request_unnormalized
])

cls.structured_marqo_index_name = structured_image_index_request.name
cls.structured_languagebind_index_name = structured_languagebind_index_request.name
cls.structured_image_index_unnormalized_name = structured_image_index_request_unnormalized.name
cls.structured_text_index_unnormalized_name = structured_text_index_request_unnormalized.name

cls.unstructured_marqo_index_name = unstructured_image_index_request.name
cls.unstructured_languagebind_index_name = unstructured_languagebind_index_request.name
cls.unstructured_image_index_unnormalized_name = unstructured_image_index_request_unnormalized.name
cls.unstructured_text_index_unnormalized_name = unstructured_text_index_request_unnormalized.name

def setUp(self) -> None:
super().setUp()
Expand Down Expand Up @@ -807,4 +862,104 @@ def test_no_extension_image_url_infer_modality(self):
"""this test ensures that the image url with no extension is correctly inferred as an image"""
image_url_no_extension = "https://il.redbubble.net/catalogue/image/by-rb-work/157037551/simple-preview"
modality = infer_modality(image_url_no_extension)
self.assertEqual(modality, streaming_media_processor.Modality.IMAGE)
self.assertEqual(modality, streaming_media_processor.Modality.IMAGE)

def test_imageIndexEmbeddingsUnnormalised(self):
    """Test to ensure that the image embeddings are unnormalised when the index is unnormalised.

    Adds one image document to both the unstructured and the structured
    unnormalised image indexes, fetches it back with vectors, and asserts
    that the L2 norm of the first tensor facet's embedding is not 1.
    """
    documents = [
        {
            "image_field_1": TestImageUrls.HIPPO_REALISTIC.value,
            "_id": "1"
        }
    ]
    index_names = [
        self.unstructured_image_index_unnormalized_name,
        self.structured_image_index_unnormalized_name,
    ]
    for index_name in index_names:
        # tensor_fields is passed explicitly only for the unstructured index;
        # the structured index request declares its tensor fields at creation.
        tensor_fields = ["image_field_1"] if index_name == self.unstructured_image_index_unnormalized_name \
            else None
        with self.subTest(index_name):
            res = tensor_search.add_documents(
                self.config,
                add_docs_params=AddDocsParams(
                    docs=documents,
                    index_name=index_name,
                    tensor_fields=tensor_fields
                )
            )
            for item in res.dict(exclude_none=True, by_alias=True)['items']:
                self.assertEqual(200, item['status'])

            get_res = tensor_search.get_documents_by_ids(
                config=self.config, index_name=index_name,
                document_ids=["1"],
                show_vectors=True
            ).dict(exclude_none=True, by_alias=True)

            embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
            norm = np.linalg.norm(np.array(embeddings))
            # Compare the absolute deviation from 1: an unnormalised vector may
            # have a norm either above or below 1, and both must count.
            self.assertGreater(abs(norm - 1.0), 1e-5, f"Embedding norm is {norm}")

def test_imageIndexEmbeddingsNormalised(self):
    """Test to ensure that the image embeddings are normalised when the index is normalised.

    Adds one image document to both the unstructured and the structured
    image indexes, fetches it back with vectors, and asserts that the L2
    norm of the first tensor facet's embedding is 1 within a tolerance.
    """
    documents = [
        {
            "image_field_1": TestImageUrls.HIPPO_REALISTIC.value,
            "_id": "1"
        }
    ]
    # Cover both index types. Listing the unstructured index twice would leave
    # the structured index untested and the `else None` branch below dead.
    index_names = [
        self.unstructured_marqo_index_name,
        self.structured_marqo_index_name,
    ]
    for index_name in index_names:
        # tensor_fields is passed explicitly only for the unstructured index;
        # the structured index request declares its tensor fields at creation.
        tensor_fields = ["image_field_1"] if index_name == self.unstructured_marqo_index_name \
            else None
        with self.subTest(index_name):
            res = tensor_search.add_documents(
                self.config,
                add_docs_params=AddDocsParams(
                    docs=documents,
                    index_name=index_name,
                    tensor_fields=tensor_fields
                )
            )
            for item in res.dict(exclude_none=True, by_alias=True)['items']:
                self.assertEqual(200, item['status'])

            get_res = tensor_search.get_documents_by_ids(
                config=self.config, index_name=index_name,
                document_ids=["1"],
                show_vectors=True
            ).dict(exclude_none=True, by_alias=True)

            embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
            norm = np.linalg.norm(np.array(embeddings))
            # Symmetric tolerance check: a one-sided `norm - 1.0 < 1e-5` would
            # also pass for any norm below 1 (including a zero vector).
            self.assertAlmostEqual(float(norm), 1.0, delta=1e-5, msg=f"Embedding norm is {norm}")

def test_textIndexEmbeddingsUnnormalized(self):
    """A test to ensure that the text embeddings are unnormalised when the index is unnormalised.

    Adds one text document to both the unstructured and the structured
    unnormalised text indexes, fetches it back with vectors, and asserts
    that the L2 norm of the first tensor facet's embedding is not 1.
    """
    documents = [
        {
            "text_field_1": "This is a test text",
            "_id": "1"
        }
    ]
    index_names = [
        self.unstructured_text_index_unnormalized_name,
        self.structured_text_index_unnormalized_name,
    ]
    for index_name in index_names:
        # tensor_fields is passed explicitly only for the unstructured index;
        # the structured index request declares its tensor fields at creation.
        tensor_fields = ["text_field_1"] if index_name == self.unstructured_text_index_unnormalized_name \
            else None
        with self.subTest(index_name):
            res = tensor_search.add_documents(
                self.config,
                add_docs_params=AddDocsParams(
                    docs=documents,
                    index_name=index_name,
                    tensor_fields=tensor_fields
                )
            )
            for item in res.dict(exclude_none=True, by_alias=True)['items']:
                self.assertEqual(200, item['status'])

            get_res = tensor_search.get_documents_by_ids(
                config=self.config, index_name=index_name,
                document_ids=["1"],
                show_vectors=True
            ).dict(exclude_none=True, by_alias=True)

            embeddings = get_res['results'][0]['_tensor_facets'][0]['_embedding']
            norm = np.linalg.norm(np.array(embeddings))
            # Compare the absolute deviation from 1: an unnormalised vector may
            # have a norm either above or below 1, and both must count.
            self.assertGreater(abs(norm - 1.0), 1e-5, f"Embedding norm is {norm}")

0 comments on commit 1624d20

Please sign in to comment.