Skip to content

Commit

Permalink
Merge branch 'mainline' into joshua/2.11.4-client-2
Browse files Browse the repository at this point in the history
  • Loading branch information
vicilliar authored Sep 16, 2024
2 parents 0cbc50e + 8035382 commit f533c0b
Show file tree
Hide file tree
Showing 12 changed files with 320 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/open-source-unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ jobs:
docker pull ${{ steps.prepare.outputs.registry }}/${{ steps.prepare.outputs.image_repo }}:${{ steps.prepare.outputs.image_tag }}
docker run --name marqo -d --privileged -p 8882:8882 --add-host host.docker.internal:host-gateway \
-e MARQO_ENABLE_BATCH_APIS=True \
-e MARQO_MAX_CUDA_MODEL_MEMORY=15 \
-e MARQO_MAX_CPU_MODEL_MEMORY=15 \
${{ steps.prepare.outputs.registry }}/${{ steps.prepare.outputs.image_repo }}:${{ steps.prepare.outputs.image_tag }}
# wait for marqo to start with timeout of 10 minutes
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"tox"
],
name="marqo",
version="3.7.0",
version="3.8.0",
author="marqo org",
author_email="org@marqo.io",
description="Tensor search for humans",
Expand Down
7 changes: 7 additions & 0 deletions src/marqo/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def create_index(
type: Optional[marqo_index.IndexType] = None,
settings_dict: Optional[Dict[str, Any]] = None,
treat_urls_and_pointers_as_images: Optional[bool] = None,
treat_urls_and_pointers_as_media: Optional[bool] = None,
filter_string_max_length: Optional[int] = None,
all_fields: Optional[List[marqo_index.FieldRequest]] = None,
tensor_fields: Optional[List[str]] = None,
Expand All @@ -77,6 +78,8 @@ def create_index(
normalize_embeddings: Optional[bool] = None,
text_preprocessing: Optional[marqo_index.TextPreProcessing] = None,
image_preprocessing: Optional[marqo_index.ImagePreProcessing] = None,
audio_preprocessing: Optional[marqo_index.AudioPreProcessing] = None,
video_preprocessing: Optional[marqo_index.VideoPreProcessing] = None,
vector_numeric_type: Optional[marqo_index.VectorNumericType] = None,
ann_parameters: Optional[marqo_index.AnnParameters] = None,
wait_for_readiness: bool = True,
Expand All @@ -100,6 +103,7 @@ def create_index(
parameters, and is passed directly as the index's
index_settings
treat_urls_and_pointers_as_images: whether to treat urls and pointers as images
treat_urls_and_pointers_as_media: whether to treat urls and pointers as media (video/audio)
filter_string_max_length: threshold for short string length in unstructured indexes,
Marqo can filter on short strings but can not filter on long strings
all_fields: list of all the fields in the structured index
Expand Down Expand Up @@ -132,12 +136,15 @@ def create_index(
config=self.config, index_name=index_name,
type=type, settings_dict=settings_dict,
treat_urls_and_pointers_as_images=treat_urls_and_pointers_as_images,
treat_urls_and_pointers_as_media=treat_urls_and_pointers_as_media,
filter_string_max_length=filter_string_max_length,
all_fields=all_fields, tensor_fields=tensor_fields,
model=model, model_properties=model_properties,
normalize_embeddings=normalize_embeddings,
text_preprocessing=text_preprocessing,
image_preprocessing=image_preprocessing,
audio_preprocessing=audio_preprocessing,
video_preprocessing=video_preprocessing,
vector_numeric_type=vector_numeric_type,
ann_parameters=ann_parameters,
wait_for_readiness=wait_for_readiness,
Expand Down
10 changes: 10 additions & 0 deletions src/marqo/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def create(config: Config,
type: Optional[marqo_index.IndexType] = None,
settings_dict: Optional[Dict[str, Any]] = None,
treat_urls_and_pointers_as_images: Optional[bool] = None,
treat_urls_and_pointers_as_media: Optional[bool] = None,
filter_string_max_length: Optional[int] = None,
all_fields: Optional[List[marqo_index.FieldRequest]] = None,
tensor_fields: Optional[List[str]] = None,
Expand All @@ -88,6 +89,8 @@ def create(config: Config,
normalize_embeddings: Optional[bool] = None,
text_preprocessing: Optional[marqo_index.TextPreProcessing] = None,
image_preprocessing: Optional[marqo_index.ImagePreProcessing] = None,
audio_preprocessing: Optional[marqo_index.AudioPreProcessing] = None,
video_preprocessing: Optional[marqo_index.VideoPreProcessing] = None,
vector_numeric_type: Optional[marqo_index.VectorNumericType] = None,
ann_parameters: Optional[marqo_index.AnnParameters] = None,
inference_type: Optional[str] = None,
Expand All @@ -114,6 +117,7 @@ def create(config: Config,
parameters, and is passed directly as the index's
index_settings
treat_urls_and_pointers_as_images: whether to treat urls and pointers as images in unstructured indexes
treat_urls_and_pointers_as_media: whether to treat urls and pointers as media (video/audio) in unstructured indexes
filter_string_max_length: threshold for short string length in unstructured indexes,
Marqo can filter on short strings but can not filter on long strings
all_fields: list of fields in the structured index
Expand Down Expand Up @@ -148,13 +152,16 @@ def create(config: Config,
allFields=all_fields,
settingsDict=settings_dict,
treatUrlsAndPointersAsImages=treat_urls_and_pointers_as_images,
treatUrlsAndPointersAsMedia=treat_urls_and_pointers_as_media,
filterStringMaxLength=filter_string_max_length,
tensorFields=tensor_fields,
model=model,
modelProperties=model_properties,
normalizeEmbeddings=normalize_embeddings,
textPreprocessing=text_preprocessing,
imagePreprocessing=image_preprocessing,
audioPreprocessing=audio_preprocessing,
videoPreprocessing=video_preprocessing,
vectorNumericType=vector_numeric_type,
annParameters=ann_parameters,
textChunkPrefix=text_chunk_prefix,
Expand All @@ -170,13 +177,16 @@ def create(config: Config,
allFields=all_fields,
settingsDict=settings_dict,
treatUrlsAndPointersAsImages=treat_urls_and_pointers_as_images,
treatUrlsAndPointersAsMedia=treat_urls_and_pointers_as_media,
filterStringMaxLength=filter_string_max_length,
tensorFields=tensor_fields,
model=model,
modelProperties=model_properties,
normalizeEmbeddings=normalize_embeddings,
textPreprocessing=text_preprocessing,
imagePreprocessing=image_preprocessing,
audioPreprocessing=audio_preprocessing,
videoPreprocessing=video_preprocessing,
vectorNumericType=vector_numeric_type,
annParameters=ann_parameters,
numberOfInferences=number_of_inferences,
Expand Down
7 changes: 6 additions & 1 deletion src/marqo/models/create_index_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ class IndexSettings(MarqoBaseModel):
Can not be specified with other parameters.
tensorFields: A list of all tensor fields in the index.
treatUrlsAndPointersAsImages: Whether to treat urls and pointers as images.
This unstructured index only parameter.
This is an unstructured index only parameter.
treatUrlsAndPointersAsMedia: Whether to treat urls and pointers as media (video/audio).
This is an unstructured index only parameter.
filterStringMaxLength: The max length of the filter string in unstructured index
model: The name of the model to use for the index.
modelProperties: A dictionary of model properties.
Expand All @@ -34,12 +36,15 @@ class IndexSettings(MarqoBaseModel):
settingsDict: Optional[Dict] = None
tensorFields: Optional[List[str]] = None
treatUrlsAndPointersAsImages: Optional[bool] = None
treatUrlsAndPointersAsMedia: Optional[bool] = None
filterStringMaxLength: Optional[int] = None
model: Optional[str] = None
modelProperties: Optional[Dict[str, Any]] = None
normalizeEmbeddings: Optional[bool] = None
textPreprocessing: Optional[marqo_index.TextPreProcessing] = None
imagePreprocessing: Optional[marqo_index.ImagePreProcessing] = None
audioPreprocessing: Optional[marqo_index.AudioPreProcessing] = None
videoPreprocessing: Optional[marqo_index.VideoPreProcessing] = None
vectorNumericType: Optional[marqo_index.VectorNumericType] = None
annParameters: Optional[marqo_index.AnnParameters] = None
textQueryPrefix: Optional[str] = None
Expand Down
10 changes: 10 additions & 0 deletions src/marqo/models/marqo_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class FieldType(str, Enum):
ArrayFloat = 'array<float>'
ArrayDouble = 'array<double>'
ImagePointer = 'image_pointer'
VideoPointer = 'video_pointer'
AudioPointer = 'audio_pointer'
MultimodalCombination = 'multimodal_combination'
CustomVector = "custom_vector"
MapInt = 'map<text, int>'
Expand Down Expand Up @@ -77,6 +79,14 @@ class TextPreProcessing(StrictBaseModel):
class ImagePreProcessing(StrictBaseModel):
    # Image preprocessing options that can be supplied in index settings.
    # patchMethod is optional (None when omitted) and is declared with the
    # alias "patch_method".
    patchMethod: Optional[PatchMethod] = Field(default=None, alias="patch_method")

class VideoPreProcessing(StrictBaseModel):
    # Video preprocessing options that can be supplied in index settings.
    # Both values are optional integers (None when omitted) and are declared
    # with snake_case aliases.
    # NOTE(review): the unit of the split values is not visible here --
    # presumably seconds; confirm against the Marqo documentation.
    splitLength: Optional[int] = Field(default=None, alias="split_length")
    splitOverlap: Optional[int] = Field(default=None, alias="split_overlap")

class AudioPreProcessing(StrictBaseModel):
    # Audio preprocessing options that can be supplied in index settings.
    # Both values are optional integers (None when omitted) and are declared
    # with snake_case aliases.
    # NOTE(review): the unit of the split values is not visible here --
    # presumably seconds; confirm against the Marqo documentation.
    splitLength: Optional[int] = Field(default=None, alias="split_length")
    splitOverlap: Optional[int] = Field(default=None, alias="split_overlap")


class Model(StrictBaseModel):
name: Optional[str] = None
Expand Down
2 changes: 1 addition & 1 deletion src/marqo/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__minimum_supported_marqo_version__ = "2.10.0"
__minimum_supported_marqo_version__ = "2.12.0"

# NOTE: This isn't used anywhere
def supported_marqo_version() -> str:
Expand Down
31 changes: 28 additions & 3 deletions tests/cloud_test_logic/cloud_test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ class CloudTestIndex(str, Enum):
Please try to keep names short to avoid hitting name-length limits
We create 3 unstructured indexes and 3 structured indexes to test:
1) unstructured_text: Text-only index using hf/e5-base-v2, 2 shards, 1 replica, CPU, balanced storage, for hybrid duplicates testing.
2) unstructured_image: Image-compatible index using open_clip/ViT-B-32/laion2b_s34b_b79k, 1 shard, no replicas, CPU, basic storage.
3) unstructured_no_model: 512-dimension custom vectors, 1 shard, no replicas, CPU, basic storage.
4) structured_text: Structured text index with hf/e5-base-v2, lexical search, 2 shards, 1 replica, CPU, balanced storage.
5) structured_image: Structured image-text index with open_clip/ViT-B-32, 2 shards, 1 replica, CPU, balanced storage, with image preprocessing.
6) structured_languagebind_model: a structured index using the LanguageBind model for multi-modal support.
For more information on the settings of each index, please refer to index_name_to_settings_mappings.
FOR CLOUD REPLICAS AND SHARDS:
Expand All @@ -21,6 +23,7 @@ class CloudTestIndex(str, Enum):
We design these indexes to maximize the coverage of different settings and features. For each test method,
we will have to manually specify which index to use.
"""

unstructured_text = "pymarqo_unstr_txt"
Expand All @@ -32,6 +35,7 @@ class CloudTestIndex(str, Enum):
structured_image_custom = "pymarqo_str_img_custom"
structured_text = "pymarqo_str_txt"
structured_image = "pymarqo_str_img"
structured_languagebind_model = "pymarqo_str_langbind_model"


index_name_to_settings_mappings = {
Expand Down Expand Up @@ -80,11 +84,10 @@ class CloudTestIndex(str, Enum):
{"name": "int_field_1", "type": "int", "features": ["score_modifier"]},
{"name": "int_filter_field_1", "type": "int", "features": ["filter", "score_modifier"]}],
"tensorFields": ["text_field_1", "text_field_2", "text_field_3"],

"inferenceType": "marqo.CPU.small",
"storageClass": "marqo.balanced",
"numberOfShards": 2,
"numberOfReplicas": 1, # For hybrid duplicates test
"numberOfReplicas": 1, # For hybrid duplicates test
},
CloudTestIndex.structured_image: {
"type": "structured",
Expand All @@ -110,5 +113,27 @@ class CloudTestIndex(str, Enum):
"imagePreprocessing": {
"patchMethod": "simple",
}
}
},
CloudTestIndex.structured_languagebind_model: {
"type": "structured",
"model": "LanguageBind/Video_V1.5_FT_Audio_FT_Image",
"inferenceType": "marqo.GPU",
"storageClass": "marqo.balanced",
"allFields": [
{"name": "text_field_1", "type": "text"},
{"name": "text_field_2", "type": "text"},
{"name": "text_field_3", "type": "text"},
{"name": "video_field_1", "type": "video_pointer"},
{"name": "video_field_2", "type": "video_pointer"},
{"name": "video_field_3", "type": "video_pointer"},
{"name": "audio_field_1", "type": "audio_pointer"},
{"name": "audio_field_2", "type": "audio_pointer"},
{"name": "image_field_1", "type": "image_pointer"},
{"name": "image_field_2", "type": "image_pointer"},
{"name": "multimodal_field", "type": "multimodal_combination"},
],
"tensorFields": ["multimodal_field", "text_field_3", "video_field_3", "audio_field_2", "image_field_2"],
"normalizeEmbeddings": True,
},

}
35 changes: 35 additions & 0 deletions tests/marqo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,10 +209,15 @@ def setUpClass(cls) -> None:
cls.unstructured_no_model_index_name = "unstructured_no_model_index"
cls.structured_image_index_name_simple_preprocessing_method = \
"structured_image_index_simple_preprocessing_method"
cls.structured_languagebind_index_name = "structured_languagebind_index"

# TODO: include structured when boolean_field bug for structured is fixed
cls.test_cases = [
(CloudTestIndex.unstructured_image, cls.unstructured_index_name),
]
cls.test_cases_multimodal = [
(CloudTestIndex.structured_languagebind_model, cls.structured_languagebind_index_name)
]

# class property to indicate whether the tests are being run against a multi-instance setup
cls.IS_MULTI_INSTANCE = (True if os.environ.get("IS_MULTI_INSTANCE", False) in ["True", "TRUE", "true", True] else False)
Expand Down Expand Up @@ -262,6 +267,36 @@ def setUpClass(cls) -> None:
"type": "no_model",
"dimensions": 512
}
},
{
"indexName": cls.structured_languagebind_index_name,
"type": "structured",
"model": "LanguageBind/Video_V1.5_FT_Audio_FT_Image",
"allFields": [
{"name": "text_field_1", "type": "text"},
{"name": "text_field_2", "type": "text"},
{"name": "text_field_3", "type": "text"},
{"name": "video_field_1", "type": "video_pointer"},
{"name": "video_field_2", "type": "video_pointer"},
{"name": "video_field_3", "type": "video_pointer"},
{"name": "audio_field_1", "type": "audio_pointer"},
{"name": "audio_field_2", "type": "audio_pointer"},
{"name": "image_field_1", "type": "image_pointer"},
{"name": "image_field_2", "type": "image_pointer"},
{
"name": "multimodal_field",
"type": "multimodal_combination",
"dependentFields": {
"text_field_1": 0.1,
"text_field_2": 0.1,
"image_field_1": 0.5,
"video_field_1": 0.1,
"video_field_2": 0.1,
"audio_field_1": 0.1
}
},
],
"tensorFields": ["multimodal_field", "text_field_3", "video_field_3", "audio_field_2", "image_field_2"]
}
])
except Exception as e:
Expand Down
Loading

0 comments on commit f533c0b

Please sign in to comment.