From 75673862514ed3ddd7ed3420d548918e22b85405 Mon Sep 17 00:00:00 2001
From: pandu-k <107458762+pandu-k@users.noreply.github.com>
Date: Thu, 6 Jul 2023 20:43:37 +1000
Subject: [PATCH] Removed references to processes, server_batch_size. Ensured
 add_documents calls have create_index() calls before them (#529)

---
 examples/GPT-examples/ironman.py                   | 2 +-
 examples/GPT3NewsSummary/README.md                 | 3 +++
 examples/GPT3NewsSummary/main.py                   | 3 +++
 examples/ImageSearchGuide/ImageSearchGuide.md      | 2 +-
 examples/ImageSearchGuide/imagesearchguide.ipynb   | 2 +-
 examples/ImageSearchLocalization/article.md        | 5 ++---
 examples/ImageSearchLocalization/index_all_data.py | 6 ++----
 examples/SimpleWiki/simple_wiki_demo.py            | 3 ---
 examples/SpeechProcessing/SpeechSearch/indexer.py  | 3 +--
 examples/SpeechProcessing/article/article.md       | 3 +--
 examples/StableDiffusion/hot-dog-100k.md           | 6 ++----
 examples/StableDiffusion/hot-dog-100k.py           | 8 +++-----
 examples/podcast-search/podcast_search_demo.py     | 3 ++-
 13 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/examples/GPT-examples/ironman.py b/examples/GPT-examples/ironman.py
index c07509f32..3cc5ed5d6 100644
--- a/examples/GPT-examples/ironman.py
+++ b/examples/GPT-examples/ironman.py
@@ -100,7 +100,7 @@
         }
     }
 
-    # create the index - if no settings are present then sensible deffaults are used
+    # create the index - if no settings are present then sensible defaults are used
     mq.create_index(index_name, settings_dict=index_settings)
 
     res = mq.index(index_name).add_documents(documents)
diff --git a/examples/GPT3NewsSummary/README.md b/examples/GPT3NewsSummary/README.md
index d3e1dad19..678c4819d 100644
--- a/examples/GPT3NewsSummary/README.md
+++ b/examples/GPT3NewsSummary/README.md
@@ -79,6 +79,9 @@ DOC_INDEX_NAME = ''news-index'
 print('Establishing connection to marqo client.')
 mq = marqo.Client(url='http://localhost:8882')
 
+print('creating a Marqo index')
+mq.create_index(DOC_INDEX_NAME)
+
 print('Indexing documents')
 mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS)
 ```
diff --git a/examples/GPT3NewsSummary/main.py b/examples/GPT3NewsSummary/main.py
index 9171a955e..acdf486ca 100644
--- a/examples/GPT3NewsSummary/main.py
+++ b/examples/GPT3NewsSummary/main.py
@@ -34,6 +34,9 @@
 except KeyboardInterrupt:
     raise
 except:
+    print('Creating index')
+    mq.create_index(DOC_INDEX_NAME)
+
     print('Indexing documents')
     mq.index(DOC_INDEX_NAME).add_documents(MARQO_DOCUMENTS)
     print('Done')
diff --git a/examples/ImageSearchGuide/ImageSearchGuide.md b/examples/ImageSearchGuide/ImageSearchGuide.md
index f3bcd38f3..c5cb84406 100644
--- a/examples/ImageSearchGuide/ImageSearchGuide.md
+++ b/examples/ImageSearchGuide/ImageSearchGuide.md
@@ -139,7 +139,7 @@
 output:
 Add the documents into the previously created index using function `add_documents()`
 ```python
-mq.index(index_name).add_documents(documents, device="cpu", processes=1, client_batch_size= 1)
+mq.index(index_name).add_documents(documents, device="cpu", client_batch_size= 1)
 ```
 ```python
 outputs:
diff --git a/examples/ImageSearchGuide/imagesearchguide.ipynb b/examples/ImageSearchGuide/imagesearchguide.ipynb
index bbe7488c4..137c201d5 100644
--- a/examples/ImageSearchGuide/imagesearchguide.ipynb
+++ b/examples/ImageSearchGuide/imagesearchguide.ipynb
@@ -245,7 +245,7 @@
     }
    ],
    "source": [
-    "mq.index(index_name).add_documents(documents, device=\"cpu\", processes=1, client_batch_size= 1)"
+    "mq.index(index_name).add_documents(documents, device=\"cpu\", client_batch_size= 1)"
    ]
   },
   {
diff --git a/examples/ImageSearchLocalization/article.md b/examples/ImageSearchLocalization/article.md
index 22bb4c433..7745180ad 100644
--- a/examples/ImageSearchLocalization/article.md
+++ b/examples/ImageSearchLocalization/article.md
@@ -150,9 +150,8 @@ for patch_method in patch_methods:
 
     response = client.create_index(index_name, settings_dict=settings)
 
-    # index the documents on the GPU using multiple processes
-    response = client.index(index_name).add_documents(documents, device='cuda',
-                                                      server_batch_size=50, processes=2)
+    # index the documents on the GPU
+    response = client.index(index_name).add_documents(documents, device='cuda', client_batch_size=50)
 
 ```
 If no GPU is available, set device='cpu'. 
diff --git a/examples/ImageSearchLocalization/index_all_data.py b/examples/ImageSearchLocalization/index_all_data.py
index c06c454e8..10a2c7607 100644
--- a/examples/ImageSearchLocalization/index_all_data.py
+++ b/examples/ImageSearchLocalization/index_all_data.py
@@ -44,8 +44,7 @@
 index_name_prefix = "visual-search"
 patch_methods = ["dino/v1", None, "yolox"] #["dino/v1", "dino/v2", "frcnn", None, "yolox"]
 model_name = "ViT-B/32"
-n_processes = 3
-batch_size = 50
+batch_size = 24
 
 # set this to false if you do not want to delete the previous index of the same name
 delete_index = True
@@ -81,5 +80,4 @@
 
     response = client.create_index(index_name, settings_dict=settings)
 
-    response = client.index(index_name).add_documents(documents, device='cuda',
-                                                      server_batch_size=batch_size, processes=n_processes)
\ No newline at end of file
+    response = client.index(index_name).add_documents(documents, device='cuda', client_batch_size=batch_size)
\ No newline at end of file
diff --git a/examples/SimpleWiki/simple_wiki_demo.py b/examples/SimpleWiki/simple_wiki_demo.py
index 9f46f82e2..8b41259c3 100644
--- a/examples/SimpleWiki/simple_wiki_demo.py
+++ b/examples/SimpleWiki/simple_wiki_demo.py
@@ -89,9 +89,6 @@ def split_big_docs(data, field='content', char_len=5e4):
 
 device = 'cpu'
 
-# here we use parallel indexing to speed up the task
-# Note: to use multiprocessing you will want at least 8GB of RAM and the maximum number
-# of processes that can be supported will be system dependent.
 responses = client.index(index_name).add_documents(data, device=device, client_batch_size=20)
 
 # optionally take a look at the responses
diff --git a/examples/SpeechProcessing/SpeechSearch/indexer.py b/examples/SpeechProcessing/SpeechSearch/indexer.py
index 4357bb63a..ddf88e4d4 100644
--- a/examples/SpeechProcessing/SpeechSearch/indexer.py
+++ b/examples/SpeechProcessing/SpeechSearch/indexer.py
@@ -24,8 +24,7 @@ def index_transciptions(
         annotated_transcriptions,
         non_tensor_fields=non_tensor_fields,
         device=device,
-        client_batch_size=batch_size,
-        server_batch_size=batch_size,
+        client_batch_size=batch_size
     )
     return response
 
diff --git a/examples/SpeechProcessing/article/article.md b/examples/SpeechProcessing/article/article.md
index 3290d3e76..28862eab1 100644
--- a/examples/SpeechProcessing/article/article.md
+++ b/examples/SpeechProcessing/article/article.md
@@ -316,8 +316,7 @@ def index_transciptions(
         annotated_transcriptions,
         non_tensor_fields=non_tensor_fields,
         device=device,
-        client_batch_size=batch_size,
-        server_batch_size=batch_size,
+        client_batch_size=batch_size
     )
     return response
 
diff --git a/examples/StableDiffusion/hot-dog-100k.md b/examples/StableDiffusion/hot-dog-100k.md
index e3409f0eb..c2fe050da 100644
--- a/examples/StableDiffusion/hot-dog-100k.md
+++ b/examples/StableDiffusion/hot-dog-100k.md
@@ -66,8 +66,7 @@
 settings = {
     "treat_urls_and_pointers_as_images": True,
 }
 client.create_index("hot-dogs-100k", **settings)
-responses = client.index("hot-dogs-100k").add_documents(documents,
-                                                         device="cuda", processes=4, batch_size=50)
+responses = client.index("hot-dogs-100k").add_documents(documents, device="cuda", client_batch_size=50)
 ```
 Check we have our images in the index:
@@ -163,8 +162,7 @@
 We have now calculated scores for the different categories described previously.
 ```python
 documents_image_docker = [doc.pop('image_docker') for doc in documents]
-responses = client.index("hot-dogs-100k").add_documents(documents, device='cpu',
-                                                         processes=3, batch_size=50)
+responses = client.index("hot-dogs-100k").add_documents(documents, device='cpu', client_batch_size=50)
 ```
 
 ## Animating the hot-dog 100k dataset
diff --git a/examples/StableDiffusion/hot-dog-100k.py b/examples/StableDiffusion/hot-dog-100k.py
index 18fcd05b7..b97d6a588 100644
--- a/examples/StableDiffusion/hot-dog-100k.py
+++ b/examples/StableDiffusion/hot-dog-100k.py
@@ -58,9 +58,8 @@
 }
 
 client.create_index(index_name, **settings)
 
-# here we use parallel indexing to speed up the task - a gpu is recomended (device='cuda')
-responses = client.index(index_name).add_documents(documents, device='cpu'
-                                                   , processes=4, batch_size=50)
+# Here we index. A gpu is recommended (device='cuda')
+responses = client.index(index_name).add_documents(documents, device='cpu', client_batch_size=50)
 
 #####################################################
@@ -101,8 +100,7 @@
     doc[lab.replace(' ','_')] = [r['_score'] for r in responses['hits'] if r['label'] == lab][0]
 
 documents_image_docker = [doc.pop('image_docker') for doc in documents]
-responses = client.index("hot-dogs-100k").add_documents(documents, device='cpu',
-                                                        processes=3, batch_size=50)
+responses = client.index("hot-dogs-100k").add_documents(documents, device='cpu', client_batch_size=50)
 
 #####################################################
 ### Step 4. Remove the black images
diff --git a/examples/podcast-search/podcast_search_demo.py b/examples/podcast-search/podcast_search_demo.py
index c7293f7d0..391633639 100644
--- a/examples/podcast-search/podcast_search_demo.py
+++ b/examples/podcast-search/podcast_search_demo.py
@@ -46,7 +46,8 @@ def load_data(file: str, number_data: int) -> dict:
 ####################################################
 index_name = "marqo-podcast-search-demo"
 mq = marqo.Client(url='http://localhost:8882') # Connection to Marqo Docker Container
-mq.index(index_name).add_documents(podcast_data) # If the index doesn't exist, Marqo will create it
+mq.create_index(index_name)
+mq.index(index_name).add_documents(podcast_data)
 stats = mq.index(index_name).get_stats() # get the stats for the index
 
 print(f"{stats['numberOfDocuments']} documents added to index: {index_name}")
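
For reference, a minimal sketch of the indexing pattern this patch standardises on across the examples: create the index explicitly before calling `add_documents`, and batch on the client with `client_batch_size` rather than the removed `server_batch_size`/`processes` arguments. The index name and documents below are hypothetical placeholders; the client calls mirror the ones used in the patch and assume a local Marqo instance at http://localhost:8882.

```python
import marqo

# Assumes a Marqo container is running locally, as in the examples above.
mq = marqo.Client(url="http://localhost:8882")

index_name = "demo-index"  # hypothetical index name, not from the patch

# Create the index up front - if no settings are given, sensible defaults are used.
mq.create_index(index_name)

# Placeholder documents purely for illustration.
documents = [
    {"title": "First document", "content": "Some text to index."},
    {"title": "Second document", "content": "More text to index."},
]

# Index with client-side batching; set device="cuda" if a GPU is available.
responses = mq.index(index_name).add_documents(
    documents,
    device="cpu",
    client_batch_size=2,
)
print(responses)
```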