Merge pull request #1235 from vespa-engine/tmaregge/increase-RAM
Add more RAM for embedding-service node and add info
kkraune committed Jul 7, 2023
2 parents 3ad602d + 7644c69 commit 76db02f
Showing 2 changed files with 54 additions and 24 deletions.
46 changes: 31 additions & 15 deletions examples/embedding-service/README.md
@@ -10,12 +10,33 @@ component can be used to process HTTP requests.
In this application, a handler is used to implement an embedding service,
which takes a string as an input and returns a vector embedding of that string.

## Setup for Vespa Cloud deployment

### Cloud deployment

1. Create a new application in Vespa Cloud by following steps 1-4 in the [quick start guide](https://cloud.vespa.ai/en/getting-started)
2. Clone this repository: ``vespa clone examples/embedding-service embedding-service && cd embedding-service``
3. Download the models:
```
mkdir -p src/main/application/embedder-models/e5-small-v2
curl -o src/main/application/embedder-models/e5-small-v2/model.onnx \
https://data.vespa.oath.cloud/onnx_models/e5-small-v2/model.onnx
curl -o src/main/application/embedder-models/e5-small-v2/tokenizer.json \
https://data.vespa.oath.cloud/onnx_models/e5-small-v2/tokenizer.json
```
4. Add a public certificate: ``vespa auth cert``
5. Compile and deploy the application: ``mvn install && vespa deploy --wait 600``
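
Once step 5 completes, one quick sanity check is the Vespa CLI's status command (a sketch; it assumes the CLI is still configured for this application from the earlier steps):
```
vespa status
```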

### Enabling more embedders

By default, only the ``e5-small-v2`` embedder is enabled for cloud deployments.
Additional models are available and can be enabled easily, but be mindful of the increased memory consumption.
Check out ``services.xml`` for more information.
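
As a sketch, enabling one of them is just a matter of uncommenting its component block in ``services.xml``; for example, for ``e5-base-v2`` (the same block appears in the ``services.xml`` diff below):
```
<component id="e5-base-v2" type="hugging-face-embedder">
    <transformer-model model-id="e5-base-v2"/>
</component>
```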

## Setup for local deployment

1. Set up a Vespa Docker container by following steps 1-5 in the [quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html)
2. Clone this repository: ``vespa clone examples/embedding-service embedding-service && cd embedding-service``
3. Download the models:
```
mkdir -p src/main/application/embedder-models/e5-small-v2
@@ -24,19 +24,45 @@ curl -o src/main/application/embedder-models/e5-small-v2/tokenizer.json https://data.vespa.oath.cloud/onnx_models/e5-small-v2/tokenizer.json
```
4. Compile and deploy the application: ``mvn install && vespa deploy --wait 300``

### Adding more local embedders

More embedders from the [model hub](https://cloud.vespa.ai/en/model-hub) can be added
for local deployments, but this increases compile/deployment time.
To add a model, download its ``model.onnx`` and ``tokenizer.json`` files and add them
to a new subdirectory in ``src/main/application/embedder-models``.
Then, add it as a component in ``services.xml``.
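
For example, adding ``e5-base-v2`` for local use might look like the sketch below. The download URLs are an assumption based on the ``e5-small-v2`` URLs above; verify the actual locations on the model hub first:
```
mkdir -p src/main/application/embedder-models/e5-base-v2
curl -o src/main/application/embedder-models/e5-base-v2/model.onnx \
    https://data.vespa.oath.cloud/onnx_models/e5-base-v2/model.onnx
curl -o src/main/application/embedder-models/e5-base-v2/tokenizer.json \
    https://data.vespa.oath.cloud/onnx_models/e5-base-v2/tokenizer.json
```

The corresponding ``services.xml`` component would then reference the files by path, following the pattern of the ``e5-small-v2`` entry:
```
<component id="e5-base-v2" type="hugging-face-embedder">
    <transformer-model model-id="e5-base-v2" path="embedder-models/e5-base-v2/model.onnx"/>
    <tokenizer-model path="embedder-models/e5-base-v2/tokenizer.json"/>
</component>
```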

## Calling the embedding service

This sample application is a work in progress and currently has no GUI.
To interact with the application, send a POST request to the ``embedding`` endpoint
containing a JSON object that specifies the text to be encoded and the embedder to use.

Currently, only ``"e5-small-v2"`` is supported for local deployments.
If you're running the app in Vespa Cloud,
``"e5-base-v2"``, ``"e5-large-v2"``, ``"multilingual-e5-base"`` and ``"minilm-l6-v2"``
are also available.
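
For a local deployment, a plain ``curl`` against the container might look like the following sketch. The ``/embedding`` path, port 8080, and the ``text``/``embedder`` field names are assumptions based on the description above; check the handler implementation for the exact request schema:
```
curl -X POST --data-raw \
  '{"text": "Hello, world", "embedder": "e5-small-v2"}' \
  http://localhost:8080/embedding
```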


If you're using Vespa Cloud, you can use the ``vespa curl`` utility:

vespa curl -- -X POST --data-raw \
@@ -63,12 +86,5 @@ The output should look something like this in both cases:
"embedding":"tensor<float>(x[384]):[-0.5786399, 0.20775521, ...]"
}

32 changes: 23 additions & 9 deletions examples/embedding-service/src/main/application/services.xml
@@ -38,32 +38,43 @@

<!-- Embedders fetched from https://cloud.vespa.ai/en/model-hub -->

<!-- Available for local deployments and in Vespa Cloud -->
<component id="e5-small-v2" type="hugging-face-embedder">
<transformer-model model-id="e5-small-v2" path="embedder-models/e5-small-v2/model.onnx"/>
<tokenizer-model path="embedder-models/e5-small-v2/tokenizer.json"/>
</component>

<!--
The models below are only available in Vespa Cloud, unless you download the models locally (see instructions in README).
You can uncomment the blocks to enable the models, but please be mindful of the increased memory usage.
A brief test indicates that *at least* 12Gb of memory is consumed when all the models are enabled.
-->

<!--
<component id="e5-base-v2" type="hugging-face-embedder">
<transformer-model model-id="e5-base-v2"/>
</component>
-->

<!--
<component id="e5-large-v2" type="hugging-face-embedder">
<transformer-model model-id="e5-large-v2" />
<transformer-model model-id="e5-large-v2"/>
</component>
-->

<!--
<component id="multilingual-e5-base" type="hugging-face-embedder">
<transformer-model model-id="multilingual-e5-base" />
<transformer-model model-id="multilingual-e5-base"/>
</component>
-->

<!--
<component id="minilm-l6-v2" type="bert-embedder">
<transformer-model model-id="minilm-l6-v2" />
<tokenizer-vocab model-id="bert-base-uncased"/>
</component>
-->

<!--
<nodes> specifies the nodes that should run this cluster, and through the <resources>
@@ -74,7 +85,10 @@
- Reference: https://docs.vespa.ai/en/reference/services.html
-->
<nodes>
<node hostalias="node1" />
<node hostalias="node1"/>

<!-- Example of configuring more memory for larger models on Vespa Cloud -->
<resources vcpu="4.0" memory="16Gb"/>
</nodes>
</container>

