
Commit 88c4696

Merge pull request #1227 from vespa-engine/tmaregge/cloud-and-local-support-embedding-service

Tmaregge/cloud and local support embedding service

bratseth committed Jun 25, 2023
2 parents 91c5352 + b04f01d commit 88c4696
Showing 4 changed files with 88 additions and 20 deletions.
2 changes: 1 addition & 1 deletion examples/embedding-service/.gitignore
@@ -4,5 +4,5 @@ target/
src/main/application/security/
*.pem

-src/main/application/models/
+src/main/application/embedder-models/
src/main/application/models.generated
44 changes: 34 additions & 10 deletions examples/embedding-service/README.md
@@ -10,15 +10,17 @@ component can be used to process HTTP requests.
In this application, a handler is used to implement an embedding service,
which takes a string as input and returns a vector embedding of that string.

+## TODO Setup for Vespa Cloud deployment
+
## Setup for local deployment

1. Set up a Vespa Docker container by following steps 1-5 in the [quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html)
2. Clone this repository: ``vespa clone examples/embedding-service embedding-service``
3. Download the models:
```
-mkdir -p src/main/application/models
-wget -P src/main/application/models https://data.vespa.oath.cloud/onnx_models/e5-small-v2/model.onnx
-wget -P src/main/application/models https://data.vespa.oath.cloud/onnx_models/e5-small-v2/tokenizer.json
+mkdir -p src/main/application/embedder-models/e5-small-v2
+curl -o src/main/application/embedder-models/e5-small-v2/model.onnx https://data.vespa.oath.cloud/onnx_models/e5-small-v2/model.onnx
+curl -o src/main/application/embedder-models/e5-small-v2/tokenizer.json https://data.vespa.oath.cloud/onnx_models/e5-small-v2/tokenizer.json
```
4. Compile and deploy the application: ``mvn install && vespa deploy --wait 300``

@@ -28,23 +30,45 @@ This sample application is a work in progress.
Currently, it has no GUI.
To interact with the application, you need to send a POST request to the ``embedding`` endpoint,
containing a JSON object that specifies the text to be encoded and the embedder to use.
-Currently, only ``"hugging-face-embedder"`` is supported.

-Here is a simple example using cURL:
+Currently, only ``"e5-small-v2"`` is supported for local deployments.
+If you're running the app in Vespa Cloud,
+``"e5-base-v2"``, ``"e5-large-v2"``, ``"multilingual-e5-base"``, and ``"minilm-l6-v2"``
+are also available.


+If you're using Vespa Cloud, you can use the ``vespa curl`` utility:

+vespa curl -- -X POST --data-raw \
+'{
+  "text": "text to embed",
+  "embedder": "e5-small-v2"
+}' \
+/embedding

+If you're running the app locally, you can use normal ``curl``:

curl 'http://127.0.0.1:8080/embedding' \
--X POST --data-raw $'{ \
-"text": "text to embed", \
-"embedder": "hugging-face-embedder" \
+-X POST --data-raw \
+'{
+  "text": "text to embed",
+  "embedder": "e5-small-v2"
}'

-The output should look something like this:
+The output should look something like this in both cases:

{
"embedder":"hugging-face-embedder",
"embedder":"e5-small-v2",
"text":"text to embed",
"embedding":"tensor<float>(x[384]):[-0.5786399, 0.20775521, ...]"
}
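
For reference, here is a minimal Java sketch of the same request against a local deployment, using only the JDK's built-in ``java.net.http`` client (Java 11+). The endpoint and payload come from the curl example above; the ``EmbeddingClient`` class name is a placeholder:

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Minimal sketch, assuming a local deployment listening on 127.0.0.1:8080.
public class EmbeddingClient {
    public static void main(String[] args) throws Exception {
        // Same JSON payload as the curl example above.
        String body = "{\"text\": \"text to embed\", \"embedder\": \"e5-small-v2\"}";
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://127.0.0.1:8080/embedding"))
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.body()); // JSON with "embedder", "text", and "embedding" fields
    }
}
```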

+## Adding more local embedders
+
+More embedders from the [model hub](https://cloud.vespa.ai/en/model-hub) can be added
+for local deployments, but this increases compile and deployment time.
+To add a model, download its ``model.onnx`` and ``tokenizer.json`` files and add them
+to a new subdirectory in ``src/main/application/embedder-models``.
+Then, add it as a component in ``services.xml``, as sketched below.
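
As a rough sketch (``my-embedder`` is a hypothetical placeholder for the model's directory and component id), such a component definition follows the same pattern as the ``e5-small-v2`` entry in the ``services.xml`` diff below:

```xml
<!-- Hypothetical example: replace "my-embedder" with the real model name -->
<component id="my-embedder" type="hugging-face-embedder">
    <transformer-model path="embedder-models/my-embedder/model.onnx"/>
    <tokenizer-model path="embedder-models/my-embedder/tokenizer.json"/>
</component>
```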


32 changes: 24 additions & 8 deletions examples/embedding-service/src/main/application/services.xml
@@ -36,18 +36,34 @@
<binding>http://*/embedding</binding>
</handler>

<component id="hugging-face-embedder" type="hugging-face-embedder">
<transformer-model path="models/model.onnx"/>
<tokenizer-model path="models/tokenizer.json"/>
+<!-- Embedders fetched from https://cloud.vespa.ai/en/model-hub -->
+
+<!-- Available locally and in Vespa Cloud -->
+<component id="e5-small-v2" type="hugging-face-embedder">
+    <transformer-model
+        path="embedder-models/e5-small-v2/model.onnx"
+        model-id="e5-small-v2"
+    />
+    <tokenizer-model path="embedder-models/e5-small-v2/tokenizer.json"/>
</component>
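<!-- When both path and model-id are specified, self-hosted deployments use the local path, while Vespa Cloud resolves model-id -->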

-<!-- TODO Add more components
-<component id="another-embedder" type="hugging-face-embedder">
-    <transformer-model path="models/model.onnx"/>
-    <tokenizer-model path="models/tokenizer.json"/>
+<!-- Available only in Vespa Cloud -->
+<component id="e5-base-v2" type="hugging-face-embedder">
+    <transformer-model model-id="e5-base-v2"/>
+</component>

<component id="e5-large-v2" type="hugging-face-embedder">
<transformer-model model-id="e5-large-v2" />
</component>
-->

<component id="multilingual-e5-base" type="hugging-face-embedder">
<transformer-model model-id="multilingual-e5-base" />
</component>

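<!-- Unlike the hugging-face-embedder components above, the bert-embedder type takes a vocabulary file (tokenizer-vocab) rather than a tokenizer.json -->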
<component id="minilm-l6-v2" type="bert-embedder">
<transformer-model model-id="minilm-l6-v2" />
<tokenizer-vocab model-id="bert-base-uncased"/>
</component>

<!--
<nodes> specifies the nodes that should run this cluster, and through the <resources>
30 changes: 29 additions & 1 deletion EmbeddingHandler.java
@@ -12,12 +12,22 @@

import java.io.IOException;
import java.io.OutputStream;
+import java.util.Map;
import java.util.concurrent.Executor;

public class EmbeddingHandler extends ThreadedHttpRequestHandler {
private ComponentRegistry<Embedder> availableEmbedders;
private ObjectMapper jsonMapper;

+    // Mappings fetched from https://cloud.vespa.ai/en/model-hub#using-e5-models
+    private final Map<String, TensorType> modelTensorTypeMap = Map.of(
+            "e5-small-v2", TensorType.fromSpec("tensor<float>(x[384])"),
+            "e5-base-v2", TensorType.fromSpec("tensor<float>(x[768])"),
+            "e5-large-v2", TensorType.fromSpec("tensor<float>(x[1024])"),
+            "multilingual-e5-base", TensorType.fromSpec("tensor<float>(x[768])"),
+            "minilm-l6-v2", TensorType.fromSpec("tensor<float>(x[384])")
+    );

@Inject
public EmbeddingHandler(Executor executor, ComponentRegistry<Embedder> embedders) {
super(executor);
@@ -30,8 +40,26 @@ public HttpResponse handle(HttpRequest httpRequest) {
Data requestData = parseRequestJson(httpRequest);

Embedder embedder = availableEmbedders.getComponent(requestData.embedder());
+        if (embedder == null) {
+            return new HttpResponse(400) {
+                @Override
+                public void render(OutputStream outputStream) throws IOException {
+                    outputStream.write(jsonMapper.writeValueAsBytes(Map.of("error", "Embedder '" + requestData.embedder() + "' not found")));
+                }
+            };
+        }

+        TensorType type = modelTensorTypeMap.get(requestData.embedder());
+        if (type == null) {
+            return new HttpResponse(400) {
+                @Override
+                public void render(OutputStream outputStream) throws IOException {
+                    outputStream.write(jsonMapper.writeValueAsBytes(Map.of("error", "TensorType for embedder '" + requestData.embedder() + "' not found")));
+                }
+            };
+        }

Embedder.Context context = new Embedder.Context("");
-        TensorType type = TensorType.fromSpec("tensor<float>(x[384])");
String embedding = embedder.embed(requestData.text(), context, type).toString();

Data responseData = new Data(requestData.text(), requestData.embedder(), embedding);
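// For reference, the Data type used here is not shown in this diff; from its usage it is
// assumed to be a simple record along the lines of:
//     record Data(String text, String embedder, String embedding) { }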
