
Commit 88c4696

Merge pull request #1227 from vespa-engine/tmaregge/cloud-and-local-support-embedding-service

Tmaregge/cloud and local support embedding service

bratseth committed Jun 25, 2023
2 parents 91c5352 + b04f01d commit 88c4696
Showing 4 changed files with 88 additions and 20 deletions.
2 changes: 1 addition & 1 deletion examples/embedding-service/.gitignore
@@ -4,5 +4,5 @@ target/
src/main/application/security/
*.pem

-src/main/application/models/
+src/main/application/embedder-models/
src/main/application/models.generated
44 changes: 34 additions & 10 deletions examples/embedding-service/README.md
@@ -10,15 +10,17 @@ component can be used to process HTTP requests.
In this application, a handler is used to implement an embedding service,
which takes a string as input and returns a vector embedding of that string.

+## TODO Setup for Vespa Cloud deployment
+
## Setup for local deployment

1. Set up a Vespa Docker container by following steps 1-5 in the [quick start guide](https://docs.vespa.ai/en/vespa-quick-start.html)
2. Clone this repository: ``vespa clone examples/embedding-service embedding-service``
3. Download the models:
```
-mkdir -p src/main/application/models
-wget -P src/main/application/models https://data.vespa.oath.cloud/onnx_models/e5-small-v2/model.onnx
-wget -P src/main/application/models https://data.vespa.oath.cloud/onnx_models/e5-small-v2/tokenizer.json
+mkdir -p src/main/application/embedder-models/e5-small-v2
+curl -o src/main/application/embedder-models/e5-small-v2/model.onnx https://data.vespa.oath.cloud/onnx_models/e5-small-v2/model.onnx
+curl -o src/main/application/embedder-models/e5-small-v2/tokenizer.json https://data.vespa.oath.cloud/onnx_models/e5-small-v2/tokenizer.json
```
4. Compile and deploy the application: ``mvn install && vespa deploy --wait 300``

@@ -28,23 +30,45 @@ This sample application is a work in progress.
Currently, it has no GUI.
To interact with the application, you need to send a POST request to the ``embedding`` endpoint,
containing a JSON object that specifies the text to be encoded and the embedder to use.
-Currently, only ``"hugging-face-embedder"`` is supported.

-Here is a simple example using cURL:
+Currently, only ``"e5-small-v2"`` is supported for local deployments.
+If you're running the app in Vespa Cloud,
+``"e5-base-v2"``, ``"e5-large-v2"``, ``"multilingual-e5-base"``, and ``"minilm-l6-v2"``
+are also available.


+If you're using Vespa Cloud, you can use the ``vespa curl`` utility:

+vespa curl -- -X POST --data-raw \
+'{
+  "text": "text to embed",
+  "embedder": "e5-small-v2"
+}' \
+/embedding

+If you're running the app locally, you can use normal ``curl``:

curl 'http://127.0.0.1:8080/embedding' \
--X POST --data-raw $'{ \
-"text": "text to embed", \
-"embedder": "hugging-face-embedder" \
+-X POST --data-raw \
+'{
+  "text": "text to embed",
+  "embedder": "e5-small-v2"
}'

-The output should look something like this:
+The output should look something like this in both cases:

{
"embedder":"hugging-face-embedder",
"embedder":"e5-small-v2",
"text":"text to embed",
"embedding":"tensor<float>(x[384]):[-0.5786399, 0.20775521, ...]"
}
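
For reference, here is a minimal Java sketch of the same request against a local deployment, using only the JDK's built-in ``java.net.http`` client (Java 11+). The endpoint and payload come from the curl example above; the ``EmbeddingClient`` class name is a placeholder:

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Minimal sketch, assuming a local deployment listening on 127.0.0.1:8080.
public class EmbeddingClient {
    public static void main(String[] args) throws Exception {
        // Same JSON payload as the curl example above.
        String body = "{\"text\": \"text to embed\", \"embedder\": \"e5-small-v2\"}";
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://127.0.0.1:8080/embedding"))
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.body()); // JSON with "embedder", "text", and "embedding" fields
    }
}
```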

+## Adding more local embedders
+
+More embedders from the [model hub](https://cloud.vespa.ai/en/model-hub) can be added
+for local deployments, but this increases compile and deployment time.
+To add a model, download its ``model.onnx`` and ``tokenizer.json`` files and add them
+to a new subdirectory in ``src/main/application/embedder-models``.
+Then, add it as a component in ``services.xml``, as sketched below.
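
As a rough sketch (``my-embedder`` is a hypothetical placeholder for the model's directory and component id), such a component definition follows the same pattern as the ``e5-small-v2`` entry in the ``services.xml`` diff below:

```xml
<!-- Hypothetical example: replace "my-embedder" with the real model name -->
<component id="my-embedder" type="hugging-face-embedder">
    <transformer-model path="embedder-models/my-embedder/model.onnx"/>
    <tokenizer-model path="embedder-models/my-embedder/tokenizer.json"/>
</component>
```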


32 changes: 24 additions & 8 deletions examples/embedding-service/src/main/application/services.xml
@@ -36,18 +36,34 @@
<binding>http://*/embedding</binding>
</handler>

<component id="hugging-face-embedder" type="hugging-face-embedder">
<transformer-model path="models/model.onnx"/>
<tokenizer-model path="models/tokenizer.json"/>
+<!-- Embedders fetched from https://cloud.vespa.ai/en/model-hub -->
+
+<!-- Available locally and in Vespa Cloud -->
+<component id="e5-small-v2" type="hugging-face-embedder">
+    <transformer-model
+        path="embedder-models/e5-small-v2/model.onnx"
+        model-id="e5-small-v2"
+    />
+    <tokenizer-model path="embedder-models/e5-small-v2/tokenizer.json"/>
</component>
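<!-- When both path and model-id are specified, self-hosted deployments use the local path, while Vespa Cloud resolves model-id -->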

-<!-- TODO Add more components
-<component id="another-embedder" type="hugging-face-embedder">
-    <transformer-model path="models/model.onnx"/>
-    <tokenizer-model path="models/tokenizer.json"/>
+<!-- Available only in Vespa Cloud -->
+<component id="e5-base-v2" type="hugging-face-embedder">
+    <transformer-model model-id="e5-base-v2"/>
+</component>

<component id="e5-large-v2" type="hugging-face-embedder">
<transformer-model model-id="e5-large-v2" />
</component>
-->

<component id="multilingual-e5-base" type="hugging-face-embedder">
<transformer-model model-id="multilingual-e5-base" />
</component>

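<!-- Unlike the hugging-face-embedder components above, the bert-embedder type takes a vocabulary file (tokenizer-vocab) rather than a tokenizer.json -->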
<component id="minilm-l6-v2" type="bert-embedder">
<transformer-model model-id="minilm-l6-v2" />
<tokenizer-vocab model-id="bert-base-uncased"/>
</component>

<!--
<nodes> specifies the nodes that should run this cluster, and through the <resources>
30 changes: 29 additions & 1 deletion EmbeddingHandler.java
@@ -12,12 +12,22 @@

import java.io.IOException;
import java.io.OutputStream;
+import java.util.Map;
import java.util.concurrent.Executor;

public class EmbeddingHandler extends ThreadedHttpRequestHandler {
private ComponentRegistry<Embedder> availableEmbedders;
private ObjectMapper jsonMapper;

+    // Mappings fetched from https://cloud.vespa.ai/en/model-hub#using-e5-models
+    private final Map<String, TensorType> modelTensorTypeMap = Map.of(
+            "e5-small-v2", TensorType.fromSpec("tensor<float>(x[384])"),
+            "e5-base-v2", TensorType.fromSpec("tensor<float>(x[768])"),
+            "e5-large-v2", TensorType.fromSpec("tensor<float>(x[1024])"),
+            "multilingual-e5-base", TensorType.fromSpec("tensor<float>(x[768])"),
+            "minilm-l6-v2", TensorType.fromSpec("tensor<float>(x[384])")
+    );

@Inject
public EmbeddingHandler(Executor executor, ComponentRegistry<Embedder> embedders) {
super(executor);
@@ -30,8 +40,26 @@ public HttpResponse handle(HttpRequest httpRequest) {
Data requestData = parseRequestJson(httpRequest);

Embedder embedder = availableEmbedders.getComponent(requestData.embedder());
+        if (embedder == null) {
+            return new HttpResponse(400) {
+                @Override
+                public void render(OutputStream outputStream) throws IOException {
+                    outputStream.write(jsonMapper.writeValueAsBytes(Map.of("error", "Embedder '" + requestData.embedder() + "' not found")));
+                }
+            };
+        }

+        TensorType type = modelTensorTypeMap.get(requestData.embedder());
+        if (type == null) {
+            return new HttpResponse(400) {
+                @Override
+                public void render(OutputStream outputStream) throws IOException {
+                    outputStream.write(jsonMapper.writeValueAsBytes(Map.of("error", "TensorType for embedder '" + requestData.embedder() + "' not found")));
+                }
+            };
+        }

Embedder.Context context = new Embedder.Context("");
-        TensorType type = TensorType.fromSpec("tensor<float>(x[384])");
String embedding = embedder.embed(requestData.text(), context, type).toString();

Data responseData = new Data(requestData.text(), requestData.embedder(), embedding);
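// For reference, the Data type used here is not shown in this diff; from its usage it is
// assumed to be a simple record along the lines of:
//     record Data(String text, String embedder, String embedding) { }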
