From 78289af2851a10b9f857051cb22bb00c7528f555 Mon Sep 17 00:00:00 2001 From: Marcelo Garcia <129542431+MarceloAGG@users.noreply.github.com> Date: Fri, 30 Aug 2024 03:06:31 -0700 Subject: [PATCH] .Net: Add support for ImageContent to use data URIs in ChatPromptParser so templates can use base64 encoded images. (#8401) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Motivation and Context At present, including images in prompt templates using base64 data encoding is not possible. This limitation is due to `ChatPromptParser.cs` exclusively calling the `ImageContent` constructor that requires a URI, which leads to an `InvalidOperationException`. The change required is straightforward and the limitation has been discussed before, [for example here](https://github.com/microsoft/semantic-kernel/discussions/7121). Closes #7150. ### Description The proposed fix checks whether the content starts with `data:`; if it does, the `ImageContent` constructor that accepts a `dataUri` is used instead. 
### Contribution Checklist - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone :smile: --------- Co-authored-by: Marcelo Garcia 🛸 --- .../HandlebarsVisionPrompts.cs | 51 +++++++++++++++++++ dotnet/samples/Concepts/README.md | 3 +- .../AI/ChatCompletion/ChatPromptParser.cs | 9 +++- .../Prompt/ChatPromptParserTests.cs | 49 ++++++++++++++++++ 4 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 dotnet/samples/Concepts/PromptTemplates/HandlebarsVisionPrompts.cs diff --git a/dotnet/samples/Concepts/PromptTemplates/HandlebarsVisionPrompts.cs b/dotnet/samples/Concepts/PromptTemplates/HandlebarsVisionPrompts.cs new file mode 100644 index 000000000000..195d281da570 --- /dev/null +++ b/dotnet/samples/Concepts/PromptTemplates/HandlebarsVisionPrompts.cs @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft. All rights reserved. + +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.PromptTemplates.Handlebars; + +namespace PromptTemplates; + +// This example shows how to use chat completion handlebars template prompts with base64 encoded images as a parameter. +public class HandlebarsVisionPrompts(ITestOutputHelper output) : BaseTest(output) +{ + [Fact] + public async Task RunAsync() + { + const string HandlebarsTemplate = """ + You are an AI assistant designed to help with image recognition tasks. 
+ + {{request}} + {{imageData}} + + """; + + var kernel = Kernel.CreateBuilder() + .AddOpenAIChatCompletion( + modelId: TestConfiguration.OpenAI.ChatModelId, + apiKey: TestConfiguration.OpenAI.ApiKey) + .Build(); + + var templateFactory = new HandlebarsPromptTemplateFactory(); + var promptTemplateConfig = new PromptTemplateConfig() + { + Template = HandlebarsTemplate, + TemplateFormat = "handlebars", + Name = "Vision_Chat_Prompt", + }; + var function = kernel.CreateFunctionFromPrompt(promptTemplateConfig, templateFactory); + + var arguments = new KernelArguments(new Dictionary + { + {"request","Describe this image:"}, + {"imageData", "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAAXNSR0IArs4c6QAAACVJREFUKFNj/KTO/J+BCMA4iBUyQX1A0I10VAizCj1oMdyISyEAFoQbHwTcuS8AAAAASUVORK5CYII="} + }); + + var response = await kernel.InvokeAsync(function, arguments); + Console.WriteLine(response); + + /* + Output: + The image is a solid block of bright red color. There are no additional features, shapes, or textures present. 
+ */ + } +} diff --git a/dotnet/samples/Concepts/README.md b/dotnet/samples/Concepts/README.md index 26eef28982a7..937d832dfcba 100644 --- a/dotnet/samples/Concepts/README.md +++ b/dotnet/samples/Concepts/README.md @@ -142,7 +142,8 @@ Down below you can find the code snippets that demonstrate the usage of many Sem - [MultiplePromptTemplates](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/PromptTemplates/MultiplePromptTemplates.cs) - [PromptFunctionsWithChatGPT](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/PromptTemplates/PromptFunctionsWithChatGPT.cs) - [TemplateLanguage](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/PromptTemplates/TemplateLanguage.cs) -- [PromptyFunction](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/PromptYemplates/PromptyFunction.cs) +- [PromptyFunction](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/PromptTemplates/PromptyFunction.cs) +- [HandlebarsVisionPrompts](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/PromptTemplates/HandlebarsVisionPrompts.cs) ## RAG - Retrieval-Augmented Generation diff --git a/dotnet/src/SemanticKernel.Abstractions/AI/ChatCompletion/ChatPromptParser.cs b/dotnet/src/SemanticKernel.Abstractions/AI/ChatCompletion/ChatPromptParser.cs index c9cae7acb070..12d63de28d3c 100644 --- a/dotnet/src/SemanticKernel.Abstractions/AI/ChatCompletion/ChatPromptParser.cs +++ b/dotnet/src/SemanticKernel.Abstractions/AI/ChatCompletion/ChatPromptParser.cs @@ -75,7 +75,14 @@ private static ChatMessageContent ParseChatNode(PromptNode node) { if (childNode.TagName.Equals(ImageTagName, StringComparison.OrdinalIgnoreCase)) { - items.Add(new ImageContent(new Uri(childNode.Content!))); + if (childNode.Content!.StartsWith("data:", StringComparison.OrdinalIgnoreCase)) + { + items.Add(new ImageContent(childNode.Content)); + } + else + { + items.Add(new 
ImageContent(new Uri(childNode.Content!))); + } } else if (childNode.TagName.Equals(TextTagName, StringComparison.OrdinalIgnoreCase)) { diff --git a/dotnet/src/SemanticKernel.UnitTests/Prompt/ChatPromptParserTests.cs b/dotnet/src/SemanticKernel.UnitTests/Prompt/ChatPromptParserTests.cs index ecb051b7d7b1..e3ad0cd53a5c 100644 --- a/dotnet/src/SemanticKernel.UnitTests/Prompt/ChatPromptParserTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Prompt/ChatPromptParserTests.cs @@ -114,6 +114,40 @@ public void ItReturnsChatHistoryWithValidContentItemsIncludeCData() """, c.Content)); } + [Fact] + public void ItReturnsChatHistoryWithValidDataImageContent() + { + // Arrange + string prompt = GetValidPromptWithDataUriImageContent(); + + // Act + bool result = ChatPromptParser.TryParse(prompt, out var chatHistory); + + // Assert + Assert.True(result); + Assert.NotNull(chatHistory); + + Assert.Collection(chatHistory, + c => Assert.Equal("What can I help with?", c.Content), + c => + { + Assert.Equal("Explain this image", c.Content); + Assert.Collection(c.Items, + o => + { + Assert.IsType(o); + Assert.Equal("Explain this image", ((TextContent)o).Text); + }, + o => + { + Assert.IsType(o); + Assert.Equal("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAAXNSR0IArs4c6QAAACVJREFUKFNj/KTO/J+BCMA4iBUyQX1A0I10VAizCj1oMdyISyEAFoQbHwTcuS8AAAAASUVORK5CYII=", ((ImageContent)o).DataUri); + Assert.Equal("image/png", ((ImageContent)o).MimeType); + Assert.NotNull(((ImageContent)o).Data); + }); + }); + } + [Fact] public void ItReturnsChatHistoryWithValidContentItemsIncludeCode() { @@ -210,6 +244,21 @@ Second line. """; } + private static string GetValidPromptWithDataUriImageContent() + { + return + """ + + What can I help with? 
+ + + Explain this image + data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAAXNSR0IArs4c6QAAACVJREFUKFNj/KTO/J+BCMA4iBUyQX1A0I10VAizCj1oMdyISyEAFoQbHwTcuS8AAAAASUVORK5CYII= + + + """; + } + private static string GetValidPromptWithCDataSection() { return