From c89fe04dce3b999c36c7f130b8eb929b11d6e97b Mon Sep 17 00:00:00 2001 From: Abby Harrison <54643756+awharrison-28@users.noreply.github.com> Date: Mon, 31 Jul 2023 21:02:49 -0700 Subject: [PATCH] .Net: DotNet: Remove cosmosdb memory store (#2239) ### Motivation and Context The CosmosDB memory store is an extremely inefficient and expensive way to store vectors in cloud storage. If a user wishes to use Azure to host a vector indexing service, existing SK Memory alternatives that we recommend are: - Azure Cognitive Search - Postgres Memory Store - pgvector can be enabled for Azure Database for PostgreSQL https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/how-to-use-pgvector ### Description Removes CosmosMemoryStore from SK repo. - [ ] The code builds clean without any errors or warnings - [ ] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [ ] All unit tests pass, and I have added new tests where possible - [ ] I didn't break anyone :smile: --------- Co-authored-by: Gil LaHaye --- dotnet/SK-dotnet.sln | 10 +- .../Connectors.Memory.CosmosDB.csproj | 29 -- .../CosmosMemoryRecord.cs | 39 --- .../CosmosMemoryStore.cs | 324 ------------------ 4 files changed, 1 insertion(+), 401 deletions(-) delete mode 100644 dotnet/src/Connectors/Connectors.Memory.CosmosDB/Connectors.Memory.CosmosDB.csproj delete mode 100644 dotnet/src/Connectors/Connectors.Memory.CosmosDB/CosmosMemoryRecord.cs delete mode 100644 dotnet/src/Connectors/Connectors.Memory.CosmosDB/CosmosMemoryStore.cs diff --git a/dotnet/SK-dotnet.sln b/dotnet/SK-dotnet.sln index cef07010a480..bde158c3c1ac 100644 --- a/dotnet/SK-dotnet.sln +++ b/dotnet/SK-dotnet.sln @@ -61,8 +61,6 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Memory.Qdrant", EndProject 
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Memory.Sqlite", "src\Connectors\Connectors.Memory.Sqlite\Connectors.Memory.Sqlite.csproj", "{EC004F12-2F60-4EDD-B3CD-3A504900D929}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Memory.CosmosDB", "src\Connectors\Connectors.Memory.CosmosDB\Connectors.Memory.CosmosDB.csproj", "{EA61C289-7928-4B78-A9C1-7AAD61F907CD}" -EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Memory.Postgres", "src\Connectors\Connectors.Memory.Postgres\Connectors.Memory.Postgres.csproj", "{C9F957FA-A70F-4A6D-8F95-23FCD7F4FB87}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Connectors.Memory.Redis", "src\Connectors\Connectors.Memory.Redis\Connectors.Memory.Redis.csproj", "{3720F5ED-FB4D-485E-8A93-CDE60DEF0805}" @@ -240,11 +238,6 @@ Global {EC004F12-2F60-4EDD-B3CD-3A504900D929}.Publish|Any CPU.Build.0 = Publish|Any CPU {EC004F12-2F60-4EDD-B3CD-3A504900D929}.Release|Any CPU.ActiveCfg = Release|Any CPU {EC004F12-2F60-4EDD-B3CD-3A504900D929}.Release|Any CPU.Build.0 = Release|Any CPU - {EA61C289-7928-4B78-A9C1-7AAD61F907CD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {EA61C289-7928-4B78-A9C1-7AAD61F907CD}.Debug|Any CPU.Build.0 = Debug|Any CPU - {EA61C289-7928-4B78-A9C1-7AAD61F907CD}.Publish|Any CPU.ActiveCfg = Release|Any CPU - {EA61C289-7928-4B78-A9C1-7AAD61F907CD}.Release|Any CPU.ActiveCfg = Release|Any CPU - {EA61C289-7928-4B78-A9C1-7AAD61F907CD}.Release|Any CPU.Build.0 = Release|Any CPU {C9F957FA-A70F-4A6D-8F95-23FCD7F4FB87}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {C9F957FA-A70F-4A6D-8F95-23FCD7F4FB87}.Debug|Any CPU.Build.0 = Debug|Any CPU {C9F957FA-A70F-4A6D-8F95-23FCD7F4FB87}.Publish|Any CPU.ActiveCfg = Publish|Any CPU @@ -390,7 +383,6 @@ Global {EB3FC57F-E591-4C88-BCD5-B6A1BC635168} = {0247C2C9-86C3-45BA-8873-28B0948EDC0C} {5DEBAA62-F117-496A-8778-FED3604B70E2} = {0247C2C9-86C3-45BA-8873-28B0948EDC0C} {EC004F12-2F60-4EDD-B3CD-3A504900D929} = 
{0247C2C9-86C3-45BA-8873-28B0948EDC0C} - {EA61C289-7928-4B78-A9C1-7AAD61F907CD} = {0247C2C9-86C3-45BA-8873-28B0948EDC0C} {C9F957FA-A70F-4A6D-8F95-23FCD7F4FB87} = {0247C2C9-86C3-45BA-8873-28B0948EDC0C} {3720F5ED-FB4D-485E-8A93-CDE60DEF0805} = {0247C2C9-86C3-45BA-8873-28B0948EDC0C} {185E0CE8-C2DA-4E4C-A491-E8EB40316315} = {0247C2C9-86C3-45BA-8873-28B0948EDC0C} @@ -415,12 +407,12 @@ Global {B00AD427-0047-4850-BEF9-BA8237EA9D8B} = {958AD708-F048-4FAF-94ED-D2F2B92748B9} {DB950192-30F1-48B1-88D7-F43FECCA1A1C} = {958AD708-F048-4FAF-94ED-D2F2B92748B9} {1C19D805-3573-4477-BF07-40180FCDE1BD} = {958AD708-F048-4FAF-94ED-D2F2B92748B9} + {3CDE10B2-AE8F-4FC4-8D55-92D4AD32E144} = {958AD708-F048-4FAF-94ED-D2F2B92748B9} {0D0C4DAD-E6BC-4504-AE3A-EEA4E35920C1} = {9ECD1AA0-75B3-4E25-B0B5-9F0945B64974} {E6EDAB8F-3406-4DBF-9AAB-DF40DC2CA0FA} = {FA3720F1-C99A-49B2-9577-A940257098BF} {677F1381-7830-4115-9C1A-58B282629DC6} = {0247C2C9-86C3-45BA-8873-28B0948EDC0C} {4762BCAF-E1C5-4714-B88D-E50FA333C50E} = {078F96B4-09E1-4E0E-B214-F71A4F4BF633} {C754950A-E16C-4F96-9CC7-9328E361B5AF} = {FA3720F1-C99A-49B2-9577-A940257098BF} - {3CDE10B2-AE8F-4FC4-8D55-92D4AD32E144} = {958AD708-F048-4FAF-94ED-D2F2B92748B9} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FBDC56A3-86AD-4323-AA0F-201E59123B83} diff --git a/dotnet/src/Connectors/Connectors.Memory.CosmosDB/Connectors.Memory.CosmosDB.csproj b/dotnet/src/Connectors/Connectors.Memory.CosmosDB/Connectors.Memory.CosmosDB.csproj deleted file mode 100644 index 8761054a8de9..000000000000 --- a/dotnet/src/Connectors/Connectors.Memory.CosmosDB/Connectors.Memory.CosmosDB.csproj +++ /dev/null @@ -1,29 +0,0 @@ - - - - - Microsoft.SemanticKernel.Connectors.Memory.AzureCosmosDb - $(AssemblyName) - netstandard2.0 - - - - - - - - - Semantic Kernel - Azure Cosmos Db Connector - Azure Cosmos Db connector for Semantic Kernel skills and semantic memory - - - - - - - - - - - - diff --git 
a/dotnet/src/Connectors/Connectors.Memory.CosmosDB/CosmosMemoryRecord.cs b/dotnet/src/Connectors/Connectors.Memory.CosmosDB/CosmosMemoryRecord.cs deleted file mode 100644 index 1d2d10f0e40f..000000000000 --- a/dotnet/src/Connectors/Connectors.Memory.CosmosDB/CosmosMemoryRecord.cs +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -using System; -using Newtonsoft.Json; -using Newtonsoft.Json.Serialization; - -namespace Microsoft.SemanticKernel.Connectors.Memory.AzureCosmosDb; - -/// -/// A Cosmos memory record. -/// -[JsonObject(NamingStrategyType = typeof(CamelCaseNamingStrategy))] -public class CosmosMemoryRecord -{ - /// - /// Unique identifier of the memory record. - /// - public string Id { get; set; } = string.Empty; - - /// - /// Unique identifier of the collection. - /// - public string CollectionId { get; set; } = string.Empty; - - /// - /// Optional timestamp. - /// - public DateTimeOffset? Timestamp { get; set; } - - /// - /// The embedding data as a string. - /// - public string EmbeddingString { get; set; } = string.Empty; - - /// - /// Metadata as a string. - /// - public string MetadataString { get; set; } = string.Empty; -} diff --git a/dotnet/src/Connectors/Connectors.Memory.CosmosDB/CosmosMemoryStore.cs b/dotnet/src/Connectors/Connectors.Memory.CosmosDB/CosmosMemoryStore.cs deleted file mode 100644 index 76a35a73753b..000000000000 --- a/dotnet/src/Connectors/Connectors.Memory.CosmosDB/CosmosMemoryStore.cs +++ /dev/null @@ -1,324 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. 
- -using System.Collections.Generic; -using System.Linq; -using System.Net; -using System.Runtime.CompilerServices; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.Azure.Cosmos; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Logging.Abstractions; -using Microsoft.SemanticKernel.AI.Embeddings; -using Microsoft.SemanticKernel.AI.Embeddings.VectorOperations; -using Microsoft.SemanticKernel.Memory; -using Microsoft.SemanticKernel.Memory.Collections; - -namespace Microsoft.SemanticKernel.Connectors.Memory.AzureCosmosDb; - -/// -/// An implementation of for Azure Cosmos DB. -/// -/// The Embedding data is saved to the Azure Cosmos DB database container specified in the constructor. -/// The embedding data persists between subsequent instances and has similarity search capability, handled by the client as Azure Cosmos DB is not a vector-native DB. -/// -public sealed class CosmosMemoryStore : IMemoryStore -{ - private Database _database; - private string _databaseName; - private ILogger _logger; - -#pragma warning disable CS8618 // Non-nullable field is uninitialized: Class instance is created and populated via factory method. - private CosmosMemoryStore() - { - } -#pragma warning restore CS8618 // Non-nullable field is uninitialized - - /// - /// Factory method to initialize a new instance of the class. - /// - /// Client with endpoint and authentication to the Azure CosmosDB Account. - /// The name of the database to back the memory store. - /// Optional logger. - /// The to monitor for cancellation requests. The default is . - /// - public static async Task CreateAsync(CosmosClient client, string databaseName, ILogger? logger = null, CancellationToken cancellationToken = default) - { - var newStore = new CosmosMemoryStore(); - - newStore._databaseName = databaseName; - newStore._logger = logger ?? 
NullLogger.Instance; - var response = await client.CreateDatabaseIfNotExistsAsync(newStore._databaseName, cancellationToken: cancellationToken).ConfigureAwait(false); - - if (response.StatusCode == HttpStatusCode.Created) - { - newStore._logger.LogDebug("Created database {0}", newStore._databaseName); - } - else if (response.StatusCode == HttpStatusCode.OK) - { - newStore._logger.LogDebug("Database {0}", newStore._databaseName); - } - else - { - throw new CosmosException("Database does not exist and was not created", response.StatusCode, 0, newStore._databaseName, 0); - } - - newStore._database = response.Database; - - return newStore; - } - - /// - public IAsyncEnumerable GetCollectionsAsync(CancellationToken cancellationToken = default) - { - // Azure Cosmos DB does not support listing all Containers, this does not break the interface but it is not ideal. - this._logger.LogWarning("Listing all containers is not supported by Azure Cosmos DB, returning empty list."); - - return Enumerable.Empty().ToAsyncEnumerable(); - } - - /// - public async Task CreateCollectionAsync(string collectionName, CancellationToken cancellationToken = default) - { - var response = await this._database.CreateContainerIfNotExistsAsync(collectionName, "/" + collectionName, cancellationToken: cancellationToken).ConfigureAwait(false); - - if (response.StatusCode == HttpStatusCode.Created) - { - this._logger.LogDebug("Created collection {0}", collectionName); - } - else if (response.StatusCode == HttpStatusCode.OK) - { - this._logger.LogDebug("Collection {0} already exists", collectionName); - } - else - { - throw new CosmosException("Collection does not exist and was not created", response.StatusCode, 0, collectionName, 0); - } - } - - /// - public Task DoesCollectionExistAsync(string collectionName, CancellationToken cancellationToken = default) - { - // Azure Cosmos DB does not support checking if container exists without attempting to create it. 
- // Note that CreateCollectionIfNotExistsAsync() is idempotent. This does not break the interface but it is not ideal. - return Task.FromResult(false); - } - - /// - public async Task DeleteCollectionAsync(string collectionName, CancellationToken cancellationToken = default) - { - var container = this._database.Client.GetContainer(this._databaseName, collectionName); - try - { - await container.DeleteContainerAsync(cancellationToken: cancellationToken).ConfigureAwait(false); - } - catch (CosmosException ex) - { - this._logger.LogError(ex, "Failed to delete collection {0}: {2} - {3}", collectionName, ex.StatusCode, ex.Message); - } - } - - /// - public async Task GetAsync(string collectionName, string key, bool withEmbedding = false, CancellationToken cancellationToken = default) - { - var id = this.ToCosmosFriendlyId(key); - var partitionKey = PartitionKey.None; - - var container = this._database.Client.GetContainer(this._databaseName, collectionName); - MemoryRecord? memoryRecord = null; - - var response = await container.ReadItemAsync(id, partitionKey, cancellationToken: cancellationToken).ConfigureAwait(false); - - if (response == null) - { - this._logger?.LogWarning("Received no get response collection {1}", collectionName); - } - else if (response.StatusCode != HttpStatusCode.OK) - { - this._logger?.LogWarning("Failed to get record from collection {1} with status code {2}", collectionName, response.StatusCode); - } - else - { - var result = response.Resource; - - float[]? vector = withEmbedding ? 
System.Text.Json.JsonSerializer.Deserialize(result.EmbeddingString) : System.Array.Empty(); - - if (vector != null) - { - memoryRecord = MemoryRecord.FromJsonMetadata( - result.MetadataString, - new Embedding(vector, transferOwnership: true), - result.Id, - result.Timestamp); - } - } - - return memoryRecord; - } - - /// - public async IAsyncEnumerable GetBatchAsync(string collectionName, IEnumerable keys, bool withEmbeddings = false, - [EnumeratorCancellation] CancellationToken cancellationToken = default) - { - foreach (var key in keys) - { - var record = await this.GetAsync(collectionName, key, withEmbeddings, cancellationToken).ConfigureAwait(false); - - if (record != null) - { - yield return record; - } - } - } - - /// - public async Task UpsertAsync(string collectionName, MemoryRecord record, CancellationToken cancellationToken = default) - { - record.Key = this.ToCosmosFriendlyId(record.Metadata.Id); - - var entity = new CosmosMemoryRecord - { - CollectionId = this.ToCosmosFriendlyId(collectionName), - Id = record.Key, - Timestamp = record.Timestamp, - EmbeddingString = System.Text.Json.JsonSerializer.Serialize(record.Embedding.Vector), - MetadataString = record.GetSerializedMetadata() - }; - - var container = this._database.Client.GetContainer(this._databaseName, collectionName); - - var response = await container.UpsertItemAsync(entity, cancellationToken: cancellationToken).ConfigureAwait(false); - - if (response.StatusCode is HttpStatusCode.OK or HttpStatusCode.Created) - { - this._logger.LogDebug("Upserted item to collection {0}", collectionName); - } - else - { - throw new CosmosException("Unable to upsert item collection", response.StatusCode, 0, collectionName, 0); - } - - return record.Key; - } - - /// - public async IAsyncEnumerable UpsertBatchAsync(string collectionName, IEnumerable records, [EnumeratorCancellation] CancellationToken cancellationToken = default) - { - foreach (var r in records) - { - yield return await 
this.UpsertAsync(collectionName, r, cancellationToken).ConfigureAwait(false); - } - } - - /// - public async Task RemoveAsync(string collectionName, string key, CancellationToken cancellationToken = default) - { - var container = this._database.Client.GetContainer(this._databaseName, collectionName); - var response = await container.DeleteItemAsync( - key, - PartitionKey.None, - cancellationToken: cancellationToken).ConfigureAwait(false); - - if (response.StatusCode == HttpStatusCode.OK) - { - this._logger.LogDebug("Record deleted from {0}", collectionName); - } - else - { - throw new CosmosException("Unable to delete record", response.StatusCode, 0, collectionName, 0); - } - } - - /// - public async Task RemoveBatchAsync(string collectionName, IEnumerable keys, CancellationToken cancellationToken = default) - { - await Task.WhenAll(keys.Select(k => this.RemoveAsync(collectionName, k, cancellationToken))).ConfigureAwait(false); - } - - /// - public async IAsyncEnumerable<(MemoryRecord, double)> GetNearestMatchesAsync( - string collectionName, - Embedding embedding, - int limit, - double minRelevanceScore = 0, - bool withEmbeddings = false, - [EnumeratorCancellation] CancellationToken cancellationToken = default) - { - { - if (limit <= 0) - { - yield break; - } - - var collectionMemories = new List(); - TopNCollection embeddings = new(limit); - - await foreach (var record in this.GetAllAsync(collectionName, cancellationToken)) - { - if (record != null) - { - double similarity = embedding - .AsReadOnlySpan() - .CosineSimilarity(record.Embedding.AsReadOnlySpan()); - if (similarity >= minRelevanceScore) - { - var entry = withEmbeddings ? 
record : MemoryRecord.FromMetadata(record.Metadata, Embedding.Empty, record.Key, record.Timestamp); - embeddings.Add(new(entry, similarity)); - } - } - } - - embeddings.SortByScore(); - - foreach (var item in embeddings) - { - yield return (item.Value, item.Score.Value); - } - } - } - - /// - public async Task<(MemoryRecord, double)?> GetNearestMatchAsync(string collectionName, Embedding embedding, double minRelevanceScore = 0, bool withEmbedding = false, - CancellationToken cancellationToken = default) - { - return await this.GetNearestMatchesAsync( - collectionName: collectionName, - embedding: embedding, - limit: 1, - minRelevanceScore: minRelevanceScore, - withEmbeddings: withEmbedding, - cancellationToken: cancellationToken).FirstOrDefaultAsync(cancellationToken: cancellationToken).ConfigureAwait(false); - } - - private async IAsyncEnumerable GetAllAsync(string collectionName, [EnumeratorCancellation] CancellationToken cancellationToken = default) - { - var container = this._database.Client.GetContainer(this._databaseName, collectionName); - var query = new QueryDefinition("SELECT * FROM c"); - - var iterator = container.GetItemQueryIterator(query); - - while (iterator.HasMoreResults) //read all result in batch - { - var items = await iterator.ReadNextAsync(cancellationToken).ConfigureAwait(false); - - foreach (var item in items) - { - var vector = System.Text.Json.JsonSerializer.Deserialize(item.EmbeddingString); - - if (vector != null) - { - yield return MemoryRecord.FromJsonMetadata( - item.MetadataString, - new Embedding(vector, transferOwnership: true), - item.Id, - item.Timestamp); - } - } - } - } - - private string ToCosmosFriendlyId(string id) - { - return $"{id.Trim().Replace(' ', '-').Replace('/', '_').Replace('\\', '_').Replace('?', '_').Replace('#', '_').ToUpperInvariant()}"; - } -}