Skip to content

Commit

Permalink
Setting up Mongo Vector as an extension of the primary Mongo extension
Browse files Browse the repository at this point in the history
  • Loading branch information
bowencode committed Mar 29, 2024
1 parent 3bd4579 commit 77a81c8
Show file tree
Hide file tree
Showing 17 changed files with 160 additions and 10 deletions.
7 changes: 7 additions & 0 deletions CosmosDbDataMigrationTool.sln
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PostgreSQL", "PostgreSQL",
Extensions\PostgreSQL\README.md = Extensions\PostgreSQL\README.md
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj", "{31BC84E1-55E5-45AA-BFAC-90732F20588B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -193,6 +195,10 @@ Global
{85820167-DB94-458B-B09B-9E823996C692}.Debug|Any CPU.Build.0 = Debug|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Release|Any CPU.ActiveCfg = Release|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Release|Any CPU.Build.0 = Release|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -226,6 +232,7 @@ Global
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E} = {39930280-DA29-4814-837B-FA7F252EB3EC}
{85820167-DB94-458B-B09B-9E823996C692} = {1B927C5F-50FC-42A6-BAF6-B00E6D760543}
{1B927C5F-50FC-42A6-BAF6-B00E6D760543} = {A8A1CEAB-2D82-460C-9B86-74ABD17CD201}
{31BC84E1-55E5-45AA-BFAC-90732F20588B} = {F18E789A-D32D-48D3-B75F-1196D7215F74}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {662B3F27-70D8-45E6-A1C0-1438A9C8A542}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
using MongoDB.Driver;
using MongoDB.Driver.Core.Events;

namespace Cosmos.DataTransfer.MongoVectorExtension;
namespace Cosmos.DataTransfer.MongoExtension;
public class Context
{
private readonly IMongoDatabase database = null!;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<!-- Project file for the Cosmos.DataTransfer.MongoExtension assembly. -->
<Project Sdk="Microsoft.NET.Sdk">

<!-- Build settings: targets net6.0 with implicit usings and nullable reference types enabled; output is an executable. -->
<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<OutputType>Exe</OutputType>
</PropertyGroup>

<!-- NuGet dependencies: configuration binding, the MongoDB driver, and the composition package that provides the Export attribute. -->
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="6.0.0" />
<PackageReference Include="MongoDB.Driver" Version="2.19.1" />
<PackageReference Include="System.ComponentModel.Composition" Version="6.0.0" />
</ItemGroup>

<!-- Shared extension contracts from the Interfaces project. -->
<ItemGroup>
<ProjectReference Include="..\..\..\Interfaces\Cosmos.DataTransfer.Interfaces\Cosmos.DataTransfer.Interfaces.csproj" />
</ItemGroup>

<!-- After a Debug build, publish the already-built output (no rebuild) using the PublishToExtensionsFolder publish profile. -->
<Target Name="PublishToExtensionsFolder" AfterTargets="Build" Condition=" '$(Configuration)' == 'Debug' ">
<Exec Command="dotnet publish --configuration $(Configuration) --no-build -p:PublishProfile=PublishToExtensionsFolder" />
</Target>

</Project>
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using System.Linq.Expressions;

namespace Cosmos.DataTransfer.MongoVectorExtension;
namespace Cosmos.DataTransfer.MongoExtension;

public interface IRepository<TDocument>
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
using Cosmos.DataTransfer.Interfaces;
using MongoDB.Bson;

namespace Cosmos.DataTransfer.MongoVectorExtension;
namespace Cosmos.DataTransfer.MongoExtension;
public class MongoDataItem : IDataItem
{
private readonly BsonDocument record;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
using System.ComponentModel.Composition;
using Cosmos.DataTransfer.Interfaces;
using Cosmos.DataTransfer.MongoExtension.Settings;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using MongoDB.Bson;

namespace Cosmos.DataTransfer.MongoExtension;
/// <summary>
/// Data sink extension that writes incoming data items to a MongoDB collection,
/// inserting them in batches.
/// </summary>
[Export(typeof(IDataSinkExtension))]
public class MongoDataSinkExtension : IDataSinkExtensionWithSettings
{
    /// <summary>
    /// Batch size used when <see cref="MongoSinkSettings.BatchSize"/> is not set or is not a positive number.
    /// </summary>
    private const int DefaultBatchSize = 1000;

    /// <summary>Display name of this sink extension.</summary>
    public string DisplayName => "MongoDB";

    /// <summary>
    /// Reads every item from <paramref name="dataItems"/>, converts it to a <see cref="BsonDocument"/>
    /// and adds the documents to the configured collection one batch at a time.
    /// </summary>
    /// <param name="dataItems">Stream of items to write.</param>
    /// <param name="config">Configuration that is bound to <see cref="MongoSinkSettings"/>.</param>
    /// <param name="dataSource">Source extension the items come from (not used by this sink).</param>
    /// <param name="logger">Logger that receives progress and summary messages.</param>
    /// <param name="cancellationToken">Token observed while enumerating <paramref name="dataItems"/>.</param>
    /// <returns>A task that completes once all items have been handed to the repository.</returns>
    public async Task WriteAsync(IAsyncEnumerable<IDataItem> dataItems, IConfiguration config, IDataSourceExtension dataSource, ILogger logger, CancellationToken cancellationToken = default)
    {
        var settings = config.Get<MongoSinkSettings>();
        settings.Validate();

        // Nothing can be written without a connection string, database and collection.
        if (string.IsNullOrEmpty(settings.ConnectionString) || string.IsNullOrEmpty(settings.DatabaseName) || string.IsNullOrEmpty(settings.Collection))
        {
            return;
        }

        var context = new Context(settings.ConnectionString, settings.DatabaseName);
        var repo = context.GetRepository<BsonDocument>(settings.Collection);

        // A zero or negative batch size could never be reached by the buffer count, which would
        // silently buffer the whole data set in memory; fall back to the default instead.
        var batchSize = settings.BatchSize is int requested && requested > 0 ? requested : DefaultBatchSize;

        var objects = new List<BsonDocument>();
        int itemCount = 0;
        await foreach (var item in dataItems.WithCancellation(cancellationToken))
        {
            var dict = item.BuildDynamicObjectTree();
            objects.Add(new BsonDocument(dict));
            itemCount++;

            if (objects.Count >= batchSize)
            {
                await repo.AddRange(objects);
                // itemCount is the running total, not the size of this batch.
                logger.LogInformation("Added {ItemCount} items to collection '{Collection}'", itemCount, settings.Collection);
                objects.Clear();
            }
        }

        // Flush the final, partially filled batch.
        if (objects.Count > 0)
        {
            await repo.AddRange(objects);
        }

        if (itemCount > 0)
        {
            logger.LogInformation("Added {ItemCount} total items to collection '{Collection}'", itemCount, settings.Collection);
        }
        else
        {
            logger.LogWarning("No items added to collection '{Collection}'", settings.Collection);
        }
    }

    /// <summary>
    /// Returns a default instance of each settings type used by this sink.
    /// </summary>
    public IEnumerable<IDataExtensionSettings> GetSettings()
    {
        yield return new MongoSinkSettings();
    }
}
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
using System.ComponentModel.Composition;
using System.Runtime.CompilerServices;
using Cosmos.DataTransfer.Interfaces;
using Cosmos.DataTransfer.MongoVectorExtension.Settings;
using Cosmos.DataTransfer.MongoExtension.Settings;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using MongoDB.Bson;

namespace Cosmos.DataTransfer.MongoVectorExtension;
namespace Cosmos.DataTransfer.MongoExtension;
[Export(typeof(IDataSourceExtension))]
internal class MongoVectorDataSourceExtension : IDataSourceExtensionWithSettings
internal class MongoDataSourceExtension : IDataSourceExtensionWithSettings
{
public string DisplayName => $"MongoDB-Vector{ExtensionExtensions.BetaExtensionTag}";
public string DisplayName => "MongoDB";

public async IAsyncEnumerable<IDataItem> ReadAsync(IConfiguration config, ILogger logger, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
using System.Linq.Expressions;
using MongoDB.Driver;

namespace Cosmos.DataTransfer.MongoVectorExtension;
namespace Cosmos.DataTransfer.MongoExtension;

public class MongoRepository<TDocument> : IRepository<TDocument>
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Top-level statement for the extension executable: only prints a startup message.
Console.WriteLine("Starting Mongo extension");
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
https://go.microsoft.com/fwlink/?LinkID=208121.
-->
<!-- Folder publish profile: the two groups below differ only in configuration name and output directory. -->
<Project>
<!-- Debug builds: publish framework-dependent output into the Core project's Debug Extensions folder. -->
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
<Configuration>Debug</Configuration>
<Platform>Any CPU</Platform>
<PublishDir>..\..\..\Core\Cosmos.DataTransfer.Core\bin\Debug\net6.0\Extensions</PublishDir>
<PublishProtocol>FileSystem</PublishProtocol>
<_TargetId>Folder</_TargetId>
<TargetFramework>net6.0</TargetFramework>
<SelfContained>false</SelfContained>
</PropertyGroup>
<!-- Any configuration other than Debug: publish as Release into the Core project's Release Extensions folder. -->
<PropertyGroup Condition=" '$(Configuration)' != 'Debug' ">
<Configuration>Release</Configuration>
<Platform>Any CPU</Platform>
<PublishDir>..\..\..\Core\Cosmos.DataTransfer.Core\bin\Release\net6.0\Extensions</PublishDir>
<PublishProtocol>FileSystem</PublishProtocol>
<_TargetId>Folder</_TargetId>
<TargetFramework>net6.0</TargetFramework>
<SelfContained>false</SelfContained>
</PropertyGroup>
</Project>
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
using Cosmos.DataTransfer.Interfaces;
using Cosmos.DataTransfer.Interfaces.Manifest;

namespace Cosmos.DataTransfer.MongoVectorExtension.Settings;
namespace Cosmos.DataTransfer.MongoExtension.Settings;
public class MongoBaseSettings : IDataExtensionSettings
{
[Required]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
using System.ComponentModel.DataAnnotations;

namespace Cosmos.DataTransfer.MongoExtension.Settings;
/// <summary>
/// Settings for the MongoDB sink: everything from <see cref="MongoBaseSettings"/>
/// plus the target collection and an optional batch size.
/// </summary>
public class MongoSinkSettings : MongoBaseSettings
{
    /// <summary>
    /// Name of the collection the sink writes to. Marked as required.
    /// </summary>
    [Required]
    public string? Collection { get; set; }

    /// <summary>
    /// Optional number of items the sink buffers before adding them to the collection.
    /// When left unset the sink uses its own default.
    /// </summary>
    public int? BatchSize { get; set; }
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
namespace Cosmos.DataTransfer.MongoVectorExtension.Settings;
namespace Cosmos.DataTransfer.MongoExtension.Settings;
public class MongoSourceSettings : MongoBaseSettings
{
public string? Collection { get; set; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

<ItemGroup>
<ProjectReference Include="..\..\..\Interfaces\Cosmos.DataTransfer.Interfaces\Cosmos.DataTransfer.Interfaces.csproj" />
<ProjectReference Include="..\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj" />
</ItemGroup>

<Target Name="PublishToExtensionsFolder" AfterTargets="Build" Condition=" '$(Configuration)' == 'Debug' ">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using Azure;
using Azure.AI.OpenAI;
using Cosmos.DataTransfer.Interfaces;
using Cosmos.DataTransfer.MongoExtension;
using Cosmos.DataTransfer.MongoVectorExtension.Settings;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.ComponentModel.DataAnnotations;
using Cosmos.DataTransfer.MongoExtension.Settings;

namespace Cosmos.DataTransfer.MongoVectorExtension.Settings;
public class MongoVectorSinkSettings : MongoBaseSettings
Expand Down
23 changes: 23 additions & 0 deletions Extensions/Mongo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,28 @@ Source and sink settings require both `ConnectionString` and `DatabaseName` para

### Sink

```json
{
"ConnectionString": "",
"DatabaseName": "",
"Collection": ""
}
```

# MongoDB Vector Extension (Beta)

The MongoDB Vector extension is a sink-only extension that builds on the MongoDB extension by providing additional capabilities for generating embeddings using Azure OpenAI APIs.

> **Note**: When specifying the MongoDB Vector extension as the Sink property in configuration, utilize the name **MongoDB-Vector(beta)**.
## Settings

The settings are based on the MongoDB extension settings with additional parameters for generating embeddings.

### Additional Sink Settings

The sink settings require the following additional parameters:

- `GenerateEmbedding`: If set to true, the sink will generate embeddings for the records before writing them to the database. The sink requires the `OpenAIUrl`, `OpenAIKey`, and `OpenAIDeploymentModel` parameters to be set. The following parameters are required if this is true:
- `OpenAIUrl`: The URL of the OpenAI API
- `OpenAIKey`: The API key for the OpenAI API
Expand All @@ -41,3 +63,4 @@ Source and sink settings require both `ConnectionString` and `DatabaseName` para
"DestPropEmbedding": ""
}
```

0 comments on commit 77a81c8

Please sign in to comment.