From d055c7e187f355d54ff1407e356d3f92de67bffa Mon Sep 17 00:00:00 2001
From: TesAnti <8780022+TesAnti@users.noreply.github.com>
Date: Wed, 18 Oct 2023 22:58:08 +0300
Subject: [PATCH] feat: Added RecursiveCharacterTextSplitter. Closes #36

* fix for LlmChain OutputKey. The result of CallAsync always contained only the ["text"] key, ignoring OutputKey. This caused an error when used with SequentialChain.

* implemented verbosity at LLMChain. Implemented ToString for the Message class.

* ported CharacterTextSplitter
- had to create and use the TextSplitterBase class and the Document class to match the structure of the original Python implementation
- I have not implemented the methods that rely on external sources of information (from_huggingface_tokenizer, from_tiktoken_encoder)
- added comments explaining the logic behind the implementation; they should help newcomers understand what it does (the original Python implementation is hard to follow in places)

* changed LengthFunctionDelegate to Func<string, int>

* Revert "changed LengthFunctionDelegate to Func<string, int>"

This reverts commit 9c340d4559b65bfae1dbdc2a490478700d5291c4.

* changed LengthFunctionDelegate to Func<string, int>

* ported RecursiveCharacterTextSplitter
added tests

---------

Co-authored-by: Konstantin S <havendv@gmail.com>
---
 src/libs/LangChain.Core/Base/TextSplitter.cs  | 15 ++--
 .../RecursiveCharacterTextSplitter.cs         | 80 +++++++++++++++++++
 .../TextSplitterTests.cs                      | 24 ++++++
 3 files changed, 114 insertions(+), 5 deletions(-)
 create mode 100644 src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs
diff --git a/src/libs/LangChain.Core/Base/TextSplitter.cs b/src/libs/LangChain.Core/Base/TextSplitter.cs
index 6cd7884a..0e0d1400 100644
--- a/src/libs/LangChain.Core/Base/TextSplitter.cs
+++ b/src/libs/LangChain.Core/Base/TextSplitter.cs
@@ -29,6 +29,10 @@ protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string
         _lengthFunction = lengthFunction ?? new Func<string, int>((str) => str.Length);
     }
 
+    protected int ChunkSize => _chunkSize; // read-only access to _chunkSize for derived classes
+
+    protected int ChunkOverlap => _chunkOverlap; // read-only access to _chunkOverlap for derived classes
+
     public abstract List<string> SplitText(string text);
 
     /// <summary>
@@ -87,6 +91,7 @@ public List<Document> SplitDocuments(List<Document> documents)
     /// </summary>
     protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
     {
+        var separatorLen = _lengthFunction(separator);
         var docs = new List<string>(); // result of chunks
         var currentDoc = new List<string>(); // documents of current chunk
         int total = 0;
@@ -94,9 +99,9 @@ protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
         foreach (var split in splits)
         {
             int len = _lengthFunction(split);
-
+            
             // if we can't fit the next split into current chunk
-            if (total + len >= _chunkSize)
+            if (total + len + (currentDoc.Count>0?separatorLen:0)>= _chunkSize)
             {
                 // if the chunk is already was too big
                 if (total > _chunkSize)
@@ -116,9 +121,9 @@ protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
                     }
 
                     // start erasing docs from the beginning of the chunk until we can fit the next split
-                    while (total > _chunkOverlap || (total + len > _chunkSize && total > 0))
+                    while (total > _chunkOverlap || (total + len + (currentDoc.Count > 1 ? separatorLen : 0) > _chunkSize && total > 0))
                     {
-                        total -= _lengthFunction(currentDoc[0]);
+                        total -= _lengthFunction(currentDoc[0]) + (currentDoc.Count > 1 ? separatorLen : 0);
                         currentDoc.RemoveAt(0);
                     }
                 }
@@ -126,7 +131,7 @@ protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
 
             // add the next split to the current chunk
             currentDoc.Add(split);
-            total += len; // recalculate the total length of the current chunk
+            total += len + (currentDoc.Count > 1 ? separatorLen : 0); // recalculate the total length of the current chunk
         }
 
         // add the last chunk
diff --git a/src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs b/src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs
new file mode 100644
index 00000000..7e26f4bd
--- /dev/null
+++ b/src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs
@@ -0,0 +1,80 @@
+﻿using LangChain.Base;
+
+namespace LangChain.TextSplitters;
+
+/// <summary>
+/// Implementation of splitting text that looks at characters.
+/// Recursively tries to split by different characters to find one
+/// that works.
+/// </summary>
+public class RecursiveCharacterTextSplitter:TextSplitter
+{
+    private readonly List<string> _separators;
+
+    /// <summary>Creates a splitter that tries <paramref name="separators"/> in list order.</summary>
+    /// <param name="separators">Separators to try; null or an empty list selects "\n\n", "\n", " ", "".</param>
+    public RecursiveCharacterTextSplitter(List<string>? separators=null, int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null) : base(chunkSize, chunkOverlap, lengthFunction)
+    {
+        _separators = separators != null && separators.Count > 0 ? separators : new List<string> { "\n\n", "\n", " ", "" };
+    }
+
+    /// <summary>
+    /// Splits the text on the first configured separator it contains, merges pieces shorter than
+    /// the chunk size with MergeSplits and re-splits longer pieces with the remaining separators.
+    /// </summary>
+    /// <param name="text">Text to split.</param>
+    /// <returns>The resulting chunks, in the order they occur in the text.</returns>
+    public override List<string> SplitText(string text) => SplitFrom(text, 0);
+
+    // Recursive worker: only separators at position `start` or later are tried. Every nested call
+    // starts one position further, so the recursion always ends; re-scanning the whole list on
+    // each call would recurse forever on a piece that no separator can make smaller.
+    private List<string> SplitFrom(string text, int start)
+    {
+        var finalChunks = new List<string>();
+        // first separator present in the text ("" always matches); otherwise fall back to the last one
+        int index = _separators.Count - 1;
+        for (int i = start; i < _separators.Count; i++)
+        {
+            if (_separators[i].Length == 0 || text.Contains(_separators[i]))
+            {
+                index = i;
+                break;
+            }
+        }
+
+        string separator = _separators[index];
+        List<string> splits = separator.Length != 0
+            ? text.Split(new string[] { separator }, StringSplitOptions.None).ToList()
+            : text.Select(c => c.ToString()).ToList();
+
+        var goodSplits = new List<string>();
+        foreach (string s in splits)
+        {
+            // NOTE: this compares the raw character count, not a custom length function, with the chunk size
+            if (s.Length < ChunkSize)
+            {
+                goodSplits.Add(s);
+                continue;
+            }
+
+            // flush the small pieces gathered so far so the chunks keep the order of the text
+            if (goodSplits.Count > 0)
+            {
+                finalChunks.AddRange(MergeSplits(goodSplits, separator));
+                goodSplits.Clear();
+            }
+
+            // an oversized piece is split further, or kept whole when no finer separator is left
+            bool canSplitFurther = index + 1 < _separators.Count;
+            finalChunks.AddRange(canSplitFurther ? SplitFrom(s, index + 1) : new List<string> { s });
+        }
+
+        if (goodSplits.Count > 0)
+        {
+            finalChunks.AddRange(MergeSplits(goodSplits, separator));
+        }
+
+        return finalChunks;
+    }
+}
\ No newline at end of file
diff --git a/src/tests/LangChain.Splitters.CSharp.UnitTests/TextSplitterTests.cs b/src/tests/LangChain.Splitters.CSharp.UnitTests/TextSplitterTests.cs
index 04479284..24b8c3e3 100644
--- a/src/tests/LangChain.Splitters.CSharp.UnitTests/TextSplitterTests.cs
+++ b/src/tests/LangChain.Splitters.CSharp.UnitTests/TextSplitterTests.cs
@@ -49,4 +49,28 @@ public void CharacterSplitterMetadataTest()
 
     }
 
+    [TestMethod]
+    public void RecursiveCharacterTextSplitterTest()
+    {
+        // based on https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter
+        // Splits the state_of_the_union_txt resource with chunkSize 100 / chunkOverlap 20 and checks the first two chunks.
+        var state_of_the_union_txt = H.Resources.state_of_the_union_txt.AsString();
+        var textSplitter = new RecursiveCharacterTextSplitter(chunkSize: 100, chunkOverlap: 20);
+
+        // the whole text is passed as a single-element list; the result is indexed below via PageContent
+        var texts = textSplitter.CreateDocuments(new List<string>() { state_of_the_union_txt });
+
+        // expected text of the first chunk (texts[0])
+        var expected1 =
+            "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and";
+        var actual1 = texts[0].PageContent;
+        Assert.AreEqual(expected1,actual1);
+        // the second chunk is expected to start with "of Congress and", the words that end the first chunk
+        var expected2 = "of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.";
+        var actual2 = texts[1].PageContent;
+        Assert.AreEqual(expected2,actual2);
+
+
+    }
+
 }
\ No newline at end of file