Skip to content

Commit

Permalink
feat: Added RecursiveCharacterTextSplitter. Closes #36
Browse files Browse the repository at this point in the history
* fix for LlmChain OutputKey. The result of CallAsync always contained only the ["text"] key, ignoring OutputKey. This caused an error when used with SequentialChain

* implemented verbosity at LLMChain. Implemented ToString for the Message class.

* ported CharacterTextSplitter
- had to create and use TextSplitterBase class and Document class to match structure of original python implementation
- I have not implemented the methods that rely on an external source of information (from_huggingface_tokenizer, from_tiktoken_encoder)
- added comments explaining the logic behind the implementation. They should help newcomers understand what it does (the original Python implementation is hard to follow in places)

* changed LengthFunctionDelegate to Func<string, int>

* Revert "changed LengthFunctionDelegate to Func<string, int>"

This reverts commit 9c340d4.

* changed LengthFunctionDelegate to Func<string, int>

* ported RecursiveCharacterTextSplitter
added tests

---------

Co-authored-by: Konstantin S <havendv@gmail.com>
  • Loading branch information
TesAnti and HavenDV authored Oct 18, 2023
1 parent 6248d77 commit d055c7e
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 5 deletions.
15 changes: 10 additions & 5 deletions src/libs/LangChain.Core/Base/TextSplitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string
_lengthFunction = lengthFunction ?? new Func<string, int>((str) => str.Length);
}

/// <summary>
/// Gets the value of the <c>_chunkSize</c> field so that derived splitters can read it.
/// </summary>
protected int ChunkSize => _chunkSize;

/// <summary>
/// Gets the value of the <c>_chunkOverlap</c> field so that derived splitters can read it.
/// </summary>
protected int ChunkOverlap => _chunkOverlap;

/// <summary>
/// Splits <paramref name="text"/> into a list of strings. Each concrete splitter supplies the implementation.
/// </summary>
/// <param name="text">The text to split.</param>
/// <returns>The pieces produced from <paramref name="text"/>.</returns>
public abstract List<string> SplitText(string text);

/// <summary>
Expand Down Expand Up @@ -87,16 +91,17 @@ public List<Document> SplitDocuments(List<Document> documents)
/// </summary>
protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
{
var separatorLen = _lengthFunction(separator);
var docs = new List<string>(); // result of chunks
var currentDoc = new List<string>(); // documents of current chunk
int total = 0;

foreach (var split in splits)
{
int len = _lengthFunction(split);

// if we can't fit the next split into current chunk
if (total + len >= _chunkSize)
if (total + len + (currentDoc.Count>0?separatorLen:0)>= _chunkSize)
{
// if the chunk is already was too big
if (total > _chunkSize)
Expand All @@ -116,17 +121,17 @@ protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
}

// start erasing docs from the beginning of the chunk until we can fit the next split
while (total > _chunkOverlap || (total + len > _chunkSize && total > 0))
while (total > _chunkOverlap || (total + len + (currentDoc.Count > 1 ? separatorLen : 0) > _chunkSize && total > 0))
{
total -= _lengthFunction(currentDoc[0]);
total -= _lengthFunction(currentDoc[0]) + (currentDoc.Count > 1 ? separatorLen : 0);
currentDoc.RemoveAt(0);
}
}
}

// add the next split to the current chunk
currentDoc.Add(split);
total += len; // recalculate the total length of the current chunk
total += len + (currentDoc.Count > 1 ? separatorLen : 0); // recalculate the total length of the current chunk
}

// add the last chunk
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
using LangChain.Base;

namespace LangChain.TextSplitters;

/// <summary>
/// Implementation of splitting text that looks at characters.
/// Recursively tries to split by different characters to find one
/// that works.
/// </summary>
/// <remarks>
/// Separators are tried in the order they were supplied; the first one that occurs in the
/// text is used. An empty-string separator means "split into individual characters".
/// Pieces are compared against <c>ChunkSize</c> using their character count (<c>string.Length</c>).
/// </remarks>
public class RecursiveCharacterTextSplitter:TextSplitter
{
    private readonly List<string> _separators;

    /// <summary>
    /// Creates a splitter with an ordered list of candidate separators.
    /// </summary>
    /// <param name="separators">
    /// Candidate separators, tried in order. When <c>null</c>, the list
    /// <c>"\n\n"</c>, <c>"\n"</c>, <c>" "</c>, <c>""</c> is used.
    /// </param>
    /// <param name="chunkSize">Forwarded to the base class constructor.</param>
    /// <param name="chunkOverlap">Forwarded to the base class constructor.</param>
    /// <param name="lengthFunction">Forwarded to the base class constructor.</param>
    public RecursiveCharacterTextSplitter(List<string>? separators=null, int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null) : base(chunkSize, chunkOverlap, lengthFunction)
    {
        _separators = separators ?? new List<string> { "\n\n", "\n", " ", "" };
    }

    /// <summary>
    /// Splits <paramref name="text"/> on the first applicable separator, merges the pieces that
    /// are shorter than <c>ChunkSize</c> via <c>MergeSplits</c>, and recursively splits the
    /// pieces that are not.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>The resulting chunks, in their original order.</returns>
    public override List<string> SplitText(string text)
    {
        var finalChunks = new List<string>();
        string separator = ChooseSeparator(text);

        List<string> splits;
        if (separator.Length != 0)
        {
            splits = text.Split(new string[] { separator }, StringSplitOptions.None).ToList();
        }
        else
        {
            // An empty separator means "split into single characters".
            splits = text.ToCharArray().Select(c => c.ToString()).ToList();
        }

        // If the separator left the text in one piece, that piece IS the input. Recursing on it
        // would pick the same separator again and never terminate, so it must be emitted as-is.
        bool canSplitFurther = splits.Count > 1;

        var goodSplits = new List<string>();

        foreach (string s in splits)
        {
            if (s.Length < ChunkSize)
            {
                goodSplits.Add(s);
                continue;
            }

            // Flush the small pieces collected so far to keep the chunks in document order.
            if (goodSplits.Count > 0)
            {
                finalChunks.AddRange(MergeSplits(goodSplits, separator));
                goodSplits.Clear();
            }

            if (canSplitFurther)
            {
                // The piece is strictly shorter than the input, so the recursion makes progress.
                finalChunks.AddRange(SplitText(s));
            }
            else
            {
                // No configured separator can break this piece down any further.
                finalChunks.Add(s);
            }
        }

        if (goodSplits.Count > 0)
        {
            finalChunks.AddRange(MergeSplits(goodSplits, separator));
        }

        return finalChunks;
    }

    /// <summary>
    /// Returns the first separator that is empty or occurs in <paramref name="text"/>;
    /// falls back to the last configured separator when none matches.
    /// </summary>
    private string ChooseSeparator(string text)
    {
        foreach (string candidate in _separators)
        {
            if (candidate.Length == 0 || text.Contains(candidate))
            {
                return candidate;
            }
        }

        return _separators.Last();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,28 @@ public void CharacterSplitterMetadataTest()

}

[TestMethod]
public void RecursiveCharacterTextSplitterTest()
{
    // Mirrors the example from the Python documentation:
    // https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter

    // Arrange
    string sourceText = H.Resources.state_of_the_union_txt.AsString();
    var splitter = new RecursiveCharacterTextSplitter(chunkSize: 100, chunkOverlap: 20);

    // Act
    var documents = splitter.CreateDocuments(new List<string> { sourceText });

    // Assert: the first two chunks, where the second starts with the tail of the first.
    Assert.AreEqual(
        "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and",
        documents[0].PageContent);
    Assert.AreEqual(
        "of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.",
        documents[1].PageContent);
}

}

0 comments on commit d055c7e

Please sign in to comment.