Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RecursiveCharacterTextSplitter #36

Merged
merged 10 commits into from
Oct 18, 2023
15 changes: 10 additions & 5 deletions src/libs/LangChain.Core/Base/TextSplitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func<string
_lengthFunction = lengthFunction ?? new Func<string, int>((str) => str.Length);
}

/// <summary>
/// Gives derived splitters read-only access to the <c>_chunkSize</c> field,
/// the limit that <c>MergeSplits</c> compares accumulated chunk lengths against.
/// </summary>
protected int ChunkSize
{
    get { return _chunkSize; }
}

/// <summary>
/// Gives derived splitters read-only access to the <c>_chunkOverlap</c> field,
/// which <c>MergeSplits</c> uses when trimming the start of a chunk before
/// beginning the next one.
/// </summary>
protected int ChunkOverlap
{
    get { return _chunkOverlap; }
}

/// <summary>
/// Splits <paramref name="text"/> into a list of strings.
/// The splitting strategy is supplied by each derived class.
/// </summary>
/// <param name="text">The text to split.</param>
/// <returns>The pieces produced from <paramref name="text"/>.</returns>
public abstract List<string> SplitText(string text);

/// <summary>
Expand Down Expand Up @@ -87,16 +91,17 @@ public List<Document> SplitDocuments(List<Document> documents)
/// </summary>
protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
{
var separatorLen = _lengthFunction(separator);
var docs = new List<string>(); // result of chunks
var currentDoc = new List<string>(); // documents of current chunk
int total = 0;

foreach (var split in splits)
{
int len = _lengthFunction(split);

// if we can't fit the next split into current chunk
if (total + len >= _chunkSize)
if (total + len + (currentDoc.Count>0?separatorLen:0)>= _chunkSize)
{
// if the chunk was already too big
if (total > _chunkSize)
Expand All @@ -116,17 +121,17 @@ protected List<string> MergeSplits(IEnumerable<string> splits, string separator)
}

// start erasing docs from the beginning of the chunk until we can fit the next split
while (total > _chunkOverlap || (total + len > _chunkSize && total > 0))
while (total > _chunkOverlap || (total + len + (currentDoc.Count > 1 ? separatorLen : 0) > _chunkSize && total > 0))
{
total -= _lengthFunction(currentDoc[0]);
total -= _lengthFunction(currentDoc[0]) + (currentDoc.Count > 1 ? separatorLen : 0);
currentDoc.RemoveAt(0);
}
}
}

// add the next split to the current chunk
currentDoc.Add(split);
total += len; // recalculate the total length of the current chunk
total += len + (currentDoc.Count > 1 ? separatorLen : 0); // recalculate the total length of the current chunk
}

// add the last chunk
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
using LangChain.Base;

namespace LangChain.TextSplitters;

/// <summary>
/// Splits text by looking at characters. Separators are tried in order:
/// the first one that occurs in the text is used to cut it, and any piece
/// that is still too large is split again with the separators that follow.
/// </summary>
public class RecursiveCharacterTextSplitter : TextSplitter
{
    private readonly List<string> _separators;

    /// <summary>
    /// Creates a recursive character splitter.
    /// </summary>
    /// <param name="separators">
    /// Separators to try, in priority order. An empty string means
    /// "split into single characters". When <c>null</c>, the list
    /// <c>"\n\n"</c>, <c>"\n"</c>, <c>" "</c>, <c>""</c> is used.
    /// </param>
    /// <param name="chunkSize">Forwarded to the base <see cref="TextSplitter"/> constructor.</param>
    /// <param name="chunkOverlap">Forwarded to the base <see cref="TextSplitter"/> constructor.</param>
    /// <param name="lengthFunction">Forwarded to the base <see cref="TextSplitter"/> constructor.</param>
    public RecursiveCharacterTextSplitter(List<string>? separators = null, int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null)
        : base(chunkSize, chunkOverlap, lengthFunction)
    {
        _separators = separators ?? new List<string> { "\n\n", "\n", " ", "" };
    }

    /// <summary>
    /// Splits <paramref name="text"/> into chunks using the configured separators.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>
    /// The resulting chunks. A piece that is too large and cannot be cut by any
    /// remaining separator is returned as a chunk of its own.
    /// </returns>
    /// <exception cref="InvalidOperationException">The separator list is empty.</exception>
    public override List<string> SplitText(string text)
    {
        return SplitWithSeparators(text, _separators);
    }

    /// <summary>
    /// Splits <paramref name="text"/> with the first usable entry of
    /// <paramref name="separators"/> and recurses on oversize pieces with the
    /// entries that come after it.
    /// </summary>
    private List<string> SplitWithSeparators(string text, List<string> separators)
    {
        var finalChunks = new List<string>();

        // Fall back to the last separator when none of them occurs in the text.
        string separator = separators.Last();

        // Separators that can still cut a piece produced by this level.
        var remainingSeparators = new List<string>();

        for (int i = 0; i < separators.Count; i++)
        {
            string candidate = separators[i];

            if (candidate.Length == 0)
            {
                // "" always matches and yields single characters; nothing finer
                // exists, so the remaining list stays empty.
                separator = candidate;
                break;
            }

            if (text.Contains(candidate))
            {
                separator = candidate;
                remainingSeparators = separators.GetRange(i + 1, separators.Count - i - 1);
                break;
            }
        }

        List<string> splits;
        if (separator.Length != 0)
        {
            splits = text.Split(new string[] { separator }, StringSplitOptions.None).ToList();
        }
        else
        {
            splits = text.ToCharArray().Select(c => c.ToString()).ToList();
        }

        // Pieces small enough to be merged back together into chunks.
        var goodSplits = new List<string>();

        foreach (string s in splits)
        {
            // NOTE(review): this compares the raw character count, while MergeSplits
            // measures with the configured length function - confirm this is intended.
            if (s.Length < ChunkSize)
            {
                goodSplits.Add(s);
                continue;
            }

            // Flush what has been collected so far so that chunk order is preserved.
            if (goodSplits.Any())
            {
                finalChunks.AddRange(MergeSplits(goodSplits, separator));
                goodSplits.Clear();
            }

            if (remainingSeparators.Count == 0)
            {
                // No separator left that could cut this piece. Recursing would
                // receive the identical input and never terminate, so the piece
                // is emitted unchanged.
                finalChunks.Add(s);
            }
            else
            {
                finalChunks.AddRange(SplitWithSeparators(s, remainingSeparators));
            }
        }

        if (goodSplits.Any())
        {
            finalChunks.AddRange(MergeSplits(goodSplits, separator));
        }

        return finalChunks;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,28 @@ public void CharacterSplitterMetadataTest()

}

[TestMethod]
public void RecursiveCharacterTextSplitterTest()
{
    // Reproduces the example from
    // https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter
    // and checks the first two chunks produced with chunk size 100 and overlap 20.
    var sourceText = H.Resources.state_of_the_union_txt.AsString();
    var splitter = new RecursiveCharacterTextSplitter(chunkSize: 100, chunkOverlap: 20);

    var documents = splitter.CreateDocuments(new List<string> { sourceText });

    // First chunk: opening of the speech, cut at the size limit.
    Assert.AreEqual(
        "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and",
        documents[0].PageContent);

    // Second chunk: starts with the overlapping tail of the first chunk.
    Assert.AreEqual(
        "of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.",
        documents[1].PageContent);
}

}
Loading