From d055c7e187f355d54ff1407e356d3f92de67bffa Mon Sep 17 00:00:00 2001 From: TesAnti <8780022+TesAnti@users.noreply.github.com> Date: Wed, 18 Oct 2023 22:58:08 +0300 Subject: [PATCH] feat: Added RecursiveCharacterTextSplitter. Closes #36 * fix for LlmChain OutputKey. The result of CallAsync always contained only the ["text"] key, ignoring OutputKey. This caused an error when used with SequentialChain. * implemented verbosity in LLMChain. Implemented ToString for the Message class. * ported CharacterTextSplitter - had to create and use the TextSplitterBase class and the Document class to match the structure of the original Python implementation - I have not implemented the methods that rely on an external source of information (from_huggingface_tokenizer, from_tiktoken_encoder) - added comments explaining the logic behind the implementation. They should help newcomers understand what it does (the original Python implementation is hard to follow in places) * changed LengthFunctionDelegate to Func * Revert "changed LengthFunctionDelegate to Func" This reverts commit 9c340d4559b65bfae1dbdc2a490478700d5291c4. 
* changed LengthFunctionDelegate to Func * ported RecursiveCharacterTextSplitter added tests --------- Co-authored-by: Konstantin S --- src/libs/LangChain.Core/Base/TextSplitter.cs | 15 ++-- .../RecursiveCharacterTextSplitter.cs | 80 +++++++++++++++++++ .../TextSplitterTests.cs | 24 ++++++ 3 files changed, 114 insertions(+), 5 deletions(-) create mode 100644 src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs diff --git a/src/libs/LangChain.Core/Base/TextSplitter.cs b/src/libs/LangChain.Core/Base/TextSplitter.cs index 6cd7884a..0e0d1400 100644 --- a/src/libs/LangChain.Core/Base/TextSplitter.cs +++ b/src/libs/LangChain.Core/Base/TextSplitter.cs @@ -29,6 +29,10 @@ protected TextSplitter(int chunkSize = 4000, int chunkOverlap = 200, Func((str) => str.Length); } + protected int ChunkSize => _chunkSize; + + protected int ChunkOverlap => _chunkOverlap; + public abstract List SplitText(string text); /// @@ -87,6 +91,7 @@ public List SplitDocuments(List documents) /// protected List MergeSplits(IEnumerable splits, string separator) { + var separatorLen = _lengthFunction(separator); var docs = new List(); // result of chunks var currentDoc = new List(); // documents of current chunk int total = 0; @@ -94,9 +99,9 @@ protected List MergeSplits(IEnumerable splits, string separator) foreach (var split in splits) { int len = _lengthFunction(split); - + // if we can't fit the next split into current chunk - if (total + len >= _chunkSize) + if (total + len + (currentDoc.Count>0?separatorLen:0)>= _chunkSize) { // if the chunk is already was too big if (total > _chunkSize) @@ -116,9 +121,9 @@ protected List MergeSplits(IEnumerable splits, string separator) } // start erasing docs from the beginning of the chunk until we can fit the next split - while (total > _chunkOverlap || (total + len > _chunkSize && total > 0)) + while (total > _chunkOverlap || (total + len + (currentDoc.Count > 1 ? 
separatorLen : 0) > _chunkSize && total > 0)) { - total -= _lengthFunction(currentDoc[0]); + total -= _lengthFunction(currentDoc[0]) + (currentDoc.Count > 1 ? separatorLen : 0); currentDoc.RemoveAt(0); } } @@ -126,7 +131,7 @@ protected List MergeSplits(IEnumerable splits, string separator) // add the next split to the current chunk currentDoc.Add(split); - total += len; // recalculate the total length of the current chunk + total += len + (currentDoc.Count > 1 ? separatorLen : 0); // recalculate the total length of the current chunk } // add the last chunk diff --git a/src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs b/src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs new file mode 100644 index 00000000..7e26f4bd --- /dev/null +++ b/src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs @@ -0,0 +1,80 @@ +using LangChain.Base; + +namespace LangChain.TextSplitters; + +/// +/// Implementation of splitting text that looks at characters. +/// Recursively tries to split by different characters to find one +/// that works. +/// +public class RecursiveCharacterTextSplitter:TextSplitter +{ + private readonly List _separators; + + public RecursiveCharacterTextSplitter(List? separators=null, int chunkSize = 4000, int chunkOverlap = 200, Func? lengthFunction = null) : base(chunkSize, chunkOverlap, lengthFunction) + { + _separators = separators ?? 
new List { "\n\n", "\n", " ", "" }; + } + + public override List SplitText(string text) + { + List finalChunks = new List(); + string separator = _separators.Last(); + + foreach (string _s in _separators) + { + if (_s.Length == 0) + { + separator = _s; + break; + } + + if (text.Contains(_s)) + { + separator = _s; + break; + } + } + + List splits; + if (separator.Length!=0) + { + splits = text.Split(new string[] {separator}, StringSplitOptions.None).ToList(); + } + else + { + splits = text.ToCharArray().Select(c => c.ToString()).ToList(); + } + + + List goodSplits = new List(); + + foreach (string s in splits) + { + if (s.Length < base.ChunkSize) + { + goodSplits.Add(s); + } + else + { + if (goodSplits.Any()) + { + List mergedText = MergeSplits(goodSplits, separator); + finalChunks.AddRange(mergedText); + goodSplits.Clear(); + } + + List otherInfo = SplitText(s); + finalChunks.AddRange(otherInfo); + } + } + + if (goodSplits.Any()) + { + List mergedText = MergeSplits(goodSplits, separator); + finalChunks.AddRange(mergedText); + } + + return finalChunks; + } +} \ No newline at end of file diff --git a/src/tests/LangChain.Splitters.CSharp.UnitTests/TextSplitterTests.cs b/src/tests/LangChain.Splitters.CSharp.UnitTests/TextSplitterTests.cs index 04479284..24b8c3e3 100644 --- a/src/tests/LangChain.Splitters.CSharp.UnitTests/TextSplitterTests.cs +++ b/src/tests/LangChain.Splitters.CSharp.UnitTests/TextSplitterTests.cs @@ -49,4 +49,28 @@ public void CharacterSplitterMetadataTest() } + [TestMethod] + public void RecursiveCharacterTextSplitterTest() + { + // based on https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter + + var state_of_the_union_txt = H.Resources.state_of_the_union_txt.AsString(); + var textSplitter = new RecursiveCharacterTextSplitter(chunkSize: 100, chunkOverlap: 20); + + + var texts = textSplitter.CreateDocuments(new List() { state_of_the_union_txt }); + + + var expected1 = + "Madam Speaker, 
Madam Vice President, our First Lady and Second Gentleman. Members of Congress and"; + var actual1 = texts[0].PageContent; + Assert.AreEqual(expected1,actual1); + + var expected2 = "of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans."; + var actual2 = texts[1].PageContent; + Assert.AreEqual(expected2,actual2); + + + } + } \ No newline at end of file