-
-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Added RecursiveCharacterTextSplitter. Closes #36
* fix for LlmChain OutputKey. The result of CallAsync always contained only the ["text"] key, ignoring OutputKey. This caused an error when used with SequentialChain * implemented verbosity in LLMChain. Implemented ToString for the Message class. * ported CharacterTextSplitter - had to create and use the TextSplitterBase class and Document class to match the structure of the original Python implementation - I have not implemented methods which rely on an external source of information (from_huggingface_tokenizer, from_tiktoken_encoder) - added comments which explain the logic behind the implementation. This should help new people understand what it does (the original Python implementation is kind of hard to understand in places) * changed LengthFunctionDelegate to Func<string, int> * Revert "changed LengthFunctionDelegate to Func<string, int>" This reverts commit 9c340d4. * changed LengthFunctionDelegate to Func<string, int> * ported RecursiveCharacterTextSplitter added tests --------- Co-authored-by: Konstantin S <havendv@gmail.com>
- Loading branch information
Showing
3 changed files
with
114 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
80 changes: 80 additions & 0 deletions
80
src/libs/LangChain.Core/TextSplitters/RecursiveCharacterTextSplitter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
using LangChain.Base; | ||
|
||
namespace LangChain.TextSplitters; | ||
|
||
/// <summary>
/// Implementation of splitting text that looks at characters.
/// Recursively tries to split by different characters to find one
/// that works: the first configured separator that occurs in the text is used
/// to cut it, and every piece that is still too large is split again.
/// </summary>
public class RecursiveCharacterTextSplitter:TextSplitter
{
    private readonly List<string> _separators;

    // Measures a piece when deciding whether it needs further splitting.
    // Falls back to the character count when the caller supplied no function.
    private readonly Func<string, int> _measureLength;

    /// <summary>
    /// Creates a splitter that tries <paramref name="separators"/> in order.
    /// </summary>
    /// <param name="separators">
    /// Separators in priority order. An empty string means "split into individual characters".
    /// Defaults to paragraph break, line break, space, then individual characters.
    /// </param>
    /// <param name="chunkSize">Size limit for a piece; forwarded to the base class.</param>
    /// <param name="chunkOverlap">Forwarded to the base class unchanged.</param>
    /// <param name="lengthFunction">
    /// Optional function used to measure a piece; forwarded to the base class and also
    /// used here for the "is this piece small enough" check.
    /// </param>
    public RecursiveCharacterTextSplitter(List<string>? separators=null, int chunkSize = 4000, int chunkOverlap = 200, Func<string, int>? lengthFunction = null) : base(chunkSize, chunkOverlap, lengthFunction)
    {
        _separators = separators ?? new List<string> { "\n\n", "\n", " ", "" };
        _measureLength = lengthFunction ?? (s => s.Length);
    }

    /// <summary>
    /// Splits <paramref name="text"/> using the first applicable separator and recursively
    /// re-splits every piece whose measured length is not below the chunk size.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>
    /// The resulting chunks in document order. A piece that none of the separators can
    /// shorten is returned as-is, even if it exceeds the chunk size.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public override List<string> SplitText(string text)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        string separator = ChooseSeparator(text);

        List<string> splits = separator.Length != 0
            ? text.Split(new string[] { separator }, StringSplitOptions.None).ToList()
            : SplitIntoCharacters(text);

        List<string> finalChunks = new List<string>();
        List<string> goodSplits = new List<string>();

        foreach (string s in splits)
        {
            if (_measureLength(s) < base.ChunkSize)
            {
                // Small enough: collect it so it can be merged with its neighbours.
                goodSplits.Add(s);
                continue;
            }

            // Flush the pieces collected so far before handling the oversized one,
            // so the output keeps the original order.
            if (goodSplits.Count > 0)
            {
                finalChunks.AddRange(MergeSplits(goodSplits, separator));
                goodSplits.Clear();
            }

            if (s.Length == text.Length)
            {
                // The separator did not shorten the text at all (s is the whole input).
                // Recursing would pick the same separator again and never terminate,
                // so emit the piece unchanged.
                finalChunks.Add(s);
            }
            else
            {
                finalChunks.AddRange(SplitText(s));
            }
        }

        if (goodSplits.Count > 0)
        {
            finalChunks.AddRange(MergeSplits(goodSplits, separator));
        }

        return finalChunks;
    }

    /// <summary>
    /// Returns the first separator that is empty or occurs in <paramref name="text"/>;
    /// otherwise the last configured separator, or an empty string when none are configured.
    /// </summary>
    private string ChooseSeparator(string text)
    {
        foreach (string candidate in _separators)
        {
            if (candidate.Length == 0 || text.Contains(candidate))
            {
                return candidate;
            }
        }

        return _separators.Count > 0 ? _separators[_separators.Count - 1] : string.Empty;
    }

    /// <summary>
    /// Splits <paramref name="text"/> into one string per character, keeping UTF-16
    /// surrogate pairs together so no piece contains half of a code point.
    /// </summary>
    private static List<string> SplitIntoCharacters(string text)
    {
        List<string> characters = new List<string>(text.Length);

        for (int i = 0; i < text.Length; i++)
        {
            if (char.IsSurrogatePair(text, i))
            {
                characters.Add(text.Substring(i, 2));
                i++; // skip the low surrogate that was just consumed
            }
            else
            {
                characters.Add(text[i].ToString());
            }
        }

        return characters;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters