Skip to content

Commit

Permalink
breakup main chunkit file into modules
Browse files Browse the repository at this point in the history
  • Loading branch information
jparkerweb committed Sep 24, 2024
1 parent ee8b818 commit a4180cc
Show file tree
Hide file tree
Showing 7 changed files with 282 additions and 319 deletions.
100 changes: 100 additions & 0 deletions chunkingUtils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import { tokenizer } from './embeddingUtils.js';
import { cosineSimilarity } from './similarityUtils.js';
import { createEmbedding } from './embeddingUtils.js';

// -----------------------------------------------------------
// -- Function to create chunks of text based on similarity --
// -----------------------------------------------------------
/**
 * Group consecutive sentences into space-joined chunks.
 *
 * A sentence is appended to the chunk being built when both hold:
 *   - the token count of the current chunk plus the token count of the
 *     sentence does not exceed `maxTokenSize`, and
 *   - `similarities` is falsy, or `similarities[i - 1]` is at least
 *     `similarityThreshold` (where `i` is the sentence's index).
 * Otherwise the current chunk is closed and the sentence starts a new one.
 *
 * Token counts are read from `tokenizer(text).input_ids.size`.
 *
 * @param {string[]} sentences - Sentences in document order.
 * @param {number[]|null|undefined} similarities - Optional scores; entry
 *     `i - 1` gates appending `sentences[i]`. When falsy only the token
 *     budget is applied.
 * @param {number} maxTokenSize - Upper bound used in the size check.
 * @param {number} similarityThreshold - Minimum score needed to keep extending a chunk.
 * @param {boolean} logging - When truthy, progress is written with console.log.
 * @returns {string[]} The chunks; an empty array when there are no sentences.
 */
export function createChunks(sentences, similarities, maxTokenSize, similarityThreshold, logging) {
    // No sentences means no chunks. Without this guard the first chunk would be
    // seeded with `undefined` and the result would be a single "" chunk.
    if (!sentences || sentences.length === 0) {
        return [];
    }

    const chunks = [];
    let currentChunk = [sentences[0]];

    if (logging) { console.log(`!! new chunk !! --> 1`); }

    for (let i = 1; i < sentences.length; i++) {
        // The joined chunk is re-tokenized on purpose: the tokenizer's count for
        // the joined text need not equal the sum of per-sentence counts.
        const currentChunkSize = tokenizer(currentChunk.join(" ")).input_ids.size;
        const sentenceTokenCount = tokenizer(sentences[i]).input_ids.size;

        if (logging) {
            console.log('sentenceTokenCount', sentenceTokenCount);
            console.log('currentChunkSize', currentChunkSize);
            console.log('maxTokenSize', maxTokenSize);
            if (similarities) {
                console.log('similarity', similarities[i - 1]);
                console.log('similarityThreshold', similarityThreshold);
            }
        }

        const fitsInChunk = currentChunkSize + sentenceTokenCount <= maxTokenSize;
        // With no similarity data the similarity gate is always open.
        const similarEnough = !similarities || similarities[i - 1] >= similarityThreshold;

        if (similarEnough && fitsInChunk) {
            currentChunk.push(sentences[i]);
            if (logging) { console.log('keep going...'); }
        } else {
            chunks.push(currentChunk.join(" "));
            currentChunk = [sentences[i]];
            if (logging) {
                console.log('stop...');
                console.log('\n');
                console.log(`!! new chunk !! --> ${chunks.length + 1}`);
            }
        }
    }

    // Flush the chunk still being built, unless it starts with an empty sentence.
    if (currentChunk.length > 0 && currentChunk[0] !== "") {
        chunks.push(currentChunk.join(" "));
    }

    return chunks;
}

// --------------------------------------------------------------
// -- Optimize and Rebalance Chunks (optionally use Similarity) --
// --------------------------------------------------------------
/**
 * Merge adjacent chunks when they fit the token budget and are similar enough.
 *
 * Chunks are visited in order. A chunk is appended (space-separated) to the
 * chunk being built when the running token count plus the chunk's token count
 * does not exceed `maxTokenSize` and the cosine similarity between the stored
 * embedding and the chunk's embedding is at least
 * `combineChunksSimilarityThreshold`. After a merge the stored embedding is
 * that of the chunk just appended, not of the merged text. Otherwise the chunk
 * being built is emitted and the visited chunk starts a new one.
 *
 * @param {string[]} combinedChunks - Chunks in document order.
 * @param {Function} tokenizer - Called as `tokenizer(text)`; the token count is
 *     read from `.input_ids.size` of the result.
 * @param {number} maxTokenSize - Upper bound used in the size check.
 * @param {number} [combineChunksSimilarityThreshold=0.5] - Minimum similarity needed to merge.
 * @returns {Promise<string[]>} The rebalanced chunks with falsy entries removed.
 */
export async function optimizeAndRebalanceChunks(combinedChunks, tokenizer, maxTokenSize, combineChunksSimilarityThreshold = 0.5) {
    const optimizedChunks = [];
    let currentChunkText = "";
    let currentChunkTokenCount = 0;
    let currentEmbedding = null;

    for (const chunk of combinedChunks) {
        const chunkTokenCount = tokenizer(chunk).input_ids.size;
        // Filled in only if the merge test below runs, so the embedding can be
        // reused instead of being computed a second time for the same text.
        let chunkEmbedding;

        if (currentChunkText && (currentChunkTokenCount + chunkTokenCount <= maxTokenSize)) {
            chunkEmbedding = await createEmbedding(chunk);
            const similarity = currentEmbedding ? cosineSimilarity(currentEmbedding, chunkEmbedding) : 0;

            if (similarity >= combineChunksSimilarityThreshold) {
                currentChunkText += " " + chunk;
                currentChunkTokenCount += chunkTokenCount;
                currentEmbedding = chunkEmbedding;
                continue;
            }
        }

        // No merge: emit what has been built and start over from this chunk.
        if (currentChunkText) optimizedChunks.push(currentChunkText);
        currentChunkText = chunk;
        currentChunkTokenCount = chunkTokenCount;
        if (chunkEmbedding === undefined) {
            chunkEmbedding = await createEmbedding(chunk);
        }
        currentEmbedding = chunkEmbedding;
    }

    if (currentChunkText) optimizedChunks.push(currentChunkText);

    return optimizedChunks.filter(chunk => chunk);
}
Loading

0 comments on commit a4180cc

Please sign in to comment.