
Commit

add pre-download models tool
jparkerweb committed Jul 12, 2024
1 parent 07c603f commit a7cec77
Showing 5 changed files with 140 additions and 22 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -225,3 +225,11 @@ The behavior of the `chunkit` function can be finely tuned using several optiona
- **Type**: Boolean
- **Default**: `true`
- **Description**: Indicates whether to use a quantized version of the specified model. Quantized models generally offer faster performance with a slight trade-off in accuracy, which can be beneficial when processing very large datasets.

---

## 💾 Pre-Downloading Models

Fill out the `tools/download-models-list.json` file with a list of models you want pre-downloaded, noting for each whether it should be quantized (see the Curated ONNX Embedding Models section above for models to try).

[![Pre-Downloading Models walkthrough video](https://img.youtube.com/vi/JItZqsL3umY/0.jpg)](https://www.youtube.com/watch?v=JItZqsL3umY)
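Each entry in the list pairs a `modelName` with a `quantized` flag. As a quick sanity check before running the download script, the expected shape can be verified with a small sketch (the `validateModelList` helper below is illustrative, not part of the package):

```javascript
// Illustrative check that a model list matches the expected shape:
// an array of { modelName: string, quantized: boolean } entries.
const entries = [
    { modelName: "Xenova/all-MiniLM-L6-v2", quantized: true },
    { modelName: "BAAI/bge-small-en-v1.5", quantized: false },
];

function validateModelList(list) {
    return Array.isArray(list) && list.every(
        (e) => typeof e.modelName === "string" && typeof e.quantized === "boolean"
    );
}

console.log(validateModelList(entries)); // true
console.log(validateModelList([{ modelName: "Xenova/all-MiniLM-L6-v2" }])); // false
```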
97 changes: 77 additions & 20 deletions package-lock.json

Some generated files are not rendered by default.

6 changes: 4 additions & 2 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "semantic-chunking",
"version": "1.2.1",
"version": "1.3.0",
"description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).",
"repository": {
"type": "git",
@@ -23,11 +23,13 @@
"license": "ISC",
"scripts": {
"clean-models": "find ./models -type f ! -name '*.url' -delete",
"clean-models-win": "powershell -Command \"Get-ChildItem -Path ./models -Recurse | Where-Object { !$_.PSIsContainer -and $_.Extension -ne '.url' } | Remove-Item\""
"clean-models-win": "powershell -Command \"Get-ChildItem -Path ./models -Recurse | Where-Object { !$_.PSIsContainer -and $_.Extension -ne '.url' } | Remove-Item\"",
"download-models": "node ./tools/download-models.js"
},
"dependencies": {
"@stdlib/nlp-sentencize": "^0.2.1",
"@xenova/transformers": "^2.17.2",
"cli-progress": "^3.12.0",
"fs": "^0.0.1-security"
}
}
10 changes: 10 additions & 0 deletions tools/download-models-list.json
@@ -0,0 +1,10 @@
[
{
"modelName": "Xenova/all-MiniLM-L6-v2",
"quantized": true
},
{
"modelName": "BAAI/bge-small-en-v1.5",
"quantized": false
}
]
41 changes: 41 additions & 0 deletions tools/download-models.js
@@ -0,0 +1,41 @@
import { pipeline, env as transformerCache } from "@xenova/transformers";
import cliProgress from "cli-progress";
import fs from "fs/promises";

const data = await fs.readFile("./tools/download-models-list.json", "utf8");
const downloadModelsList = JSON.parse(data);

// cache downloaded models in the local "models" directory
transformerCache.cacheDir = "models";

console.log(`Downloading embedding models…`);

const bar = new cliProgress.SingleBar({
    clearOnComplete: false,
    hideCursor: true,
    format: "[{bar}] {value}% | {model}",
});

let started = false;

for (const model of downloadModelsList) {
    console.log(`model: ${model.modelName}, quantized: ${model.quantized}`);
    bar.start(100, 0, { model: model.modelName });

    await pipeline("feature-extraction", model.modelName, {
        quantized: model.quantized,
        progress_callback: (data) => {
            // ignore events until the actual file download begins
            started = started || data.status === "download";
            if (!started) return;

            if (data.progress) bar.update(Math.floor(data.progress));
        },
    });
    bar.update(100);
    bar.stop();
}

console.log("Success!");
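The `started` flag above keeps the progress bar quiet until the first real download event arrives from the `progress_callback`. That gating logic can be sketched in isolation with a stubbed callback (`makeProgressGate` is an illustrative helper, not part of the package):

```javascript
// Stand-alone sketch of the progress gating used in download-models.js:
// ignore callback events until a "download" status arrives, then forward
// whole-number progress values.
function makeProgressGate(onProgress) {
    let started = false;
    return (data) => {
        started = started || data.status === "download";
        if (!started) return;
        if (data.progress) onProgress(Math.floor(data.progress));
    };
}

const seen = [];
const gate = makeProgressGate((p) => seen.push(p));
gate({ status: "initiate" });                 // ignored: download not started yet
gate({ status: "download" });                 // arms the gate (no progress value yet)
gate({ status: "progress", progress: 42.7 }); // forwarded as 42
console.log(seen); // [42]
```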
