From a7cec77321e112b1d58badbdfbc89001e728be78 Mon Sep 17 00:00:00 2001 From: Justin Parker Date: Thu, 11 Jul 2024 23:37:14 -0400 Subject: [PATCH] add pre-download models tool --- README.md | 8 +++ package-lock.json | 97 ++++++++++++++++++++++++++------- package.json | 6 +- tools/download-models-list.json | 10 ++++ tools/download-models.js | 41 ++++++++++++++ 5 files changed, 140 insertions(+), 22 deletions(-) create mode 100644 tools/download-models-list.json create mode 100644 tools/download-models.js diff --git a/README.md b/README.md index 887d895..fe0770b 100644 --- a/README.md +++ b/README.md @@ -225,3 +225,11 @@ The behavior of the `chunkit` function can be finely tuned using several optiona - **Type**: Boolean - **Default**: `true` - **Description**: Indicates whether to use a quantized version of the specified model. Quantized models generally offer faster performance with a slight trade-off in accuracy, which can be beneficial when processing very large datasets. + +--- + +## 💾 Pre-Downloading Models + +Fill out the `tools/download-models.list.json` file with a list of models you want pre-downloaded, and if they are quantized or not (See the Curated ONNX Embedding Models section above for a list of models to try) + +[![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/JItZqsL3umY/0.jpg)](https://www.youtube.com/watch?v=JItZqsL3umY) \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 0898f0a..06afc87 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,16 +1,17 @@ { "name": "semantic-chunking", - "version": "1.2.1", + "version": "1.3.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "semantic-chunking", - "version": "1.2.1", + "version": "1.3.0", "license": "ISC", "dependencies": { "@stdlib/nlp-sentencize": "^0.2.1", "@xenova/transformers": "^2.17.2", + "cli-progress": "^3.12.0", "fs": "^0.0.1-security" } }, @@ -558,9 +559,9 @@ "integrity": "sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==" }, "node_modules/@types/node": { - "version": "20.14.2", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.2.tgz", - "integrity": "sha512-xyu6WAMVwv6AKFLB+e/7ySZVr/0zLCzOa7rSpq6jNwpqOrUbcACDWC+53d4n2QHOnDou0fbIsg8wZu/sxrnI4Q==", + "version": "20.14.10", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.10.tgz", + "integrity": "sha512-MdiXf+nDuMvY0gJKxyfZ7/6UFsETO7mGKF54MVD/ekJS6HdFtpZFBgrh6Pseu64XTb2MLyFPlbW6hj8HYRQNOQ==", "dependencies": { "undici-types": "~5.26.4" } @@ -578,15 +579,23 @@ "onnxruntime-node": "1.14.0" } }, + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "engines": { + "node": ">=8" + } + }, "node_modules/b4a": { "version": "1.6.6", "resolved": "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz", "integrity": "sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg==" }, "node_modules/bare-events": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.3.1.tgz", - "integrity": "sha512-sJnSOTVESURZ61XgEleqmP255T6zTYwHPwE4r6SssIh0U9/uDvfpdoJYpVUerJJZH2fueO+CdT8ZT+OC/7aZDA==", + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.4.2.tgz", + "integrity": "sha512-qMKFd2qG/36aA4GwvKq8MxnPgCQAmBWmSyLWsJcbn8v03wvIPQ/hG1Ms8bPzndZxMDoHpxez5VOS+gC9Yi24/Q==", "optional": true }, "node_modules/bare-fs": { @@ -601,9 +610,9 @@ } }, "node_modules/bare-os": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.3.0.tgz", - "integrity": "sha512-oPb8oMM1xZbhRQBngTgpcQ5gXw6kjOaRsSWsIeNyRxGed2w/ARyP7ScBYpWR1qfX2E5rS3gBw6OWcSQo+s+kUg==", + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-2.4.0.tgz", + "integrity": "sha512-v8DTT08AS/G0F9xrhyLtepoo9EJBJ85FRSMbu1pQUlAf6A8T0tEEQGMVObWeqpjhSPXsE0VGlluFBJu2fdoTNg==", "optional": true }, "node_modules/bare-path": { @@ -616,9 +625,9 @@ } }, "node_modules/bare-stream": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.0.1.tgz", - "integrity": "sha512-ubLyoDqPnUf5o0kSFp709HC0WRZuxVuh4pbte5eY95Xvx5bdvz07c2JFmXBfqqe60q+9PJ8S4X5GRvmcNSKMxg==", + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.1.3.tgz", + "integrity": "sha512-tiDAH9H/kP+tvNO5sczyn9ZAA7utrSMobyDchsnyyXBuUe2FSQWbxhtuHB8jwpHYYevVo2UJpcmvvjrbHboUUQ==", "optional": true, "dependencies": { "streamx": "^2.18.0" @@ -681,6 +690,17 @@ "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==" }, + "node_modules/cli-progress": { + "version": "3.12.0", + "resolved": "https://registry.npmjs.org/cli-progress/-/cli-progress-3.12.0.tgz", + "integrity": "sha512-tRkV3HJ1ASwm19THiiLIXLO7Im7wlTuKnvkYaTkyoAPefqjNg7W7DHKUlGRxy9vxDvbyCYQkQozvptuMkGCg8A==", + "dependencies": { + "string-width": "^4.2.3" + }, + "engines": { + "node": ">=4" + } + }, "node_modules/color": { "version": "4.2.3", "resolved": "https://registry.npmjs.org/color/-/color-4.2.3.tgz", @@ -748,6 +768,11 @@ "node": ">=8" } }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==" + }, "node_modules/end-of-stream": { "version": "1.4.4", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", @@ -828,6 +853,14 @@ "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==" }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "engines": { + "node": ">=8" + } + }, "node_modules/long": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/long/-/long-4.0.0.tgz", @@ -863,9 +896,9 @@ "integrity": "sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==" }, "node_modules/node-abi": { - "version": "3.63.0", - "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.63.0.tgz", - "integrity": "sha512-vAszCsOUrUxjGAmdnM/pq7gUgie0IRteCQMX6d4A534fQCR93EJU5qgzBvU6EkFfK27s0T3HEV3BOyJIr7OMYw==", + "version": "3.65.0", + "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.65.0.tgz", + "integrity": "sha512-ThjYBfoDNr08AWx6hGaRbfPwxKV9kVzAzOzlLKbk2CuqXE2xnCh+cbAGnwM3t8Lq4v9rUB7VfondlkBckcJrVA==", "dependencies": { "semver": "^7.3.5" }, @@ -1172,6 +1205,30 @@ "safe-buffer": "~5.2.0" } }, + "node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/strip-json-comments": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", @@ -1204,9 +1261,9 @@ } }, "node_modules/text-decoder": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.1.0.tgz", - "integrity": "sha512-TmLJNj6UgX8xcUZo4UDStGQtDiTzF7BzWlzn9g7UWrjkpHr5uJTK1ld16wZ3LXb2vb6jH8qU89dW5whuMdXYdw==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.1.1.tgz", + "integrity": "sha512-8zll7REEv4GDD3x4/0pW+ppIxSNs7H1J10IKFZsuOMscumCdM2a+toDGLPA3T+1+fLBql4zbt5z83GEQGGV5VA==", "dependencies": { "b4a": "^1.6.4" } diff --git a/package.json b/package.json index bdd9440..f49dd8c 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "semantic-chunking", - "version": "1.2.1", + "version": "1.3.0", "description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).", "repository": { "type": "git", @@ -23,11 +23,13 @@ "license": "ISC", "scripts": { "clean-models": "find ./models -type f ! -name '*.url' -delete", - "clean-models-win": "powershell -Command \"Get-ChildItem -Path ./models -Recurse | Where-Object { !$_.PSIsContainer -and $_.Extension -ne '.url' } | Remove-Item\"" + "clean-models-win": "powershell -Command \"Get-ChildItem -Path ./models -Recurse | Where-Object { !$_.PSIsContainer -and $_.Extension -ne '.url' } | Remove-Item\"", + "download-models": "node ./tools/download-models.js" }, "dependencies": { "@stdlib/nlp-sentencize": "^0.2.1", "@xenova/transformers": "^2.17.2", + "cli-progress": "^3.12.0", "fs": "^0.0.1-security" } } diff --git a/tools/download-models-list.json b/tools/download-models-list.json new file mode 100644 index 0000000..aab3172 --- /dev/null +++ b/tools/download-models-list.json @@ -0,0 +1,10 @@ +[ + { + "modelName": "Xenova/all-MiniLM-L6-v2", + "quantized": true + }, + { + "modelName": "BAAI/bge-small-en-v1.5", + "quantized": false + } +] \ No newline at end of file diff --git a/tools/download-models.js b/tools/download-models.js new file mode 100644 index 0000000..2c7d0eb --- /dev/null +++ b/tools/download-models.js @@ -0,0 +1,41 @@ +import { pipeline, env as transformerCache } from "@xenova/transformers" +import cliProgress from "cli-progress" +import fs from "fs/promises" + +const data = await fs.readFile('./tools/download-models-list.json', 'utf8'); +const downloadModelsList = JSON.parse(data); + + +transformerCache.cacheDir = "models" + +console.log(`Downloading embedding models…`) + +const bar = new cliProgress.SingleBar({ + clearOnComplete: false, + hideCursor: true, + format: "[{bar}] {value}% | {model}", +}) + + +let started = false + +for (const model of downloadModelsList) { + console.log(`model: ${model.modelName}, quantized: ${model.quantized}`) + bar.start(100, 0, { model: model.modelName}) + + await pipeline("feature-extraction", model.modelName, { + quantized: model.quantized, + progress_callback: (data) => { + started = started || data.status === "download" + if (!started) return + + if (data.progress) bar.update(Math.floor(data.progress)) + }, + }) + bar.update(100) + bar.stop() +} + + + +console.log("Success!") \ No newline at end of file