From 1bb5ccdbcebce2f2c33d457592a788ba9c6aec2f Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 26 Aug 2025 04:04:32 +0300 Subject: [PATCH 1/6] fix: shrink CUDA module sizes --- llama/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index 0298ce1f..f4f26fbe 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -86,8 +86,11 @@ endif() if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE) find_package(CUDAToolkit) + if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0") set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real") + elseif (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.2") + set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-real") endif() endif() From a947dde1cfd647e4c037ed37ed341bd273756e4f Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 26 Aug 2025 04:11:19 +0300 Subject: [PATCH 2/6] build: try a different approach --- llama/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index f4f26fbe..6b2cb353 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -88,9 +88,7 @@ if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE) find_package(CUDAToolkit) if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0") - set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real") - elseif (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.2") - set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-real") + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-virtual;89-real;90-real") endif() endif() From a304dc57885ad3efe0fb10fb2b18d57b393ce3ef Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 26 Aug 2025 18:20:53 +0300 Subject: [PATCH 3/6] build: build CUDA 13 as the main target and 12.4 as the fallback --- .github/workflows/build.yml | 34 +++++++++++++++++++--------------- llama/CMakeLists.txt | 4 +++- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9f2b856e..59ad77be 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -104,7 +104,11 @@ jobs: if: matrix.config.name == 'Ubuntu (1)' run: | sudo apt-get update - sudo apt-get install ninja-build cmake libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf + sudo apt-get install ninja-build libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf + + wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz + sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz + rm -f ./cmake-3.31.7-linux-x86_64.tar.gz which aarch64-linux-gnu-gcc which aarch64-linux-gnu-g++ @@ -126,17 +130,8 @@ jobs: cmake --version - - name: Install Cuda 12.4 on Windows (1) + - name: Install Cuda 13.0 on Windows (1) if: matrix.config.name == 'Windows (1)' - uses: Jimver/cuda-toolkit@v0.2.15 - with: - cuda: '12.4.0' - method: 'network' - sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' - use-local-cache: false - - - name: Install Cuda 13.0 on Windows (2) - if: matrix.config.name == 'Windows (2)' shell: bash timeout-minutes: 30 run: | @@ 
-155,20 +150,29 @@ jobs: echo "CUDA_PATH_VX_Y=CUDA_PATH_V13_0" >> $GITHUB_ENV echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\bin" >> $GITHUB_PATH - - name: Install Cuda 12.4 on Ubuntu - if: matrix.config.name == 'Ubuntu (1)' + - name: Install Cuda 12.4 on Windows (2) + if: matrix.config.name == 'Windows (2)' uses: Jimver/cuda-toolkit@v0.2.15 with: cuda: '12.4.0' method: 'network' + sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' + use-local-cache: false - - name: Install Cuda 13.0 on Ubuntu - if: matrix.config.name == 'Ubuntu (2)' + - name: Install Cuda 13.0 on Ubuntu (1) + if: matrix.config.name == 'Ubuntu (1)' uses: Jimver/cuda-toolkit@v0.2.27 with: cuda: '13.0.0' method: 'network' + - name: Install Cuda 12.4 on Ubuntu (2) + if: matrix.config.name == 'Ubuntu (2)' + uses: Jimver/cuda-toolkit@v0.2.15 + with: + cuda: '12.4.0' + method: 'network' + - name: Install Vulkan SDK on Windows (1) if: matrix.config.name == 'Windows (1)' shell: powershell diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index 6b2cb353..22fefe12 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -88,7 +88,9 @@ if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE) find_package(CUDAToolkit) if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0") - set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-virtual;89-real;90-real") + set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real") + elseif (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.2") + set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual") endif() endif() From 51fcce5d9d8cc1b9bd607954d2b052a8f30a9783 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Tue, 26 Aug 2025 19:10:52 +0300 Subject: [PATCH 4/6] test: fix tests --- .../llama3.2/sequenceState.test.ts | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index e6267045..3b248114 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -75,8 +75,8 @@ describe("llama 3.2", () => { const chatSession1_1 = new LlamaChatSession({ contextSequence: contextSequence1 }); - const res1_1 = await chatSession1_1.prompt("What did I tell you to remember?", {maxTokens: 12}); - expect(res1_1).to.toMatchInlineSnapshot("\"You didn't tell me to remember anything. This is the\""); + const res1_1 = await chatSession1_1.prompt("What did I tell you to remember?", {maxTokens: 10}); + expect(res1_1).to.toMatchInlineSnapshot("\"You didn't tell me to remember anything. 
This\""); await contextSequence1.clearHistory(); const contextSequence1TokensState2 = contextSequence1.tokenMeter.getState(); @@ -85,7 +85,7 @@ describe("llama 3.2", () => { expect(TokenMeter.diff(contextSequence1TokensState2, contextSequence1TokensState1)).toMatchInlineSnapshot(` { "usedInputTokens": 101, - "usedOutputTokens": 12, + "usedOutputTokens": 10, } `); @@ -139,8 +139,8 @@ describe("llama 3.2", () => { contextSequence: contextSequence1 }); - const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 6}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever phrase.\""); + const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); + expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); const stateFile1Path = await getTempTestFilePath("state1"); @@ -150,12 +150,12 @@ describe("llama 3.2", () => { const contextSequence1TokensState = contextSequence1.tokenMeter.getState(); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); - expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.49MB"'); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); + expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.27MB"'); expect(contextSequence1TokensState).to.toMatchInlineSnapshot(` { "usedInputTokens": 99, - "usedOutputTokens": 6, + "usedOutputTokens": 4, } `); @@ -200,8 +200,8 @@ describe("llama 3.2", () => { contextSequence: contextSequence1 }); - const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 6}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever phrase.\""); + const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); + expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); const stateFile1Path = await getTempTestFilePath("state1"); @@ -211,12 +211,12 @@ describe("llama 3.2", () => { const contextSequence1TokensState = contextSequence1.tokenMeter.getState(); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); - expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.49MB"'); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); + expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.27MB"'); expect(contextSequence1TokensState).to.toMatchInlineSnapshot(` { "usedInputTokens": 99, - "usedOutputTokens": 6, + "usedOutputTokens": 4, } `); @@ -258,8 +258,8 @@ describe("llama 3.2", () => { contextSequence: contextSequence1 }); - const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 6}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever phrase.\""); + const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); + expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); const stateFile1Path = await getTempTestFilePath("state1"); @@ -269,12 +269,12 @@ describe("llama 3.2", () => { const contextSequence1TokensState = contextSequence1.tokenMeter.getState(); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); - expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.49MB"'); + 
expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); + expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.27MB"'); expect(contextSequence1TokensState).to.toMatchInlineSnapshot(` { "usedInputTokens": 99, - "usedOutputTokens": 6, + "usedOutputTokens": 4, } `); From b6acbcf331222ef3a6de626352febd7b37d312e3 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 27 Aug 2025 00:30:16 +0300 Subject: [PATCH 5/6] feat: move prebuilt fallback CUDA backend to an additional npm package --- llama/CMakeLists.txt | 3 - package.json | 2 + .../linux-x64-cuda-ext/.gitignore | 1 + .../linux-x64-cuda-ext/LICENSE | 21 +++++++ .../linux-x64-cuda-ext/README.md | 5 ++ .../linux-x64-cuda-ext/package-lock.json | 38 ++++++++++++ .../linux-x64-cuda-ext/package.json | 48 +++++++++++++++ .../linux-x64-cuda-ext/src/index.ts | 14 +++++ .../linux-x64-cuda-ext/tsconfig.json | 34 +++++++++++ .../win-x64-cuda-ext/.gitignore | 1 + .../@node-llama-cpp/win-x64-cuda-ext/LICENSE | 21 +++++++ .../win-x64-cuda-ext/README.md | 5 ++ .../win-x64-cuda-ext/package-lock.json | 38 ++++++++++++ .../win-x64-cuda-ext/package.json | 47 +++++++++++++++ .../win-x64-cuda-ext/src/index.ts | 14 +++++ .../win-x64-cuda-ext/tsconfig.json | 34 +++++++++++ ...movePrebuiltBinariesToStandaloneModules.ts | 23 ++++++++ .../publishStandalonePrebuiltBinaryModules.ts | 12 +++- src/bindings/Llama.ts | 13 +++-- src/bindings/getLlama.ts | 10 +++- src/bindings/utils/compileLLamaCpp.ts | 58 ++++++++++++++++--- src/bindings/utils/testBindingBinary.ts | 7 ++- 22 files changed, 430 insertions(+), 19 deletions(-) create mode 100644 packages/@node-llama-cpp/linux-x64-cuda-ext/.gitignore create mode 100644 packages/@node-llama-cpp/linux-x64-cuda-ext/LICENSE create mode 100644 packages/@node-llama-cpp/linux-x64-cuda-ext/README.md create mode 100644 packages/@node-llama-cpp/linux-x64-cuda-ext/package-lock.json create mode 100644 packages/@node-llama-cpp/linux-x64-cuda-ext/package.json create mode 100644 packages/@node-llama-cpp/linux-x64-cuda-ext/src/index.ts create mode 100644 packages/@node-llama-cpp/linux-x64-cuda-ext/tsconfig.json create mode 100644 packages/@node-llama-cpp/win-x64-cuda-ext/.gitignore create mode 100644 packages/@node-llama-cpp/win-x64-cuda-ext/LICENSE create mode 100644 packages/@node-llama-cpp/win-x64-cuda-ext/README.md create mode 100644 packages/@node-llama-cpp/win-x64-cuda-ext/package-lock.json create mode 100644 packages/@node-llama-cpp/win-x64-cuda-ext/package.json create mode 100644 packages/@node-llama-cpp/win-x64-cuda-ext/src/index.ts create mode 100644 packages/@node-llama-cpp/win-x64-cuda-ext/tsconfig.json diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt index 22fefe12..0298ce1f 100644 --- a/llama/CMakeLists.txt +++ b/llama/CMakeLists.txt @@ -86,11 +86,8 @@ endif() if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE) find_package(CUDAToolkit) - if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0") set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real") - elseif (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.2") - set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;89-virtual") endif() endif() diff --git a/package.json b/package.json index 3f0a1271..ac229831 100644 --- a/package.json +++ b/package.json @@ -228,12 +228,14 @@ "@node-llama-cpp/linux-armv7l": "0.1.0", "@node-llama-cpp/linux-x64": "0.1.0", "@node-llama-cpp/linux-x64-cuda": "0.1.0", + 
"@node-llama-cpp/linux-x64-cuda-ext": "0.1.0", "@node-llama-cpp/linux-x64-vulkan": "0.1.0", "@node-llama-cpp/mac-arm64-metal": "0.1.0", "@node-llama-cpp/mac-x64": "0.1.0", "@node-llama-cpp/win-arm64": "0.1.0", "@node-llama-cpp/win-x64": "0.1.0", "@node-llama-cpp/win-x64-cuda": "0.1.0", + "@node-llama-cpp/win-x64-cuda-ext": "0.1.0", "@node-llama-cpp/win-x64-vulkan": "0.1.0" } } diff --git a/packages/@node-llama-cpp/linux-x64-cuda-ext/.gitignore b/packages/@node-llama-cpp/linux-x64-cuda-ext/.gitignore new file mode 100644 index 00000000..9b1c8b13 --- /dev/null +++ b/packages/@node-llama-cpp/linux-x64-cuda-ext/.gitignore @@ -0,0 +1 @@ +/dist diff --git a/packages/@node-llama-cpp/linux-x64-cuda-ext/LICENSE b/packages/@node-llama-cpp/linux-x64-cuda-ext/LICENSE new file mode 100644 index 00000000..22789ae3 --- /dev/null +++ b/packages/@node-llama-cpp/linux-x64-cuda-ext/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Gilad S. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/@node-llama-cpp/linux-x64-cuda-ext/README.md b/packages/@node-llama-cpp/linux-x64-cuda-ext/README.md new file mode 100644 index 00000000..25de8451 --- /dev/null +++ b/packages/@node-llama-cpp/linux-x64-cuda-ext/README.md @@ -0,0 +1,5 @@ +# [`node-llama-cpp`](https://github.com/withcatai/node-llama-cpp) +This is an extension of the [`@node-llama-cpp/linux-x64-cuda`](https://www.npmjs.com/package/@node-llama-cpp/linux-x64-cuda) package +that provides a prebuilt binary for [`node-llama-cpp`](https://github.com/withcatai/node-llama-cpp) for Linux x64 with CUDA support. + +Do not install this package directly. 
diff --git a/packages/@node-llama-cpp/linux-x64-cuda-ext/package-lock.json b/packages/@node-llama-cpp/linux-x64-cuda-ext/package-lock.json new file mode 100644 index 00000000..6d1e124e --- /dev/null +++ b/packages/@node-llama-cpp/linux-x64-cuda-ext/package-lock.json @@ -0,0 +1,38 @@ +{ + "name": "@node-llama-cpp/linux-x64-cuda-ext", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "@node-llama-cpp/linux-x64-cuda-ext", + "version": "0.1.0", + "cpu": [ + "x64" + ], + "license": "MIT", + "os": [ + "linux" + ], + "devDependencies": { + "typescript": "^5.2.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/typescript": { + "version": "5.5.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.2.tgz", + "integrity": "sha512-NcRtPEOsPFFWjobJEtfihkLCZCXZt/os3zf8nTxjVH3RvTSxjrCamJpbExGvYOF+tFHc3pA65qpdwPbzjohhew==", + "dev": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + } + } +} diff --git a/packages/@node-llama-cpp/linux-x64-cuda-ext/package.json b/packages/@node-llama-cpp/linux-x64-cuda-ext/package.json new file mode 100644 index 00000000..d94a632d --- /dev/null +++ b/packages/@node-llama-cpp/linux-x64-cuda-ext/package.json @@ -0,0 +1,48 @@ +{ + "name": "@node-llama-cpp/linux-x64-cuda-ext", + "version": "0.1.0", + "description": "Extension of @node-llama-cpp/linux-x64-cuda - prebuilt binary for node-llama-cpp for Linux x64 with CUDA support", + "main": "dist/index.js", + "type": "module", + "files": [ + "dist/", + "bins/", + "package.json", + "README.md", + "LICENSE" + ], + "exports": { + ".": { + "import": "./dist/index.js", + "node": "./dist/index.js", + "default": "./dist/index.js" + } + }, + "engines": { + "node": ">=20.0.0" + }, + "os": ["linux"], + "cpu": ["x64"], + "libc": ["glibc"], + "scripts": { + "prebuild": "rimraf ./dist ./tsconfig.tsbuildinfo", + "build": "tsc --build tsconfig.json --force", + "prewatch": "rimraf ./dist ./tsconfig.tsbuildinfo", + "watch": "tsc --build tsconfig.json --watch --force", + "clean": "rm -rf ./node_modules ./dist ./tsconfig.tsbuildinfo" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/withcatai/node-llama-cpp.git" + }, + "author": "Gilad S.", + "license": "MIT", + "preferUnplugged": true, + "bugs": { + "url": "https://github.com/withcatai/node-llama-cpp/issues" + }, + "homepage": "https://node-llama-cpp.withcat.ai", + "devDependencies": { + "typescript": "^5.2.2" + } +} diff --git a/packages/@node-llama-cpp/linux-x64-cuda-ext/src/index.ts b/packages/@node-llama-cpp/linux-x64-cuda-ext/src/index.ts new file mode 100644 index 00000000..a4cb56d5 --- /dev/null +++ b/packages/@node-llama-cpp/linux-x64-cuda-ext/src/index.ts @@ -0,0 +1,14 @@ +import path from "path"; +import {fileURLToPath} from "url"; +import fs from "node:fs/promises"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const binsDir = path.join(__dirname, "..", "bins"); +const packageVersion: string = (JSON.parse(await fs.readFile(path.join(__dirname, "..", "package.json"), "utf8"))).version; + +export function getBinsDir() { + return { + binsDir, + packageVersion + }; +} diff --git a/packages/@node-llama-cpp/linux-x64-cuda-ext/tsconfig.json b/packages/@node-llama-cpp/linux-x64-cuda-ext/tsconfig.json new file mode 100644 index 00000000..f6f82db3 --- /dev/null +++ b/packages/@node-llama-cpp/linux-x64-cuda-ext/tsconfig.json @@ -0,0 +1,34 @@ +{ + "compilerOptions": { + "lib": ["es2022"], + 
"module": "es2022", + "target": "es2022", + "esModuleInterop": true, + "noImplicitAny": true, + "noImplicitReturns": true, + "noImplicitThis": true, + "noImplicitOverride": true, + "removeComments": false, + "allowSyntheticDefaultImports": true, + "forceConsistentCasingInFileNames": true, + "noFallthroughCasesInSwitch": true, + "skipLibCheck": true, + "moduleResolution": "node", + "resolveJsonModule": false, + "strictNullChecks": true, + "isolatedModules": true, + "noEmit": false, + "outDir": "./dist", + "strict": true, + "sourceMap": false, + "composite": false, + "declaration": false, + "stripInternal": true + }, + "files": [ + "./src/index.ts" + ], + "include": [ + "./src" + ] +} diff --git a/packages/@node-llama-cpp/win-x64-cuda-ext/.gitignore b/packages/@node-llama-cpp/win-x64-cuda-ext/.gitignore new file mode 100644 index 00000000..9b1c8b13 --- /dev/null +++ b/packages/@node-llama-cpp/win-x64-cuda-ext/.gitignore @@ -0,0 +1 @@ +/dist diff --git a/packages/@node-llama-cpp/win-x64-cuda-ext/LICENSE b/packages/@node-llama-cpp/win-x64-cuda-ext/LICENSE new file mode 100644 index 00000000..22789ae3 --- /dev/null +++ b/packages/@node-llama-cpp/win-x64-cuda-ext/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Gilad S. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/@node-llama-cpp/win-x64-cuda-ext/README.md b/packages/@node-llama-cpp/win-x64-cuda-ext/README.md new file mode 100644 index 00000000..ad2b83cc --- /dev/null +++ b/packages/@node-llama-cpp/win-x64-cuda-ext/README.md @@ -0,0 +1,5 @@ +# [`node-llama-cpp`](https://github.com/withcatai/node-llama-cpp) +This is an extension of the [`@node-llama-cpp/win-x64-cuda`](https://www.npmjs.com/package/@node-llama-cpp/win-x64-cuda) package +that provides a prebuilt binary for [`node-llama-cpp`](https://github.com/withcatai/node-llama-cpp) for Windows x64 with CUDA support. + +Do not install this package directly. 
diff --git a/packages/@node-llama-cpp/win-x64-cuda-ext/package-lock.json b/packages/@node-llama-cpp/win-x64-cuda-ext/package-lock.json new file mode 100644 index 00000000..986136c2 --- /dev/null +++ b/packages/@node-llama-cpp/win-x64-cuda-ext/package-lock.json @@ -0,0 +1,38 @@ +{ + "name": "@node-llama-cpp/win-x64-cuda-ext", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "@node-llama-cpp/win-x64-cuda-ext", + "version": "0.1.0", + "cpu": [ + "x64" + ], + "license": "MIT", + "os": [ + "win32" + ], + "devDependencies": { + "typescript": "^5.2.2" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/typescript": { + "version": "5.5.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.2.tgz", + "integrity": "sha512-NcRtPEOsPFFWjobJEtfihkLCZCXZt/os3zf8nTxjVH3RvTSxjrCamJpbExGvYOF+tFHc3pA65qpdwPbzjohhew==", + "dev": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + } + } +} diff --git a/packages/@node-llama-cpp/win-x64-cuda-ext/package.json b/packages/@node-llama-cpp/win-x64-cuda-ext/package.json new file mode 100644 index 00000000..4f910221 --- /dev/null +++ b/packages/@node-llama-cpp/win-x64-cuda-ext/package.json @@ -0,0 +1,47 @@ +{ + "name": "@node-llama-cpp/win-x64-cuda-ext", + "version": "0.1.0", + "description": "Extension of @node-llama-cpp/win-x64-cuda - prebuilt binary for node-llama-cpp for Windows x64 with CUDA support", + "main": "dist/index.js", + "type": "module", + "files": [ + "dist/", + "bins/", + "package.json", + "README.md", + "LICENSE" + ], + "exports": { + ".": { + "import": "./dist/index.js", + "node": "./dist/index.js", + "default": "./dist/index.js" + } + }, + "engines": { + "node": ">=20.0.0" + }, + "os": ["win32"], + "cpu": ["x64"], + "scripts": { + "prebuild": "rimraf ./dist ./tsconfig.tsbuildinfo", + "build": "tsc --build tsconfig.json --force", + "prewatch": "rimraf ./dist ./tsconfig.tsbuildinfo", + "watch": "tsc --build tsconfig.json --watch --force", + "clean": "rm -rf ./node_modules ./dist ./tsconfig.tsbuildinfo" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/withcatai/node-llama-cpp.git" + }, + "author": "Gilad S.", + "license": "MIT", + "preferUnplugged": true, + "bugs": { + "url": "https://github.com/withcatai/node-llama-cpp/issues" + }, + "homepage": "https://node-llama-cpp.withcat.ai", + "devDependencies": { + "typescript": "^5.2.2" + } +} diff --git a/packages/@node-llama-cpp/win-x64-cuda-ext/src/index.ts b/packages/@node-llama-cpp/win-x64-cuda-ext/src/index.ts new file mode 100644 index 00000000..a4cb56d5 --- /dev/null +++ b/packages/@node-llama-cpp/win-x64-cuda-ext/src/index.ts @@ -0,0 +1,14 @@ +import path from "path"; +import {fileURLToPath} from "url"; +import fs from "node:fs/promises"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const binsDir = path.join(__dirname, "..", "bins"); +const packageVersion: string = (JSON.parse(await fs.readFile(path.join(__dirname, "..", "package.json"), "utf8"))).version; + +export function getBinsDir() { + return { + binsDir, + packageVersion + }; +} diff --git a/packages/@node-llama-cpp/win-x64-cuda-ext/tsconfig.json b/packages/@node-llama-cpp/win-x64-cuda-ext/tsconfig.json new file mode 100644 index 00000000..f6f82db3 --- /dev/null +++ b/packages/@node-llama-cpp/win-x64-cuda-ext/tsconfig.json @@ -0,0 +1,34 @@ +{ + "compilerOptions": { + "lib": ["es2022"], + "module": "es2022", + "target": "es2022", + 
"esModuleInterop": true, + "noImplicitAny": true, + "noImplicitReturns": true, + "noImplicitThis": true, + "noImplicitOverride": true, + "removeComments": false, + "allowSyntheticDefaultImports": true, + "forceConsistentCasingInFileNames": true, + "noFallthroughCasesInSwitch": true, + "skipLibCheck": true, + "moduleResolution": "node", + "resolveJsonModule": false, + "strictNullChecks": true, + "isolatedModules": true, + "noEmit": false, + "outDir": "./dist", + "strict": true, + "sourceMap": false, + "composite": false, + "declaration": false, + "stripInternal": true + }, + "files": [ + "./src/index.ts" + ], + "include": [ + "./src" + ] +} diff --git a/scripts/movePrebuiltBinariesToStandaloneModules.ts b/scripts/movePrebuiltBinariesToStandaloneModules.ts index aa094dfa..792b8fb9 100644 --- a/scripts/movePrebuiltBinariesToStandaloneModules.ts +++ b/scripts/movePrebuiltBinariesToStandaloneModules.ts @@ -27,9 +27,31 @@ async function moveBinariesFolderToStandaloneModule(folderNameFilter: (folderNam } } +async function moveBinariesFallbackDirToStandaloneExtModule(folderNameFilter: (folderName: string) => boolean, packageName: string) { + for (const folderName of await fs.readdir(binsDirectory)) { + if (!folderNameFilter(folderName)) + continue; + + const packagePath = path.join(packageDirectory, packageName); + const packageBinsPath = path.join(packagePath, "bins"); + const fallbackDir = path.join(binsDirectory, folderName, "fallback"); + + if (!(await fs.pathExists(fallbackDir))) { + console.warn(`No fallback directory in "${folderName}"`); + continue; + } + + console.info(`Moving "${folderName}/fallback" to "${packageName}"`); + + await fs.ensureDir(path.join(packageBinsPath, folderName)); + await fs.move(fallbackDir, path.join(packageBinsPath, folderName, "fallback")); + } +} + await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("mac-arm64-metal"), "@node-llama-cpp/mac-arm64-metal"); await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("mac-x64"), "@node-llama-cpp/mac-x64"); +await moveBinariesFallbackDirToStandaloneExtModule((folderName) => folderName.startsWith("linux-x64-cuda"), "@node-llama-cpp/linux-x64-cuda-ext"); await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("linux-x64-cuda"), "@node-llama-cpp/linux-x64-cuda"); await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("linux-x64-vulkan"), "@node-llama-cpp/linux-x64-vulkan"); await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("linux-x64"), "@node-llama-cpp/linux-x64"); @@ -37,6 +59,7 @@ await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("linux-arm64"), "@node-llama-cpp/linux-arm64"); await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("linux-armv7l"), "@node-llama-cpp/linux-armv7l"); +await moveBinariesFallbackDirToStandaloneExtModule((folderName) => folderName.startsWith("win-x64-cuda"), "@node-llama-cpp/win-x64-cuda-ext"); await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("win-x64-cuda"), "@node-llama-cpp/win-x64-cuda"); await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("win-x64-vulkan"), "@node-llama-cpp/win-x64-vulkan"); await moveBinariesFolderToStandaloneModule((folderName) => folderName.startsWith("win-x64"), "@node-llama-cpp/win-x64"); diff --git 
a/scripts/publishStandalonePrebuiltBinaryModules.ts b/scripts/publishStandalonePrebuiltBinaryModules.ts index 63933a47..fad7423a 100644 --- a/scripts/publishStandalonePrebuiltBinaryModules.ts +++ b/scripts/publishStandalonePrebuiltBinaryModules.ts @@ -27,7 +27,17 @@ const {packageVersion} = argv; if (packageVersion === "") throw new Error("packageVersion is empty"); -for (const packageName of await fs.readdir(subPackagesDirectory)) { +const packageNames = (await fs.readdir(subPackagesDirectory)) + .sort((a, b) => { + if (a.endsWith("-ext")) + return -1; + else if (b.endsWith("-ext")) + return 1; + + return a.localeCompare(b); + }); + +for (const packageName of packageNames) { const packagePath = path.join(subPackagesDirectory, packageName); const packagePackageJsonPath = path.join(packagePath, "package.json"); diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index c3d4230e..cb0e4574 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -72,11 +72,12 @@ export class Llama { public readonly onDispose = new EventRelay(); private constructor({ - bindings, bindingPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, numa, buildGpu, maxThreads, - vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator + bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, numa, buildGpu, + maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator }: { bindings: BindingModule, bindingPath: string, + extBackendsPath?: string, logLevel: LlamaLogLevel, logger: (level: LlamaLogLevel, message: string) => void, buildType: "localBuild" | "prebuilt", @@ -114,7 +115,7 @@ export class Llama { let loadedGpu = bindings.getGpuType(); if (loadedGpu == null || (loadedGpu === false && buildGpu !== false)) { const backendsPath = path.dirname(bindingPath); - const fallbackBackendsDir = path.join(backendsPath, "fallback"); + const fallbackBackendsDir = path.join(extBackendsPath ?? backendsPath, "fallback"); bindings.loadBackends(backendsPath); @@ -495,11 +496,12 @@ export class Llama { /** @internal */ public static async _create({ - bindings, bindingPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, skipLlamaInit = false, - debug, numa + bindings, bindingPath, extBackendsPath, buildType, buildMetadata, logLevel, logger, vramPadding, ramPadding, maxThreads, + skipLlamaInit = false, debug, numa }: { bindings: BindingModule, bindingPath: string, + extBackendsPath?: string, buildType: "localBuild" | "prebuilt", buildMetadata: BuildMetadataFile, logLevel: LlamaLogLevel, @@ -557,6 +559,7 @@ export class Llama { const llama = new Llama({ bindings, bindingPath, + extBackendsPath, buildType, cmakeOptions: buildMetadata.buildOptions.customCmakeOptions, llamaCppRelease: { diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index 3d6b85a9..8f8c6a70 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -706,7 +706,7 @@ async function loadExistingLlamaBinary({ }); const resolvedBindingPath = await resolveActualBindingBinaryPath(localBuildBinPath); const binaryCompatible = shouldTestBinaryBeforeLoading - ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu, undefined, pipeBinaryTestErrorLogs) + ? 
await testBindingBinary(resolvedBindingPath, undefined, buildOptions.gpu, undefined, pipeBinaryTestErrorLogs) : true; if (binaryCompatible) { @@ -765,8 +765,13 @@ async function loadExistingLlamaBinary({ buildMetadata }); const resolvedBindingPath = await resolveActualBindingBinaryPath(prebuiltBinDetails.binaryPath); + const resolvedExtBackendsPath = prebuiltBinDetails.extBackendsPath == null + ? undefined + : await resolveActualBindingBinaryPath(prebuiltBinDetails.extBackendsPath); const binaryCompatible = shouldTestBinaryBeforeLoading - ? await testBindingBinary(resolvedBindingPath, buildOptions.gpu, undefined, pipeBinaryTestErrorLogs) + ? await testBindingBinary( + resolvedBindingPath, resolvedExtBackendsPath, buildOptions.gpu, undefined, pipeBinaryTestErrorLogs + ) : true; if (binaryCompatible) { @@ -775,6 +780,7 @@ async function loadExistingLlamaBinary({ return await Llama._create({ bindings: binding, bindingPath: resolvedBindingPath, + extBackendsPath: resolvedExtBackendsPath, buildType: "prebuilt", buildMetadata, logLevel, diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts index 31bdf437..b88c9b5f 100644 --- a/src/bindings/utils/compileLLamaCpp.ts +++ b/src/bindings/utils/compileLLamaCpp.ts @@ -388,21 +388,34 @@ export async function getPrebuiltBinaryPath(buildOptions: BuildOptions, folderNa return { binaryPath, folderName, - folderPath: localPrebuiltBinaryDirectoryPath + folderPath: localPrebuiltBinaryDirectoryPath, + extBackendsPath: undefined }; const packagePrebuiltBinariesDirectoryPath = await getPrebuiltBinariesPackageDirectoryForBuildOptions(buildOptions); if (packagePrebuiltBinariesDirectoryPath == null) return null; - const packagePrebuiltBinaryDirectoryPath = path.join(packagePrebuiltBinariesDirectoryPath, folderName); + const prebuiltBinariesDirPath = typeof packagePrebuiltBinariesDirectoryPath === "string" + ? packagePrebuiltBinariesDirectoryPath + : packagePrebuiltBinariesDirectoryPath.binsDir; + const prebuiltBinariesExtDirPath = typeof packagePrebuiltBinariesDirectoryPath === "string" + ? undefined + : packagePrebuiltBinariesDirectoryPath.extBinsDir; + + const packagePrebuiltBinaryDirectoryPath = path.join(prebuiltBinariesDirPath, folderName); + const extPackagePrebuiltBinaryDirectoryPath = prebuiltBinariesExtDirPath == null + ? 
undefined + : path.join(prebuiltBinariesExtDirPath, folderName); + const binaryPathFromPackage = await resolvePrebuiltBinaryPath(packagePrebuiltBinaryDirectoryPath); if (binaryPathFromPackage != null) return { binaryPath: binaryPathFromPackage, folderName, - folderPath: packagePrebuiltBinaryDirectoryPath + folderPath: packagePrebuiltBinaryDirectoryPath, + extBackendsPath: extPackagePrebuiltBinaryDirectoryPath }; return null; @@ -523,6 +536,29 @@ function getPrebuiltBinariesPackageDirectoryForBuildOptions(buildOptions: BuildO } } + async function getBinariesPathFromModulesWithExtModule( + moduleImport: () => Promise<{getBinsDir(): {binsDir: string, packageVersion: string}}>, + extModuleImport: () => Promise<{getBinsDir(): {binsDir: string, packageVersion: string}}> + ) { + const [ + moduleBinsDir, + extModuleBinsDir + ] = await Promise.all([ + getBinariesPathFromModules(moduleImport), + getBinariesPathFromModules(extModuleImport) + ]); + + if (moduleBinsDir == null) + return null; + else if (extModuleBinsDir == null) + return moduleBinsDir; + + return { + binsDir: moduleBinsDir, + extBinsDir: extModuleBinsDir + }; + } + /* eslint-disable import/no-unresolved */ if (buildOptions.platform === "mac") { if (buildOptions.arch === "arm64" && buildOptions.gpu === "metal") @@ -534,8 +570,12 @@ function getPrebuiltBinariesPackageDirectoryForBuildOptions(buildOptions: BuildO } else if (buildOptions.platform === "linux") { if (buildOptions.arch === "x64") { if (buildOptions.gpu === "cuda") - // @ts-ignore - return getBinariesPathFromModules(() => import("@node-llama-cpp/linux-x64-cuda")); + return getBinariesPathFromModulesWithExtModule( + // @ts-ignore + () => import("@node-llama-cpp/linux-x64-cuda"), + // @ts-ignore + () => import("@node-llama-cpp/linux-x64-cuda-ext") + ); else if (buildOptions.gpu === "vulkan") // @ts-ignore return getBinariesPathFromModules(() => import("@node-llama-cpp/linux-x64-vulkan")); @@ -551,8 +591,12 @@ function getPrebuiltBinariesPackageDirectoryForBuildOptions(buildOptions: BuildO } else if (buildOptions.platform === "win") { if (buildOptions.arch === "x64") { if (buildOptions.gpu === "cuda") - // @ts-ignore - return getBinariesPathFromModules(() => import("@node-llama-cpp/win-x64-cuda")); + return getBinariesPathFromModulesWithExtModule( + // @ts-ignore + () => import("@node-llama-cpp/win-x64-cuda"), + // @ts-ignore + () => import("@node-llama-cpp/win-x64-cuda-ext") + ); else if (buildOptions.gpu === "vulkan") // @ts-ignore return getBinariesPathFromModules(() => import("@node-llama-cpp/win-x64-vulkan")); diff --git a/src/bindings/utils/testBindingBinary.ts b/src/bindings/utils/testBindingBinary.ts index 0e4dac67..41e7c4f3 100644 --- a/src/bindings/utils/testBindingBinary.ts +++ b/src/bindings/utils/testBindingBinary.ts @@ -17,6 +17,7 @@ const expectedFileName = "testBindingBinary"; export async function testBindingBinary( bindingBinaryPath: string, + extBackendsPath: string | undefined = path.dirname(bindingBinaryPath), gpu: BuildGpu, testTimeout: number = 1000 * 60 * 5, pipeOutputOnNode: boolean = false @@ -233,6 +234,7 @@ export async function testBindingBinary( subProcess!.sendMessage({ type: "start", bindingBinaryPath, + extBackendsPath, gpu }); } else if (message.type === "loaded") { @@ -240,6 +242,7 @@ export async function testBindingBinary( subProcess!.sendMessage({ type: "test", bindingBinaryPath, + extBackendsPath, gpu }); } else if (message.type === "done") { @@ -289,7 +292,7 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro 
let loadedGpu = binding.getGpuType(); if (loadedGpu == null || (loadedGpu === false && message.gpu !== false)) { const backendsPath = path.dirname(path.resolve(message.bindingBinaryPath)); - const fallbackBackendsDir = path.join(backendsPath, "fallback"); + const fallbackBackendsDir = path.join(message.extBackendsPath ?? backendsPath, "fallback"); binding.loadBackends(backendsPath); @@ -337,10 +340,12 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro type ParentToChildMessage = { type: "start", bindingBinaryPath: string, + extBackendsPath: string, gpu: BuildGpu } | { type: "test", bindingBinaryPath: string, + extBackendsPath: string, gpu: BuildGpu } | { type: "exit" From cdad1c373060e81fb9f00b87653f72b68698b893 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 27 Aug 2025 00:36:40 +0300 Subject: [PATCH 6/6] fix: bug --- src/bindings/utils/testBindingBinary.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bindings/utils/testBindingBinary.ts b/src/bindings/utils/testBindingBinary.ts index 41e7c4f3..a3cd6326 100644 --- a/src/bindings/utils/testBindingBinary.ts +++ b/src/bindings/utils/testBindingBinary.ts @@ -17,7 +17,7 @@ const expectedFileName = "testBindingBinary"; export async function testBindingBinary( bindingBinaryPath: string, - extBackendsPath: string | undefined = path.dirname(bindingBinaryPath), + extBackendsPath: string | undefined, gpu: BuildGpu, testTimeout: number = 1000 * 60 * 5, pipeOutputOnNode: boolean = false @@ -292,7 +292,7 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro let loadedGpu = binding.getGpuType(); if (loadedGpu == null || (loadedGpu === false && message.gpu !== false)) { const backendsPath = path.dirname(path.resolve(message.bindingBinaryPath)); - const fallbackBackendsDir = path.join(message.extBackendsPath ?? backendsPath, "fallback"); + const fallbackBackendsDir = path.join(path.resolve(message.extBackendsPath ?? backendsPath), "fallback"); binding.loadBackends(backendsPath); @@ -340,12 +340,12 @@ if (process.env.TEST_BINDING_CP === "true" && (process.parentPort != null || pro type ParentToChildMessage = { type: "start", bindingBinaryPath: string, - extBackendsPath: string, + extBackendsPath?: string, gpu: BuildGpu } | { type: "test", bindingBinaryPath: string, - extBackendsPath: string, + extBackendsPath?: string, gpu: BuildGpu } | { type: "exit"
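
A note on the combined effect of patches 5 and 6: the sketch below is a minimal TypeScript reconstruction of the backend-resolution order the series converges on, distilled from the hunks in src/bindings/Llama.ts and src/bindings/utils/testBindingBinary.ts above. The standalone loadBackendsWithFallback() helper and the two-method BindingModule shape are illustrative assumptions for readability, not code that this series adds.

    import path from "path";

    // Illustrative subset: the real BindingModule in src/bindings/Llama.ts
    // exposes far more than these two methods.
    interface BindingModule {
        getGpuType(): string | false | null;
        loadBackends(backendsDir: string): void;
    }

    // Hypothetical helper mirroring the logic the series applies in both
    // Llama.ts and testBindingBinary.ts: always load the backends that sit
    // next to the binding binary first, and only reach for the "fallback"
    // backends (now shipped by the optional @node-llama-cpp/*-cuda-ext
    // packages) when no GPU backend could be loaded.
    function loadBackendsWithFallback(
        bindings: BindingModule,
        bindingBinaryPath: string,
        buildGpu: string | false,
        extBackendsPath?: string
    ) {
        const backendsPath = path.dirname(path.resolve(bindingBinaryPath));
        bindings.loadBackends(backendsPath);

        const loadedGpu = bindings.getGpuType();
        if (loadedGpu == null || (loadedGpu === false && buildGpu !== false)) {
            // Patch 6's fix: resolve the ext path before joining, so a
            // relative extBackendsPath cannot yield a wrong fallback dir.
            const fallbackBackendsDir = path.join(
                path.resolve(extBackendsPath ?? backendsPath), "fallback"
            );
            bindings.loadBackends(fallbackBackendsDir);
        }
    }

This split is also what lets patch 5 drop the CUDA 12.2 architecture branch from llama/CMakeLists.txt again: the primary packages carry only the CUDA 13 build ("75-virtual;80-virtual;86-real;89-real;90-real"), while the CUDA 12.4 fallback binaries move into the new *-cuda-ext packages and are only loaded when the primary backend fails.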