From cabafeab69900f035a5730e6c12a574ca8774a08 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 02:05:39 +0300 Subject: [PATCH 01/14] feat: SWA support --- llama/addon/AddonContext.cpp | 19 ++ llama/addon/AddonContext.h | 1 + llama/addon/addon.cpp | 14 ++ src/bindings/AddonTypes.ts | 5 +- src/cli/commands/ChatCommand.ts | 19 +- src/cli/commands/CompleteCommand.ts | 17 +- src/cli/commands/InfillCommand.ts | 17 +- .../commands/InspectEstimateCommand.ts | 14 +- .../inspect/commands/InspectMeasureCommand.ts | 37 ++- src/cli/utils/interactivelyAskForModel.ts | 14 +- src/cli/utils/resolveCommandGgufPath.ts | 5 +- src/evaluator/LlamaContext/LlamaContext.ts | 53 +++- src/evaluator/LlamaContext/types.ts | 16 ++ src/evaluator/LlamaModel/LlamaModel.ts | 25 +- src/gguf/insights/GgufInsights.ts | 234 +++++++++++------- .../GgufInsightsConfigurationResolver.ts | 19 +- .../utils/resolveContextContextSizeOption.ts | 6 +- .../utils/resolveModelGpuLayersOption.ts | 26 +- src/gguf/types/GgufMetadataTypes.ts | 1 + 19 files changed, 404 insertions(+), 138 deletions(-) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 1f8a8726..a64e3ada 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -393,6 +393,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap 1 && info[1].IsObject()) { Napi::Object options = info[1].As(); @@ -433,6 +434,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value()); } + + if (options.Has("swaFullCache")) { + context_params.swa_full = options.Get("swaFullCache").As().Value(); + } } } AddonContext::~AddonContext() { @@ -620,6 +625,19 @@ Napi::Value AddonContext::ShiftSequenceTokenCells(const Napi::CallbackInfo& info return info.Env().Undefined(); } +Napi::Value AddonContext::GetSequenceKvCacheMinPosition(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + int32_t sequenceId = info[0].As().Int32Value(); + + + const auto minPosition = llama_kv_self_seq_pos_min(ctx, sequenceId); + + return Napi::Number::New(info.Env(), minPosition); +} Napi::Value AddonContext::DecodeBatch(const Napi::CallbackInfo& info) { AddonContextDecodeBatchWorker* worker = new AddonContextDecodeBatchWorker(info.Env(), this); worker->Queue(); @@ -926,6 +944,7 @@ void AddonContext::init(Napi::Object exports) { InstanceMethod("disposeSequence", &AddonContext::DisposeSequence), InstanceMethod("removeTokenCellsFromSequence", &AddonContext::RemoveTokenCellsFromSequence), InstanceMethod("shiftSequenceTokenCells", &AddonContext::ShiftSequenceTokenCells), + InstanceMethod("getSequenceKvCacheMinPosition", &AddonContext::GetSequenceKvCacheMinPosition), InstanceMethod("decodeBatch", &AddonContext::DecodeBatch), InstanceMethod("sampleToken", &AddonContext::SampleToken), InstanceMethod("getEmbedding", &AddonContext::GetEmbedding), diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h index 933ba8f0..0edbedc7 100644 --- a/llama/addon/AddonContext.h +++ b/llama/addon/AddonContext.h @@ -36,6 +36,7 @@ class AddonContext : public Napi::ObjectWrap { Napi::Value DisposeSequence(const Napi::CallbackInfo& info); Napi::Value RemoveTokenCellsFromSequence(const Napi::CallbackInfo& info); Napi::Value ShiftSequenceTokenCells(const Napi::CallbackInfo& info); + Napi::Value GetSequenceKvCacheMinPosition(const Napi::CallbackInfo& info); Napi::Value DecodeBatch(const 
Napi::CallbackInfo& info); Napi::Value SampleToken(const Napi::CallbackInfo& info); diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index 943866c0..eef81c25 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -73,6 +73,19 @@ Napi::Value addonGetTypeSizeForGgmlType(const Napi::CallbackInfo& info) { return Napi::Number::New(info.Env(), typeSize); } +Napi::Value addonGetGgmlGraphOverheadCustom(const Napi::CallbackInfo& info) { + if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsBoolean()) { + return Napi::Number::New(info.Env(), 0); + } + + const size_t size = info[0].As().Uint32Value(); + const bool grads = info[1].As().Value(); + + const auto graphOverhead = ggml_graph_overhead_custom(size, grads); + + return Napi::Number::New(info.Env(), graphOverhead); +} + Napi::Value addonGetConsts(const Napi::CallbackInfo& info) { Napi::Object consts = Napi::Object::New(info.Env()); consts.Set("ggmlMaxDims", Napi::Number::New(info.Env(), GGML_MAX_DIMS)); @@ -231,6 +244,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { Napi::PropertyDescriptor::Function("getMathCores", addonGetMathCores), Napi::PropertyDescriptor::Function("getBlockSizeForGgmlType", addonGetBlockSizeForGgmlType), Napi::PropertyDescriptor::Function("getTypeSizeForGgmlType", addonGetTypeSizeForGgmlType), + Napi::PropertyDescriptor::Function("getGgmlGraphOverheadCustom", addonGetGgmlGraphOverheadCustom), Napi::PropertyDescriptor::Function("getConsts", addonGetConsts), Napi::PropertyDescriptor::Function("setLogger", setLogger), Napi::PropertyDescriptor::Function("setLoggerLogLevel", setLoggerLogLevel), diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index b1f3ca0b..e74a1132 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -28,7 +28,8 @@ export type BindingModule = { embeddings?: boolean, ranking?: boolean, threads?: number, - performanceTracking?: boolean + performanceTracking?: boolean, + swaFullCache?: boolean }): AddonContext }, AddonGrammar: { @@ -54,6 +55,7 @@ export type BindingModule = { getMathCores(): number, getBlockSizeForGgmlType(ggmlType: number): number | undefined, getTypeSizeForGgmlType(ggmlType: number): number | undefined, + getGgmlGraphOverheadCustom(size: number, grads: boolean): number, getConsts(): { ggmlMaxDims: number, ggmlTypeF16Size: number, @@ -143,6 +145,7 @@ export type AddonContext = { // startPos in inclusive, endPos is exclusive shiftSequenceTokenCells(sequenceId: number, startPos: number, endPos: number, shiftDelta: number): void, + getSequenceKvCacheMinPosition(sequenceId: number): number, getEmbedding(inputTokensLength: number, maxVectorSize?: number): Float64Array, getStateSize(): number, getThreads(): number, diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 79a71c65..26e85cdd 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -45,6 +45,7 @@ type ChatCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], jsonSchemaGrammarFile?: string, @@ -162,6 +163,12 @@ export const ChatCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("noTrimWhitespace", { type: "boolean", alias: ["noTrim"], @@ -308,7 +315,7 @@ export const 
ChatCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, - promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, + promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, @@ -317,7 +324,8 @@ export const ChatCommand: CommandModule = { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, + batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, + temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -333,7 +341,8 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, flashAttention, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, + contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, + jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -363,11 +372,13 @@ async function RunChat({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -413,6 +424,7 @@ async function RunChat({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -446,6 +458,7 @@ async function RunChat({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index f8c7790e..1aae93fd 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -32,6 +32,7 @@ type CompleteCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, threads?: number, temperature: number, minP: number, @@ -119,6 +120,12 @@ export const CompleteCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -235,14 +242,14 @@ export const CompleteCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - flashAttention, threads, temperature, minP, topK, + flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }) { try { await RunCompletion({ - modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -257,7 +264,7 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings @@ -286,11 +293,13 @@ async function RunCompletion({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -329,6 +338,7 @@ async function RunCompletion({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -362,6 +372,7 @@ async function RunCompletion({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 7a4a536b..a47df068 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -34,6 +34,7 @@ type InfillCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, threads?: number, temperature: number, minP: number, @@ -129,6 +130,12 @@ export const InfillCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -245,7 +252,7 @@ export const InfillCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - flashAttention, threads, temperature, minP, topK, + flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -253,7 +260,7 @@ export const InfillCommand: CommandModule = { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, + swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }); @@ -268,7 +275,7 @@ export const InfillCommand: CommandModule = { async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - threads, temperature, minP, topK, topP, seed, gpuLayers, + swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings }: InfillCommand) { @@ -296,11 +303,13 @@ async function RunInfill({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -353,6 +362,7 @@ async function RunInfill({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -386,6 +396,7 @@ async function RunInfill({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index db34de6d..ffd5f65e 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -32,7 +32,8 @@ type InspectEstimateCommand = { gpuLayers?: number | "max", contextSize?: number | "train", embedding?: boolean, - noMmap?: boolean + noMmap?: boolean, + swaFullCache?: boolean }; export const InspectEstimateCommand: CommandModule = { @@ -115,10 +116,16 @@ export const InspectEstimateCommand: CommandModule default: false, description: "Enable flash attention for the context" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("measures", { alias: "n", type: "number", @@ -140,8 +147,8 @@ export const InspectMeasureCommand: CommandModule }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, - memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText + modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, + measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; if (maxContextSize === -1) maxContextSize = undefined; @@ -162,7 +169,7 @@ export const InspectMeasureCommand: CommandModule const useMmap = !noMmap && llama.supportsMmap; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, { - flashAttention, useMmap + flashAttention, swaFullCache, useMmap }); console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`); @@ -216,6 +223,7 @@ export const InspectMeasureCommand: CommandModule maxContextSize, minContextSize, flashAttention, + swaFullCache, tests: measures, evaluateText: evaluateText == null ? 
undefined @@ -286,7 +294,8 @@ export const InspectMeasureCommand: CommandModule : ggufInsights.estimateContextResourceRequirements({ contextSize: previousContextSizeCheck, modelGpuLayers: lastGpuLayers, - flashAttention + flashAttention, + swaFullCache }); const contextVramEstimation = contextResourceEstimation?.gpuVram; @@ -496,7 +505,7 @@ const expectedFileName = "InspectMeasureCommand"; async function measureModel({ modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, - evaluateText, exitAfterMeasurement = false, onInfo + swaFullCache, evaluateText, exitAfterMeasurement = false, onInfo }: { modelPath: string, useMmap?: boolean, @@ -508,6 +517,7 @@ async function measureModel({ maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + swaFullCache?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean, onInfo(data: { @@ -615,6 +625,7 @@ async function measureModel({ maxGpuLayers, minGpuLayers, flashAttention, + swaFullCache, evaluateText, exitAfterMeasurement } satisfies ParentToChildMessage); @@ -716,11 +727,12 @@ async function runTestWorkerLogic() { } async function testContextSizes({ - model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText, - exitAfterMeasurement = false + model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, swaFullCache, + evaluateText, exitAfterMeasurement = false }: { model: LlamaModel, modelVramUsage: number, modelRamUsage: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, tests: number, flashAttention?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean + minContextSize?: number, tests: number, flashAttention?: boolean, swaFullCache?: boolean, evaluateText?: string, + exitAfterMeasurement?: boolean }) { let measurementsDone: number = 0; const contextSizeCheckPlan = getContextSizesCheckPlan( @@ -750,6 +762,7 @@ async function runTestWorkerLogic() { ), ignoreMemorySafetyChecks: currentContextSizeCheck != null, flashAttention, + swaFullCache, failedCreationRemedy: false }); @@ -803,11 +816,11 @@ async function runTestWorkerLogic() { } async function testWithGpuLayers({ - modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText, + modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, evaluateText, exitAfterMeasurement = false }: { modelPath: string, useMmap?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, flashAttention?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean + minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean }) { try { const preModelVramUsage = (await llama.getVramState()).used; @@ -817,6 +830,7 @@ async function runTestWorkerLogic() { useMmap, gpuLayers, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: true }); const postModelVramUsage = (await llama.getVramState()).used; @@ -839,6 +853,7 @@ async function runTestWorkerLogic() { maxContextSize, minContextSize, flashAttention, + swaFullCache, tests, evaluateText, exitAfterMeasurement @@ -887,6 +902,7 @@ async function runTestWorkerLogic() { maxContextSize: 
message.maxContextSize, minContextSize: message.minContextSize, flashAttention: message.flashAttention, + swaFullCache: message.swaFullCache, evaluateText: message.evaluateText, exitAfterMeasurement: message.exitAfterMeasurement }); @@ -976,6 +992,7 @@ type ParentToChildMessage = { maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + swaFullCache?: boolean, initialMaxContextSize?: number, maxContextSize?: number, minContextSize?: number, diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index 7ceb9773..a896a5ce 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -60,6 +60,7 @@ export async function interactivelyAskForModel({ allowLocalModels = true, downloadIntent = true, flashAttention = false, + swaFullCache = false, useMmap }: { llama: Llama, @@ -67,6 +68,7 @@ export async function interactivelyAskForModel({ allowLocalModels?: boolean, downloadIntent?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, useMmap?: boolean }): Promise { let localModelFileOptions: (ModelOption & {type: "localModel"})[] = []; @@ -120,6 +122,7 @@ export async function interactivelyAskForModel({ const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, + swaFullCache: swaFullCache, useMmap }); @@ -292,7 +295,9 @@ export async function interactivelyAskForModel({ }, items: options, renderItem(item, focused, rerender) { - return renderSelectionItem(item, focused, rerender, activeInteractionController.signal, llama, flashAttention, useMmap); + return renderSelectionItem( + item, focused, rerender, activeInteractionController.signal, llama, flashAttention, swaFullCache, useMmap + ); }, canFocusItem(item) { return item.type === "recommendedModel" || item.type === "localModel" || item.type === "action"; @@ -408,7 +413,7 @@ async function askForModelUriOrPath(allowLocalModels: boolean): Promise void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean, - useMmap?: boolean + swaFullCache: boolean, useMmap?: boolean ) { if (item.type === "localModel") { let modelText = item.title instanceof Function @@ -435,6 +440,7 @@ function renderSelectionItem( rerenderOption: rerender, llama, flashAttention, + swaFullCache, useMmap }); } @@ -557,13 +563,14 @@ function renderRecommendedModelTechnicalInfo( } async function selectFileForModelRecommendation({ - recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, useMmap + recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, swaFullCache, useMmap }: { recommendedModelOption: ModelOption & {type: "recommendedModel"}, llama: Llama, abortSignal: AbortSignal, rerenderOption(): void, flashAttention: boolean, + swaFullCache: boolean, useMmap?: boolean }) { try { @@ -586,6 +593,7 @@ async function selectFileForModelRecommendation({ const compatibilityScore = await ggufInsights.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention, + swaFullCache, useMmap }); diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts index 7b04b0ce..219d1808 100644 --- a/src/cli/utils/resolveCommandGgufPath.ts +++ b/src/cli/utils/resolveCommandGgufPath.ts @@ -13,9 +13,9 @@ import {getReadablePath} from "./getReadablePath.js"; import {interactivelyAskForModel} from "./interactivelyAskForModel.js"; export async function 
resolveCommandGgufPath(ggufPath: string | undefined, llama: Llama, fetchHeaders?: Record, { - targetDirectory = cliModelsDirectory, flashAttention = false, useMmap, consoleTitle = "File" + targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File" }: { - targetDirectory?: string, flashAttention?: boolean, useMmap?: boolean, consoleTitle?: string + targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string } = {}) { if (ggufPath == null) ggufPath = await interactivelyAskForModel({ @@ -24,6 +24,7 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama allowLocalModels: true, downloadIntent: true, flashAttention, + swaFullCache, useMmap }); diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index e5797a4f..f1c263be 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -53,6 +53,7 @@ export class LlamaContext { /** @internal */ private readonly _totalSequences: number; /** @internal */ private readonly _unusedSequenceIds: number[] = []; /** @internal */ private readonly _batchingOptions: Required; + /** @internal */ private readonly _swaFullCache: boolean = false; /** @internal */ private readonly _queuedDecodeSequenceIds = new Set(); /** @internal */ private readonly _queuedDecodes: InternalQueuedDecode[] = []; /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); @@ -84,6 +85,7 @@ export class LlamaContext { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, + swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, _embeddings, _ranking @@ -120,6 +122,7 @@ export class LlamaContext { : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1) ); this._performanceTracking = !!performanceTracking; + this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells batchSize: this._batchSize, @@ -128,7 +131,8 @@ export class LlamaContext { threads: this._idealThreads, embeddings: _embeddings, ranking: _ranking, - performanceTracking: this._performanceTracking + performanceTracking: this._performanceTracking, + swaFullCache: this._swaFullCache })); this._batchingOptions = { dispatchSchedule: batchingDispatchSchedule, @@ -783,6 +787,7 @@ export class LlamaContext { const flashAttention = _model.flashAttentionSupported ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) : false; + const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache; const loraOptions = typeof options.lora === "string" ? 
{adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"] : options.lora satisfies LlamaContextOptions["lora"]; @@ -799,6 +804,7 @@ export class LlamaContext { modelGpuLayers: _model.gpuLayers, modelTrainContextSize: _model.trainContextSize, flashAttention, + swaFullCache, getVramState: () => _model._llama._vramOrchestrator.getMemoryState(), llamaGpu: _model._llama.gpu, ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks, @@ -821,10 +827,11 @@ export class LlamaContext { isEmbeddingContext: options._embeddings, modelGpuLayers: _model.gpuLayers, batchSize, - flashAttention + flashAttention, + swaFullCache }); - const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention}); + const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention, swaFullCache}); const contextCreationVramReservation = options.ignoreMemorySafetyChecks ? null : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); @@ -1035,6 +1042,29 @@ export class LlamaContextSequence { return this._tokenPredictor; } + /** + * Get the index of the first token in the KV cache. + * + * If you remove any tokens from the state that come before this index, + * no cached prefix tokens evaluation state will be used for the next evaluation. + * + * For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}` + * then the cached state for range `0-10` will be used in the next evaluation, + * but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all + * and will be re-evaluated in the next evaluation. + * + * This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models). + * + * When SWA is used, this index will usually be `Math.max(0, .nextTokenIndex - .model.fileInsights.swaSize)` or larger. + * + * You can disable SWA by setting the `swaFullCache` option to `true` when creating a context. + */ + public get stateCellsStartIndex() { + this._ensureNotDisposed(); + + return this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId); + } + /** * Statistics of token predictions using the sequence's `tokenPredictor`. * @@ -1218,6 +1248,13 @@ export class LlamaContextSequence { return ranges; }, [] as ContextTokensDeleteRange[]); + const minKvCachePosition = (this._contextTokens.length === 0 && this._loadedTokenPredictions.length === 0) + ? 0 + : this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId); + if (resolvedRanges[0] != null && resolvedRanges[0].start <= minKvCachePosition) + // we have to drop the cache and reevaluate the sequence due to missing KV cache + deletionSuccessful = false; + const tokenPredictionsToRemove = (resolvedRanges.length > 0 && canRemovePredictionTokens) ? this._loadedTokenPredictions.length : 0; @@ -1578,12 +1615,13 @@ export class LlamaContextSequence { } } + /* eslint-disable @stylistic/max-len */ /** * Save the current context sequence evaluation state to a file. 
- * @see [Saving and restoring a context sequence evaluation state - * ](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) + * @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) */ public async saveStateToFile(filePath: string) { + /* eslint-enable @stylistic/max-len */ this._ensureNotDisposed(); const resolvedPath = path.resolve(process.cwd(), filePath); @@ -1606,14 +1644,14 @@ export class LlamaContextSequence { } } + /* eslint-disable @stylistic/max-len */ /** * Load a context sequence evaluation state from a file. * * Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error. * * You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process. - * @see [Saving and restoring a context sequence evaluation state - * ](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) + * @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) */ public async loadStateFromFile(filePath: string, acceptRisk: { /** @@ -1623,6 +1661,7 @@ export class LlamaContextSequence { */ acceptRisk: true }) { + /* eslint-enable @stylistic/max-len */ if (!acceptRisk.acceptRisk) throw new Error("The `acceptRisk` option must be set to `true` to use this feature"); diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 16d17bce..52a18bf9 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -99,6 +99,22 @@ export type LlamaContextOptions = { */ batching?: BatchingOptions, + /** + * When using SWA (Sliding Window Attention) on a supported model, + * extend the sliding window size to the current context size (meaning practically disabling SWA). + * + * Enabling this option will consume more memory on models that support SWA (Sliding Window Attention), + * but will allow reusing the evaluation cache of any prefix length of the context sequence state + * (instead of just the size of the sliding window when SWA is used). + * + * This option has no effect on models that do not support SWA (Sliding Window Attention). + * + * > **Note:** you can check the SWA size using `model.fileInsights.swaSize`. + * + * Defaults to `false` (inherited from the model option `defaultContextSwaFullCache`); + */ + swaFullCache?: boolean, + /** * Load the provided LoRA adapters onto the context. * LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 0be0bddc..f53ab21a 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -111,6 +111,17 @@ export type LlamaModelOptions = { */ defaultContextFlashAttention?: boolean, + /** + * When using SWA (Sliding Window Attention) on a supported model, + * extend the sliding window size to the current context size (meaning practically disabling SWA) + * by default for contexts created with this model. + * + * See the `swaFullCache` option of the `.createContext()` method for more information. + * + * Defaults to `false`. 
+ */ + defaultContextSwaFullCache?: boolean, + /** * Called with the load percentage when the model is being loaded. * @param loadProgress - a number between 0 (exclusive) and 1 (inclusive). @@ -140,6 +151,7 @@ export type LlamaModelOptions = { const defaultUseMmap = true; const defaultContextFlashAttentionEnabled = false; +const defaultContextSwaFullCache = false; export class LlamaModel { /** @internal */ public readonly _llama: Llama; @@ -157,6 +169,7 @@ export class LlamaModel { /** @internal */ private readonly _llamaPreventDisposalHandle: DisposalPreventionHandle; /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; /** @internal */ private readonly _defaultContextFlashAttention: boolean; + /** @internal */ private readonly _defaultContextSwaFullCache: boolean; /** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private readonly _loraAdapters = new Map(); /** @internal */ private _typeDescription?: ModelTypeDescription; @@ -177,6 +190,7 @@ export class LlamaModel { _fileInsights, _defaultContextFlashAttentionOptionEnabled, _defaultContextFlashAttention, + _defaultContextSwaFullCache, _flashAttentionSupported }: { _llama: Llama, @@ -184,6 +198,7 @@ export class LlamaModel { _fileInsights: GgufInsights, _defaultContextFlashAttentionOptionEnabled: boolean, _defaultContextFlashAttention: boolean, + _defaultContextSwaFullCache: boolean, _flashAttentionSupported: boolean }) { this._llama = _llama; @@ -196,6 +211,7 @@ export class LlamaModel { this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle(); this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled; this._defaultContextFlashAttention = _defaultContextFlashAttention; + this._defaultContextSwaFullCache = _defaultContextSwaFullCache; this._flashAttentionSupported = _flashAttentionSupported; const overridesList = ggufMetadataOverridesToList(metadataOverrides); this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({ @@ -321,6 +337,10 @@ export class LlamaModel { return this._defaultContextFlashAttention; } + public get defaultContextSwaFullCache() { + return this._defaultContextSwaFullCache; + } + /** * Transform text into tokens that can be fed to the model * @param text - the text to tokenize @@ -700,9 +720,11 @@ export class LlamaModel { const resolvedDefaultContextFlashAttention = flashAttentionSupported ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) : false; + const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention, + defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, useMmap }); const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({ @@ -716,7 +738,8 @@ export class LlamaModel { _llama, _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? 
false, _flashAttentionSupported: flashAttentionSupported, - _defaultContextFlashAttention: resolvedDefaultContextFlashAttention + _defaultContextFlashAttention: resolvedDefaultContextFlashAttention, + _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache }); const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks ? null diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 8b0f85e9..eb8330a8 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -15,7 +15,7 @@ export type GgufInsightsResourceRequirements = { export class GgufInsights { /** @internal */ public readonly _llama: Llama; /** @internal */ private readonly _modelSize: number; - /** @internal */ private _totalLayers: number | null = null; + /** @internal */ private _totalFileLayers: number | null = null; /** @internal */ private readonly _ggufFileInfo: GgufFileInfo; /** @internal */ private readonly _configurationResolver: GgufInsightsConfigurationResolver; @@ -71,13 +71,8 @@ export class GgufInsights { } public get totalLayers() { - if (this._totalLayers != null) - return this._totalLayers; - const outputLayers = 1; - this._totalLayers = this._getFileLayers() + outputLayers; - - return this._totalLayers; + return this._getTotalFileLayers() + outputLayers; } public get modelSize() { @@ -133,6 +128,23 @@ export class GgufInsights { return false; } + /** + * The size of the SWA (Sliding Window Attention). + * + * When `undefined`, the model does not use sliding window attention. + */ + public get swaSize() { + const slidingWindow = this._ggufFileInfo?.architectureMetadata?.attention?.sliding_window; + if (slidingWindow == null || slidingWindow <= 0) + return undefined; + + const trainContextSize = this.trainContextSize; + if (trainContextSize != null && slidingWindow >= trainContextSize) + return undefined; + + return slidingWindow; + } + public estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }: { @@ -152,72 +164,72 @@ export class GgufInsights { * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now. */ public estimateContextResourceRequirements({ - contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, + swaFullCache = false }: { contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, - flashAttention?: boolean, includeGraphOverhead?: boolean + flashAttention?: boolean, includeGraphOverhead?: boolean, swaFullCache?: boolean }): GgufInsightsResourceRequirements { if (sequences == null) sequences = getDefaultContextSequences(); if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); - const actualContextSize = contextSize * sequences; - - const totalLayers = this.totalLayers; - const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalLayers, totalLayers)); - const finalCpuLayers = totalLayers - finalGpuLayers; const llmData = this._ggufFileInfo.architectureMetadata; + const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; + const slidingWindow = this.swaSize ?? 
0; + const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize && + (this.trainContextSize == null || slidingWindow < this.trainContextSize); + const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture); + const nonSwaPercent = swaPattern <= 1 + ? 1 + : (1 / (swaPattern + (flashAttention ? -0.5 : -1))); + + // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp` + const kvCachePadding = flashAttention + ? 256 + : 32; + const actualContextSize = sequences * contextSize; + const kvSize = usingSWA + ? ( + (1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) + + nonSwaPercent * actualContextSize + ) + : actualContextSize; + + const totalFileLayers = this._getTotalFileLayers(); + const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalFileLayers, totalFileLayers)); + const finalCpuLayers = totalFileLayers - finalGpuLayers; + const usingGpu = finalGpuLayers !== 0; const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0; - const logitsSize = vocabularySize * batchSize; - const embedSize = isEmbeddingContext - ? (llmData.embedding_length ?? 0) * batchSize - : 0; + const embeddingSize = llmData.embedding_length ?? 0; const sizeTBytes = 8; // sizeof(size_t) const floatBytes = 4; // sizeof(float) const uint32TBytes = 4; // sizeof(uint32_t) const int32TBytes = 4; // sizeof(int32_t) - // source: `llama_state_get_size` in `llama.cpp` - const sRngSize = sizeTBytes; - const sRng = 64 * 1024; // LLAMA_MAX_RNG_STATE - const sNOutputs = sizeTBytes; - const sNOutputPos = batchSize * int32TBytes; - const sLogitsSize = sizeTBytes; - const sLogits = logitsSize * floatBytes; - const sEmbeddingSize = sizeTBytes; - const sEmbedding = embedSize * floatBytes; - const sKvBufSize = sizeTBytes; - const sKvHead = uint32TBytes; - const sKvSize = uint32TBytes; - const sKvUsed = uint32TBytes; - const sKv = 2 * int32TBytes * modelGpuLayers * this._llama._consts.ggmlTensorOverhead; - const sKvCell = this._llama._consts.llamaPosSize + sizeTBytes + this._llama._consts.llamaSeqIdSize; - const kvSelfLength = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba - ? Math.max(1, sequences) - : actualContextSize; - const sKvCells = kvSelfLength * sKvCell; - - const overheadMemory = ( - sRngSize + - sRng + - sNOutputs + - sNOutputPos + - sLogitsSize + - sLogits + - sEmbeddingSize + - sEmbedding + - sKvBufSize + - sKvHead + - sKvSize + - sKvUsed + - sKv + - sKvCells - ); + const estimateOutput = (nOutputs: number) => { + // source: `llama_context::output_reserve` in `llama-context.cpp` + const nOutputsMax = Math.max(batchSize, nOutputs); + + const isT5 = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.t5; + const hasLogits = isT5 || !isEmbeddingContext; + const hasEmbd = isT5 || isEmbeddingContext; + + const logitsSize = hasLogits + ? (vocabularySize * nOutputsMax) + : 0; + const embdSize = hasEmbd + ? (embeddingSize * nOutputsMax) + : 0; + const outputBufferSize = (logitsSize + embdSize) * floatBytes; + + const outputIdsArr = int32TBytes * batchSize; + + return outputBufferSize + outputIdsArr; + }; - // Estimates the memory allocated by `ggml_backend_sched_reserve` in `llama_new_context_with_model` in `llama.cpp`. 
- // If you read this line and have better insights on how to estimate this memory, please open a PR to improve it :) - const estimateGraphOverheadMemory = () => { + const estimateGraphOverheadMemory = (): number => { const s1MB = Math.pow(1024, 2); const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; @@ -234,23 +246,23 @@ export class GgufInsights { if (expertCount > 0) { const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2; - return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (actualContextSize * headCount)); + return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount)); } - return int32TBytes * batchSize * (embeddingLength + (actualContextSize * headCount)); + return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount)); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) { if (modelGpuLayers === this.totalLayers) { defaultCalculationAdjustment -= (s1MB * 340) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ); } else { defaultCalculationAdjustment -= (s1MB * 250) + ( (s1MB * 50) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ) ); } @@ -263,7 +275,7 @@ export class GgufInsights { (s1MB * 270) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ) ); } else { @@ -271,21 +283,21 @@ export class GgufInsights { (s1MB * 150) * ( this.trainContextSize == null ? 1 - : Math.max(0, (1 - (actualContextSize / this.trainContextSize))) + : Math.max(0, (1 - (kvSize / this.trainContextSize))) ) ); } } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) { const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0; - return (int32TBytes * batchSize * actualContextSize * headCount) - (50 * s1MB); + return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB); // if (modelGpuLayers === this.totalLayers) { // defaultCalculationAdjustment += -(s1MB * 20) + ( // (s1MB * 250) * ( // this.trainContextSize == null // ? 1 - // : actualContextSize / this.trainContextSize + // : kvSize / this.trainContextSize // ) // ); // } else { @@ -293,7 +305,7 @@ export class GgufInsights { // (s1MB * 300) * ( // this.trainContextSize == null // ? 1 - // : actualContextSize / this.trainContextSize + // : kvSize / this.trainContextSize // ) // ); // } @@ -312,37 +324,49 @@ export class GgufInsights { if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) { // magic numbers for estimation. will be improved in the future - return (totalElements * 123 * (actualContextSize / 4096)) + defaultCalculationAdjustment; + return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment; } // magic numbers for estimation. will be improved in the future - return (totalElements * 77.655 * (actualContextSize / 4096)) + defaultCalculationAdjustment; + return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment; }; + const gpuKVCacheSize = usingGpu + ? this._estimateKvMemorySizeInBytes( + kvSize, + finalGpuLayers < totalFileLayers + ? 
(finalGpuLayers + 1) + : finalGpuLayers + ) + : 0; + const cpuKVCacheSize = this._estimateKvMemorySizeInBytes(kvSize, finalCpuLayers); + + // source: `llama_context::graph_max_nodes` in `llama-context.cpp` + const maxNodes = Math.max(65536, 5 * tensorInfo.length); + const cpuNodes = 5 * (tensorInfo.length * (finalCpuLayers / totalFileLayers)); + const gpuNodes = maxNodes - cpuNodes; + + const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) + + this._llama._bindings.getGgmlGraphOverheadCustom(gpuNodes, false); + const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) + + this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false); + const graphOverheadMemory = (flashAttention || !includeGraphOverhead) ? 0 : estimateGraphOverheadMemory(); + const graphOverheadGpuSize = usingGpu + ? Math.round(graphOverheadMemory * (finalGpuLayers / totalFileLayers)) + : 0; + const graphOverheadCpuSize = graphOverheadMemory - graphOverheadGpuSize; - const usingGpu = finalGpuLayers !== 0; + const outputBufferSize = estimateOutput(sequences); - const cpuRam = ( - !usingGpu - ? (overheadMemory + graphOverheadMemory) + const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + ( + usingGpu + ? outputBufferSize : 0 - ) + - this._estimateKvMemorySizeInBytes(actualContextSize, finalCpuLayers); - const gpuVram = usingGpu - ? ( - overheadMemory + - graphOverheadMemory + - this._estimateKvMemorySizeInBytes( - actualContextSize, - finalGpuLayers < totalLayers - ? (finalGpuLayers + 1) - : finalGpuLayers - ) - ) - : 0; + ); + const cpuRam = cpuKVCacheSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize; return { cpuRam, @@ -449,7 +473,7 @@ export class GgufInsights { } /** @internal */ - public _estimateKvMemorySizeInBytes(contextSize: number, layers: number) { + public _estimateKvMemorySizeInBytes(kvSize: number, layers: number) { // source: `llama_kv_cache_init` in `llama.cpp` const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0; const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0; @@ -483,8 +507,8 @@ export class GgufInsights { const totalNEmbdKGqa = nEmbdKGqa + modelNEmbdKS; const totalNEmbdVGqa = nEmbdVGqa + modelNEmbdVS; - totalElementsK += totalNEmbdKGqa * contextSize; - totalElementsV += totalNEmbdVGqa * contextSize; + totalElementsK += totalNEmbdKGqa * kvSize; + totalElementsV += totalNEmbdVGqa * kvSize; } const keyTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba @@ -504,6 +528,16 @@ export class GgufInsights { ); } + /** @internal */ + private _getTotalFileLayers() { + if (this._totalFileLayers != null) + return this._totalFileLayers; + + this._totalFileLayers = this._getFileLayers(); + + return this._totalFileLayers; + } + /** * @param ggufFileInfo * @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance. 
@@ -718,3 +752,25 @@ function isTokenEmbedLayer(layerName: string) { return firstPart === "token_embd"; } + +function ggmlPad(value: number, padding: number): number { + return ((value + padding - 1) & ~(padding - 1)); +} + +function getSwaPatternForArchitecture(architecture?: GgufArchitectureType): number { + // source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern` + switch (architecture) { + case GgufArchitectureType.llama4: + return 4; + case GgufArchitectureType.phi3: + return 1; + case GgufArchitectureType.gemma2: + return 2; + case GgufArchitectureType.gemma3: + return 6; + case GgufArchitectureType.cohere2: + return 4; + } + + return 1; +} diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index 05595c98..cbae45d5 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -39,12 +39,14 @@ export class GgufInsightsConfigurationResolver { targetContextSize, embeddingContext = false, flashAttention = false, + swaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { targetGpuLayers?: number | "max", targetContextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, useMmap?: boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), @@ -63,6 +65,7 @@ export class GgufInsightsConfigurationResolver { } = {}) { const compatibilityScore = await this.scoreModelConfigurationCompatibility({ flashAttention, + swaFullCache, contextSize: targetContextSize, embeddingContext, forceGpuLayers: targetGpuLayers, @@ -105,6 +108,7 @@ export class GgufInsightsConfigurationResolver { contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), embeddingContext = false, flashAttention = false, + swaFullCache = false, maximumFittedContextSizeMultiplier = 100, maximumUnfitConfigurationResourceMultiplier = 100, forceStrictContextSize = false, @@ -114,6 +118,7 @@ export class GgufInsightsConfigurationResolver { contextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, maximumFittedContextSizeMultiplier?: number, maximumUnfitConfigurationResourceMultiplier?: number, @@ -209,6 +214,7 @@ export class GgufInsightsConfigurationResolver { llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: forceGpuLayers != null, useMmap } @@ -263,7 +269,8 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers: resolvedGpuLayers, modelTrainContextSize: this._ggufInsights.trainContextSize ?? 
defaultTrainContextSizeForEstimationPurposes, ignoreMemorySafetyChecks: forceStrictContextSize, - flashAttention + flashAttention, + swaFullCache }); contextFitsMemory = true; } catch (err) { @@ -275,7 +282,8 @@ export class GgufInsightsConfigurationResolver { contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, modelGpuLayers: resolvedGpuLayers, - flashAttention + flashAttention, + swaFullCache }); const rankPoints = { @@ -371,11 +379,12 @@ export class GgufInsightsConfigurationResolver { llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, defaultContextFlashAttention = false, + defaultContextSwaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean, - useMmap?: boolean + defaultContextSwaFullCache?: boolean, useMmap?: boolean } = {}) { return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, @@ -385,6 +394,7 @@ export class GgufInsightsConfigurationResolver { llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); } @@ -399,6 +409,7 @@ export class GgufInsightsConfigurationResolver { batchSize, modelTrainContextSize, flashAttention = false, + swaFullCache = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), @@ -410,6 +421,7 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers: number, modelTrainContextSize: number, flashAttention?: boolean, + swaFullCache?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, @@ -427,6 +439,7 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers, modelTrainContextSize, flashAttention, + swaFullCache, getVramState, getRamState, getSwapState, diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index f800f712..49ace603 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -9,7 +9,7 @@ import {getRamUsageFromUnifiedVram} from "./getRamUsageFromUnifiedVram.js"; const defaultMaxContextSizeSwapUse = 2048; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, swaFullCache, getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, maxContextSizeSwapUse = defaultMaxContextSizeSwapUse }: { @@ -20,6 +20,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: number, modelTrainContextSize: number, flashAttention: boolean, + swaFullCache: boolean, getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState(): Promise<{total: number, free: number}>, getSwapState(): Promise<{total: number, free: 
number}>, @@ -52,6 +53,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); @@ -97,6 +99,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); @@ -145,6 +148,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 1edae352..62d58141 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -11,11 +11,11 @@ const fitContextExtraMemoryPaddingPercentage = 0.5; export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, useMmap + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap }: { ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, useMmap?: boolean + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean }): Promise { if (gpuLayers == null) gpuLayers = "auto"; @@ -37,6 +37,7 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ggufInsights, currentVram: vramState.free, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -73,6 +74,7 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ? 
gpuLayers.max : undefined, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -95,6 +97,7 @@ function getBestGpuLayersForFreeVram({ minGpuLayers, maxGpuLayers, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }: { ggufInsights: GgufInsights, @@ -103,6 +106,7 @@ function getBestGpuLayersForFreeVram({ minGpuLayers?: number, maxGpuLayers?: number, defaultContextFlashAttention: boolean, + defaultContextSwaFullCache: boolean, useMmap?: boolean }) { return findBestOption({ @@ -123,6 +127,7 @@ function getBestGpuLayersForFreeVram({ currentVram: freeVram, fitContext, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -182,10 +187,10 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer } function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, useMmap + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, defaultContextSwaFullCache = false, useMmap }: { gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, - defaultContextFlashAttention: boolean, useMmap?: boolean + defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean }) { const modelVram = ggufInsights.estimateModelResourceRequirements({ gpuLayers, @@ -202,7 +207,8 @@ function getVramRequiredForGpuLayers({ modelGpuLayers: gpuLayers, sequences: 1, isEmbeddingContext: fitContext.embeddingContext ?? false, - flashAttention: defaultContextFlashAttention + flashAttention: defaultContextFlashAttention, + swaFullCache: defaultContextSwaFullCache }).gpuVram; const totalVram = modelVram + contextVram; @@ -221,7 +227,8 @@ function getVramRequiredForGpuLayers({ ggufInsights, vram: currentVram - modelVram, isEmbeddingContext: fitContext?.embeddingContext ?? 
false, - flashAttention: defaultContextFlashAttention + flashAttention: defaultContextFlashAttention, + swaFullCache: defaultContextSwaFullCache }); if (maxContext == null || modelVram + maxContext.vram > currentVram) @@ -234,8 +241,8 @@ function getVramRequiredForGpuLayers({ }; } -function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention}: { - gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean +function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, swaFullCache}: { + gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, swaFullCache: boolean }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -250,7 +257,8 @@ function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmb modelGpuLayers: gpuLayers, sequences: 1, isEmbeddingContext, - flashAttention + flashAttention, + swaFullCache }).gpuVram; if (contextVram <= vram) diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 827493fc..c665b958 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -316,6 +316,7 @@ export type GgufMetadataDefaultArchitectureType = { readonly layer_norm_rms_epsilon?: number, readonly key_length?: number, readonly value_length?: number, + readonly sliding_window?: number, readonly causal?: boolean }, From 2e4877ad8880e96823d263210fc24f7b0c048666 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 02:12:13 +0300 Subject: [PATCH 02/14] fix: bugs and types --- src/cli/commands/DebugCommand.ts | 1 + src/cli/utils/interactivelyAskForModel.ts | 2 +- src/gguf/types/GgufMetadataTypes.ts | 4 ++-- src/gguf/types/GgufTensorInfoTypes.ts | 14 ++++++++++++-- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/cli/commands/DebugCommand.ts b/src/cli/commands/DebugCommand.ts index 149de90d..d2ee7117 100644 --- a/src/cli/commands/DebugCommand.ts +++ b/src/cli/commands/DebugCommand.ts @@ -65,5 +65,6 @@ async function DebugCmakeOptionsFunction() { console.info(); console.info(`${chalk.yellow("CMake options:")} ${prettyPrintObject(llama.cmakeOptions)}`); + console.info(`${chalk.yellow("Release:")} ${prettyPrintObject(llama.llamaCppRelease)}`); } diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index a896a5ce..8238daec 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -122,7 +122,7 @@ export async function interactivelyAskForModel({ const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, - swaFullCache: swaFullCache, + swaFullCache, useMmap }); diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index c665b958..5f8a48e1 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -135,8 +135,8 @@ export enum GgufFileType { MOSTLY_Q4_0_4_4 = 33, // deprecated MOSTLY_Q4_0_4_8 = 34, // deprecated MOSTLY_Q4_0_8_8 = 35, // deprecated - MOSTLY_TQ1_0 = 36, // deprecated - MOSTLY_TQ2_0 = 37 // deprecated + MOSTLY_TQ1_0 = 36, + MOSTLY_TQ2_0 = 37 } diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index 
28ae45c3..b23bf1f8 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -6,7 +6,7 @@ export type GgufTensorInfo = { /** * Adjusted offset relative to the file. - * + * * Added by the GGUF parser - not part of the file's metadata. */ readonly fileOffset: number | bigint, @@ -49,5 +49,15 @@ export const enum GgmlType { I16 = 25, I32 = 26, I64 = 27, - F64 = 28 + F64 = 28, + IQ1_M = 29, + BF16 = 30, + Q4_0_4_4 = 31, + Q4_0_4_8 = 32, + Q4_0_8_8 = 33, + TQ1_0 = 34, + TQ2_0 = 35, + IQ4_NL_4_4 = 36, + IQ4_NL_4_8 = 37, + IQ4_NL_8_8 = 38, } From 25f016df7dde0787cdd652687aae9ccad3b51d66 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 03:24:43 +0300 Subject: [PATCH 03/14] feat: thought budget, improve prompt completion --- docs/guide/chat-session.md | 55 ++++++ ...entSettingsFromTokenizerAndChatTemplate.ts | 3 +- src/cli/commands/ChatCommand.ts | 20 ++- src/evaluator/LlamaChat/LlamaChat.ts | 166 +++++++++++++++--- .../LlamaChatSession/LlamaChatSession.ts | 22 ++- .../llama3.2/promptCompletion.test.ts | 98 +++++++++++ .../qwen3-0.6b/thinkingBudget.test.ts | 95 ++++++++++ test/utils/modelFiles.ts | 3 +- 8 files changed, 431 insertions(+), 31 deletions(-) create mode 100644 test/modelDependent/llama3.2/promptCompletion.test.ts create mode 100644 test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts diff --git a/docs/guide/chat-session.md b/docs/guide/chat-session.md index 992a6487..ec9c8541 100644 --- a/docs/guide/chat-session.md +++ b/docs/guide/chat-session.md @@ -898,3 +898,58 @@ const fullResponse = a1.response console.log("Full response: " + fullResponse); ``` + +## Set Thinking Budget {#thinking-budget} +You can set a thinking budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments). 
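For example, setting the budget to `0` disables thought output entirely. Below is a minimal sketch of that case (same setup as the full walkthrough that follows, which also streams the response and counts thought tokens):

```typescript
import {getLlama, LlamaChatSession, resolveModelFile} from "node-llama-cpp";

const modelPath = await resolveModelFile("hf:Qwen/Qwen3-14B-GGUF:Q4_K_M");

const llama = await getLlama();
const model = await llama.loadModel({modelPath});
const context = await model.createContext();
const session = new LlamaChatSession({
    contextSequence: context.getSequence()
});

const answer = await session.prompt("Where do llamas come from?", {
    budgets: {
        // a budget of 0 means no tokens may be spent on thought segments
        thoughtTokens: 0
    }
});
console.log("AI: " + answer);
```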
+```typescript +import { + getLlama, LlamaChatSession, resolveModelFile, Token +} from "node-llama-cpp"; + +const modelPath = await resolveModelFile("hf:Qwen/Qwen3-14B-GGUF:Q4_K_M"); + +const llama = await getLlama(); +const model = await llama.loadModel({modelPath}); +const context = await model.createContext(); +const session = new LlamaChatSession({ + contextSequence: context.getSequence() +}); + + +const q1 = "Where do llamas come from?"; +console.log("User: " + q1); + +const maxThoughtTokens = 100; + +let responseTokens = 0; +let thoughtTokens = 0; + +process.stdout.write("AI: "); +const response = await session.prompt(q1, { + budgets: { + thoughtTokens: maxThoughtTokens + }, + onResponseChunk(chunk) { + const isThoughtSegment = chunk.type === "segment" && + chunk.segmentType === "thought"; + + if (chunk.type === "segment" && chunk.segmentStartTime != null) + process.stdout.write(` [segment start: ${chunk.segmentType}] `); + + process.stdout.write(chunk.text); + + if (chunk.type === "segment" && chunk.segmentEndTime != null) + process.stdout.write(` [segment end: ${chunk.segmentType}] `); + + if (isThoughtSegment) + thoughtTokens += chunk.tokens.length; + else + responseTokens += chunk.tokens.length; + } +}); + +console.log("Response: " + response); + +console.log("Response tokens: " + responseTokens); +console.log("Thought tokens: " + thoughtTokens); +``` diff --git a/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts b/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts index 57fc4ceb..30f434a0 100644 --- a/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts +++ b/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts @@ -41,7 +41,8 @@ export function extractSegmentSettingsFromTokenizerAndChatTemplate( return removeUndefinedFields({ thought: tryMatchPrefixSuffixPair([ ["", ""], // DeepSeek, QwQ - ["", ""] // EXAONE Deep + ["", ""], // EXAONE Deep + ["<|START_THINKING|>", "<|END_THINKING|>"] // Command R7B ]) }); } diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 26e85cdd..d1ebc8e1 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -62,6 +62,7 @@ type ChatCommand = { repeatFrequencyPenalty?: number, repeatPresencePenalty?: number, maxTokens: number, + thoughtBudget?: number, noHistory: boolean, environmentFunctions: boolean, tokenPredictionDraftModel?: string, @@ -262,6 +263,13 @@ export const ChatCommand: CommandModule = { default: 0, description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size" }) + .option("thoughtBudget", { + alias: ["tb", "thinkingBudget", "reasoningBudget"], + type: "number", + default: -1, + defaultDescription: "Unlimited", + description: "Maximum number of tokens the model can use for thoughts. 
Set to `0` to disable reasoning" + }) .option("noHistory", { alias: "nh", type: "boolean", @@ -318,7 +326,7 @@ export const ChatCommand: CommandModule = { promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }) { try { @@ -327,8 +335,8 @@ export const ChatCommand: CommandModule = { batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, - timing, noMmap, printTimings + maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, + debug, meter, timing, noMmap, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -344,11 +352,12 @@ async function RunChat({ contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; + if (thoughtBudget === -1) thoughtBudget = undefined; const headers = resolveHeaderFlag(headerArg); const trimWhitespace = !noTrimWhitespace; @@ -686,6 +695,9 @@ async function RunChat({ seed: seed ?? undefined, signal: abortController.signal, stopOnAbortSignal: true, + budgets: { + thoughtTokens: thoughtBudget + }, repeatPenalty: { penalty: repeatPenalty, frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined, diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index da15b1c0..77a171d9 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -294,7 +294,26 @@ export type LLamaChatGenerateResponseOptions void + onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void, + + /** + * Set the maximum number of tokens the model is allowed to spend on various segmented responses. + */ + budgets?: { + /** + * Whether to include the tokens already consumed by the current model response being completed in the budget. + * + * Defaults to `true`. + */ + includeCurrentResponse?: boolean, + + /** + * Budget for thought tokens. + * + * Defaults to `Infinity`. 
+ */ + thoughtTokens?: number + } } & ({ grammar?: LlamaGrammar, functions?: never, @@ -515,6 +534,7 @@ export class LlamaChat { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -552,6 +572,7 @@ export class LlamaChat { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal, maxTokens, @@ -595,6 +616,7 @@ export class LlamaChat { ); }; const loadContextWindowForFunctionCallingLoop = async () => loadContextWindow(true); + const loadContextWindowForBudgetTriggers = async () => loadContextWindow(false); while (true) { generateResponseState.startTokenLoop(); @@ -657,6 +679,15 @@ export class LlamaChat { if (maxTokensTriggerRes != null) return maxTokensTriggerRes; + if (generateResponseState.updateShouldContextShift()) + break; + + if (await generateResponseState.handleBudgetTriggers()) { + await loadContextWindowForBudgetTriggers(); + await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); + await generateResponseState.createNewEvaluationIterator(); + } + if (generateResponseState.updateShouldContextShift()) break; @@ -797,6 +828,17 @@ export class LlamaChat { StopGenerationDetector.resolveLlamaTextTrigger(userTextSuffix, this.model.tokenizer) ); + allSegmentTypes + .map((segmentType) => getChatWrapperSegmentDefinition(this._chatWrapper.settings, segmentType)) + .filter((segmentDefinition) => segmentDefinition != null) + .flatMap((segmentDefinition) => [segmentDefinition?.prefix, segmentDefinition?.suffix]) + .filter((trigger) => trigger != null) + .forEach((trigger) => ( + generateResponseState.stopGenerationDetector.addStopTrigger( + StopGenerationDetector.resolveLlamaTextTrigger(LlamaText(trigger), this.model.tokenizer) + ) + )); + await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); if (generateResponseState.maxTokens === 0) { @@ -827,7 +869,15 @@ export class LlamaChat { generateResponseState.popStreamRegulatorFreeTokens(); - const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user"); + const someOfCurrentTokensAreSpecial = generateResponseState.currentTokens.some((token) => ( + this.model.isSpecialToken(token) + )); + const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger( + "user", + someOfCurrentTokensAreSpecial + ? 
"eogToken" + : undefined + ); if (stopGenerationTriggerRes != null) return { completion: stopGenerationTriggerRes.response, @@ -1251,10 +1301,9 @@ function generateContextTextThatEndsWithUserText( ...options, chatHistory: setLastUserTextInChatHistory(options.chatHistory, lastUserText + randomId) }); - let newContextText = contextText; - for (let i = 0; i < newContextText.values.length; i++) { - const item = newContextText.values[i]; + for (let i = 0; i < contextText.values.length; i++) { + const item = contextText.values[i]; if (typeof item !== "string") continue; @@ -1263,15 +1312,14 @@ function generateContextTextThatEndsWithUserText( continue; const newValue = item.slice(0, randomTextIndex); - newContextText = LlamaText([ - ...newContextText.values.slice(0, i), - newValue - ]); return { - contextText: newContextText, + contextText: LlamaText([ + ...contextText.values.slice(0, i), + newValue + ]), userTextSuffix: LlamaText([ item.slice(randomTextIndex + randomId.length), - ...newContextText.values.slice(i + 1) + ...contextText.values.slice(i + 1) ]), ...rest }; @@ -1485,6 +1533,7 @@ class GenerateResponseState["onToken"]; private readonly onResponseChunk: LLamaChatGenerateResponseOptions["onResponseChunk"]; private readonly onFunctionCallParamsChunk: LLamaChatGenerateResponseOptions["onFunctionCallParamsChunk"]; + private readonly budgets: LLamaChatGenerateResponseOptions["budgets"]; private readonly signal: LLamaChatGenerateResponseOptions["signal"]; private readonly stopOnAbortSignal: LLamaChatGenerateResponseOptions["stopOnAbortSignal"]; public readonly maxTokens: LLamaChatGenerateResponseOptions["maxTokens"]; @@ -1584,6 +1633,7 @@ class GenerateResponseState budget != null && budget !== Infinity; + + const hasBudgetTriggers = this.budgets != null && hasBudget(this.budgets.thoughtTokens); + if (!hasBudgetTriggers) + return shouldReloadEvaluationState; + + if (hasBudget(this.budgets.thoughtTokens) && this.segmentHandler.isSegmentTypeOpen("thought")) { + const usedThoughtTokens = this.segmentHandler.getSegmentTokensCount("thought"); + if (usedThoughtTokens >= this.budgets.thoughtTokens) { + this.segmentHandler.closeSegment("thought"); + shouldReloadEvaluationState = true; + } + } + + return shouldReloadEvaluationState; + } + public updateShouldContextShift() { this.shouldContextShift = this.llamaChat.sequence.nextTokenIndex >= this.llamaChat.context.contextSize - 1; return this.shouldContextShift; @@ -2946,6 +3019,7 @@ class SegmentHandler[] = []; private readonly _segmentsStartTokenTrail: Token[] = []; + private readonly _segmentTokenCounts: Map; private readonly _contextWindowSegments: RawSegment[] = []; private readonly _contextWindowStartTokenTrail: Token[] = []; private readonly _initialTokensTrail: Token[]; @@ -2958,7 +3032,7 @@ class SegmentHandler, closeAllSegments?: string | LlamaText, initialSegmentStack: S[], + initialTokenCounts: Map, previousTokens: Token[] }) { this.model = model; @@ -2990,6 +3065,7 @@ class SegmentHandler(); + + for (const item of modelResponse) { + if (typeof item === "string") { + segmentTokenCounts.set( + undefined, + (segmentTokenCounts.get(undefined) ?? 0) + tokenizer(item, false, "trimLeadingSpace").length + ); + continue; + } else if (isChatModelResponseFunctionCall(item)) + continue; + + void (item.type satisfies "segment"); + + segmentTokenCounts.set( + item.segmentType, + (segmentTokenCounts.get(item.segmentType) ?? 
0) + tokenizer(item.text, false, "trimLeadingSpace").length + ); + } + + return segmentTokenCounts; + } } diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts index cb64518d..f0a0ba77 100644 --- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts +++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts @@ -209,7 +209,19 @@ export type LLamaChatPromptOptions void + onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void, + + /** + * Set the maximum number of tokens that the model is allowed to spend on various segmented responses. + */ + budgets?: { + /** + * Budget for thought tokens. + * + * Defaults to `Infinity`. + */ + thoughtTokens?: number + } } & ({ grammar?: LlamaGrammar, functions?: never, @@ -445,6 +457,7 @@ export class LlamaChatSession { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -469,7 +482,7 @@ export class LlamaChatSession { maxParallelFunctionCalls: maxParallelFunctionCalls as undefined, onFunctionCallParamsChunk: onFunctionCallParamsChunk as undefined, - onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal, maxTokens, + onTextChunk, onToken, onResponseChunk, budgets, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers }); @@ -489,6 +502,7 @@ export class LlamaChatSession { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -589,6 +603,10 @@ export class LlamaChatSession { paramsChunk: chunk.paramsChunk, done: chunk.done })), + budgets: { + includeCurrentResponse: true, + thoughtTokens: budgets?.thoughtTokens + }, signal: abortController.signal, stopOnAbortSignal, repeatPenalty, diff --git a/test/modelDependent/llama3.2/promptCompletion.test.ts b/test/modelDependent/llama3.2/promptCompletion.test.ts new file mode 100644 index 00000000..d667f31c --- /dev/null +++ b/test/modelDependent/llama3.2/promptCompletion.test.ts @@ -0,0 +1,98 @@ +import {describe, expect, test} from "vitest"; +import {LlamaChatSession} from "../../../src/index.js"; +import {getModelFile} from "../../utils/modelFiles.js"; +import {getTestLlama} from "../../utils/getTestLlama.js"; +import {LlamaText} from "../../../src/utils/LlamaText.js"; + +describe("llama 3.2", () => { + describe("prompt completion", () => { + test("prompt completion isn't kept in the next evaluation", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Llama-3.2-3B-Instruct.Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 4096 + }); + const context2 = await model.createContext({ + contextSize: 4096 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + const chatSession2 = new LlamaChatSession({ + contextSequence: context2.getSequence() + }); + + const promptCompletion = await chatSession.completePrompt("Hi there!", { + maxTokens: 50 + }); + expect(promptCompletion).toMatchInlineSnapshot("\" I're looking for a new phone case. 
I want one that is waterproof and has a good camera.\""); + expect(LlamaText.fromTokens(model.tokenizer, chatSession.sequence.contextTokens)).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|start_header_id|>"), + "system", + new SpecialTokensText("<|end_header_id|>"), + " + + Cutting Knowledge Date: December 2023", + new SpecialToken("NL"), + "Today Date: 29 May 2025 + + You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "user", + new SpecialTokensText("<|end_header_id|>"), + " + + Hi there! I're looking for a new phone case. I want one that is waterproof and has a good camera.", + ]) + `); + + const res = await chatSession.prompt("Hi there!", { + maxTokens: 50 + }); + expect(res).toMatchInlineSnapshot("\"Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat for a bit?\""); + expect(LlamaText.fromTokens(model.tokenizer, chatSession.sequence.contextTokens)).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|start_header_id|>"), + "system", + new SpecialTokensText("<|end_header_id|>"), + " + + Cutting Knowledge Date: December 2023", + new SpecialToken("NL"), + "Today Date: 29 May 2025 + + You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "user", + new SpecialTokensText("<|end_header_id|>"), + " + + Hi there!", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "assistant", + new SpecialTokensText("<|end_header_id|>"), + " + + Hello! It's nice to meet you. 
Is there something I can help you with, or would you like to chat for a bit?", + ]) + `); + + const res2 = await chatSession2.prompt("Hi there!", { + maxTokens: 50 + }); + expect(res2).to.eql(res); + }); + }); +}); diff --git a/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts b/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts new file mode 100644 index 00000000..35522794 --- /dev/null +++ b/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts @@ -0,0 +1,95 @@ +import {describe, expect, test} from "vitest"; +import {LlamaChatSession, isChatModelResponseSegment} from "../../../src/index.js"; +import {getModelFile} from "../../utils/modelFiles.js"; +import {getTestLlama} from "../../utils/getTestLlama.js"; + +describe("qwen3 0.6b", () => { + describe("thinking budget", () => { + test("doesn't exceed thinking budget", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Qwen3-0.6B-Q8_0.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 512 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + const initialChatHistory = chatSession.getChatHistory(); + + async function promptWithBudget({ + prompt, maxTokens, thinkingBudget + }: { + prompt: string, maxTokens: number, thinkingBudget?: number + }) { + let thoughtTokens = 0; + let totalTokens = 0; + + chatSession.setChatHistory(initialChatHistory); + const {responseText, response} = await chatSession.promptWithMeta(prompt, { + maxTokens, + budgets: { + thoughtTokens: thinkingBudget + }, + onResponseChunk(chunk) { + if (chunk.type === "segment" && chunk.segmentType === "thought") { + thoughtTokens += chunk.tokens.length; + } + + totalTokens += chunk.tokens.length; + } + }); + + return { + thoughtTokens, + totalTokens, + responseText, + thoughts: response + .filter((item) => isChatModelResponseSegment(item)) + .filter((item) => item.segmentType === "thought") + .map((item) => item.text) + }; + } + + const res1 = await promptWithBudget({ + prompt: "Where do llamas come from?", + thinkingBudget: 10, + maxTokens: 20 + }); + expect(res1.thoughtTokens).to.be.gt(1); + expect(res1.thoughtTokens).to.be.lte(10); + expect(res1.totalTokens).to.be.gte(16); + expect(res1.totalTokens).to.be.lte(20); + + const res2 = await promptWithBudget({ + prompt: "Where do llamas come from?", + thinkingBudget: 0, + maxTokens: 20 + }); + expect(res2.thoughtTokens).to.be.eq(0); + expect(res2.totalTokens).to.be.gte(16); + expect(res2.totalTokens).to.be.lte(20); + + const res3 = await promptWithBudget({ + prompt: "Where do llamas come from?", + thinkingBudget: 20, + maxTokens: 20 + }); + expect(res3.thoughtTokens).to.be.eq(res3.totalTokens); + expect(res3.totalTokens).to.be.gte(16); + expect(res3.totalTokens).to.be.lte(20); + + const res4 = await promptWithBudget({ + prompt: "Where do llamas come from?", + maxTokens: 20 + }); + expect(res4.thoughtTokens).to.be.eq(res4.totalTokens); + expect(res4.totalTokens).to.be.gte(16); + expect(res4.totalTokens).to.be.lte(20); + }); + }); +}); diff --git a/test/utils/modelFiles.ts b/test/utils/modelFiles.ts index bcc6a6c0..fa307dc6 100644 --- a/test/utils/modelFiles.ts +++ b/test/utils/modelFiles.ts @@ -20,7 +20,8 @@ const supportedModels = { "codegemma-2b-Q4_K_M.gguf": "https://huggingface.co/bartowski/codegemma-2b-GGUF/resolve/main/codegemma-2b-Q4_K_M.gguf?download=true", "Llama-3.2-3B-Instruct.Q4_K_M.gguf": 
"https://huggingface.co/mradermacher/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct.Q4_K_M.gguf?download=true", "nomic-embed-text-v1.5.Q4_K_M.gguf": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q4_K_M.gguf?download=true", - "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true" + "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true", + "Qwen3-0.6B-Q8_0.gguf": "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true" } as const; export async function getModelFile(modelName: keyof typeof supportedModels) { From d83f1778334215dbd54e4f5b4adf73ff07394ead Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 03:25:46 +0300 Subject: [PATCH 04/14] style: lint --- src/gguf/insights/GgufInsights.ts | 2 -- src/gguf/types/GgufTensorInfoTypes.ts | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index eb8330a8..5b01dd22 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -203,9 +203,7 @@ export class GgufInsights { const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0; const embeddingSize = llmData.embedding_length ?? 0; - const sizeTBytes = 8; // sizeof(size_t) const floatBytes = 4; // sizeof(float) - const uint32TBytes = 4; // sizeof(uint32_t) const int32TBytes = 4; // sizeof(int32_t) const estimateOutput = (nOutputs: number) => { diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index b23bf1f8..8b7f615a 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -59,5 +59,5 @@ export const enum GgmlType { TQ2_0 = 35, IQ4_NL_4_4 = 36, IQ4_NL_4_8 = 37, - IQ4_NL_8_8 = 38, + IQ4_NL_8_8 = 38 } From b3d04fe0a6737aa04a67722975e8bb1d9ce83b02 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 04:12:10 +0300 Subject: [PATCH 05/14] test: fix tests --- src/gguf/insights/GgufInsights.ts | 10 +- .../functionaryModelGpuLayersOptions.test.ts | 144 +++++++++--------- .../functionary/gguf/ggufInsights.test.ts | 70 ++++----- .../llama3.2/promptCompletion.test.ts | 28 +++- .../stableCodeModelGpuLayersOptions.test.ts | 80 +++++----- 5 files changed, 172 insertions(+), 160 deletions(-) diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 5b01dd22..7758a7de 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -359,16 +359,14 @@ export class GgufInsights { const outputBufferSize = estimateOutput(sequences); - const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + ( - usingGpu - ? outputBufferSize - : 0 - ); + const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + outputBufferSize; const cpuRam = cpuKVCacheSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize; return { cpuRam, - gpuVram + gpuVram: usingGpu + ? 
gpuVram + : 0 }; } diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index 03d5942a..d8247dd9 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -114,7 +114,7 @@ describe("functionary", () => { freeRam: s1GB * 6 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } { const res = await resolveGpuLayers(0, { @@ -151,7 +151,7 @@ describe("functionary", () => { freeSwap: s1GB * 1 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(0, { @@ -255,7 +255,7 @@ describe("functionary", () => { freeRam: s1GB * 4.5 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } try { await resolveGpuLayers(16, { @@ -318,7 +318,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } }); @@ -336,14 +336,14 @@ describe("functionary", () => { } { const res = await resolveGpuLayers(16, { - totalVram: s1GB * 7, - freeVram: s1GB * 7, - totalRam: s1GB * 7, + totalVram: s1GB * 7.5, + freeVram: s1GB * 7.5, + totalRam: s1GB * 7.5, freeRam: s1GB * 5.5, - unifiedMemorySize: s1GB * 7 + unifiedMemorySize: s1GB * 7.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("2086"); + expect(res.contextSize).to.toMatchInlineSnapshot("1760"); } { const res = await resolveGpuLayers(16, { @@ -354,7 +354,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 5.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("6804"); + expect(res.contextSize).to.toMatchInlineSnapshot("5505"); } try { await resolveGpuLayers(16, { @@ -409,7 +409,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.contextSize).to.toMatchInlineSnapshot("4441"); } { const res = await resolveGpuLayers(16, { @@ -422,7 +422,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } }); @@ -608,7 +608,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(32, { @@ -619,7 +619,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + expect(res.contextSize).to.toMatchInlineSnapshot("1164"); } { const res = await resolveGpuLayers(32, { @@ -761,7 +761,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + expect(res.contextSize).to.toMatchInlineSnapshot("1164"); } { const res = await resolveGpuLayers(33, { 
@@ -772,7 +772,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(33, { @@ -783,7 +783,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } { const res = await resolveGpuLayers(33, { @@ -795,7 +795,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } }); @@ -809,7 +809,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("6251"); } { const res = await resolveGpuLayers(33, { @@ -820,18 +820,18 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.contextSize).to.toMatchInlineSnapshot("2974"); } { const res = await resolveGpuLayers(33, { totalVram: s1GB * 6, freeVram: s1GB * 6, totalRam: s1GB * 6, - freeRam: s1GB * 4.8, + freeRam: s1GB * 5.1, unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1142"); + expect(res.contextSize).to.toMatchInlineSnapshot("1336"); } try { await resolveGpuLayers(33, { @@ -908,7 +908,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("502"); + expect(res.contextSize).to.toMatchInlineSnapshot("472"); } { const res = await resolveGpuLayers("max", { @@ -918,7 +918,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1010"); + expect(res.contextSize).to.toMatchInlineSnapshot("898"); } }); @@ -952,7 +952,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("3606"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -961,8 +961,8 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); } { const res = await resolveGpuLayers("auto", { @@ -972,7 +972,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("10"); - expect(res.contextSize).to.toMatchInlineSnapshot("5856"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -981,7 +981,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -991,7 +991,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - 
expect(res.gpuLayers).to.toMatchInlineSnapshot("13"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1001,7 +1001,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1011,7 +1011,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1021,7 +1021,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1031,7 +1031,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1042,7 +1042,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("7977"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1052,7 +1052,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8043"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1062,7 +1062,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4754"); + expect(res.contextSize).to.toMatchInlineSnapshot("4721"); } { const res = await resolveGpuLayers("auto", { @@ -1072,7 +1072,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7964"); + expect(res.contextSize).to.toMatchInlineSnapshot("7998"); } { const res = await resolveGpuLayers("auto", { @@ -1095,7 +1095,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers("auto", { @@ -1105,7 +1105,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers("auto", { @@ -1115,7 +1115,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("3606"); + expect(res.contextSize).to.toMatchInlineSnapshot("5438"); } { const res = await resolveGpuLayers("auto", { @@ -1124,8 +1124,8 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + 
expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); } { const res = await resolveGpuLayers("auto", { @@ -1135,7 +1135,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("10"); - expect(res.contextSize).to.toMatchInlineSnapshot("5856"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1144,7 +1144,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1154,7 +1154,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("13"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1164,7 +1164,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1174,7 +1174,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1184,7 +1184,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1194,7 +1194,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1205,7 +1205,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("7977"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1215,7 +1215,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8043"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1225,7 +1225,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4754"); + expect(res.contextSize).to.toMatchInlineSnapshot("4721"); } { const res = await resolveGpuLayers("auto", { @@ -1235,7 +1235,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7964"); + expect(res.contextSize).to.toMatchInlineSnapshot("7998"); } { const res = await resolveGpuLayers("auto", { @@ -1324,7 +1324,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); 
expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1336,7 +1336,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1349,7 +1349,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } }); @@ -1362,7 +1362,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1372,7 +1372,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } try { await resolveGpuLayers({min: 2}, { @@ -1426,7 +1426,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1438,7 +1438,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1451,7 +1451,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } }); }); @@ -1480,7 +1480,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("5737"); + expect(res.contextSize).to.toMatchInlineSnapshot("6535"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1491,8 +1491,8 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("5246"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1503,7 +1503,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1515,7 +1515,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1569,7 +1569,7 @@ describe("functionary", () => { freeRam: s1GB * 7 }); 
expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("5737"); + expect(res.contextSize).to.toMatchInlineSnapshot("6535"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1580,8 +1580,8 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("5246"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1592,7 +1592,7 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1604,7 +1604,7 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } diff --git a/test/modelDependent/functionary/gguf/ggufInsights.test.ts b/test/modelDependent/functionary/gguf/ggufInsights.test.ts index 33e638d0..ee193e2c 100644 --- a/test/modelDependent/functionary/gguf/ggufInsights.test.ts +++ b/test/modelDependent/functionary/gguf/ggufInsights.test.ts @@ -124,7 +124,7 @@ describe("gguf", async () => { sequences: context.totalSequences, modelGpuLayers: ggufInsights.totalLayers }).gpuVram; - expect(toBytes(estimatedContextVramUsage)).toMatchInlineSnapshot('"1.02GB"'); + expect(toBytes(estimatedContextVramUsage)).toMatchInlineSnapshot("\"1.03GB\""); expect(Math.abs(contextVramUsageDiff - estimatedContextVramUsage)).to.be.lte(s300MB); await model.dispose(); @@ -168,7 +168,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.78GB", + "cpuRam": "1.75GB", "gpuVram": "0B", } `); @@ -179,7 +179,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.02GB", + "cpuRam": "1GB", "gpuVram": "0B", } `); @@ -190,7 +190,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "650.6MB", + "cpuRam": "643.07MB", "gpuVram": "0B", } `); @@ -201,7 +201,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "454.58MB", + "cpuRam": "451.07MB", "gpuVram": "0B", } `); @@ -213,8 +213,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1GB", - "gpuVram": "834.69MB", + "cpuRam": "1.71GB", + "gpuVram": "355.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -224,8 +224,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "512MB", - "gpuVram": "546.63MB", + "cpuRam": "1002.8MB", + "gpuVram": "315.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -235,8 +235,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "256MB", - "gpuVram": "402.6MB", + "cpuRam": "630.8MB", + "gpuVram": "295.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -246,8 +246,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "128MB", - 
"gpuVram": "330.58MB", + "cpuRam": "444.8MB", + "gpuVram": "285.25MB", } `); @@ -258,8 +258,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "544MB", - "gpuVram": "1.28GB", + "cpuRam": "1022.78MB", + "gpuVram": "1.05GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -269,8 +269,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "272MB", - "gpuVram": "786.67MB", + "cpuRam": "638.78MB", + "gpuVram": "679.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -280,8 +280,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "136MB", - "gpuVram": "522.64MB", + "cpuRam": "446.78MB", + "gpuVram": "479.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -291,8 +291,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "68MB", - "gpuVram": "390.63MB", + "cpuRam": "350.78MB", + "gpuVram": "379.25MB", } `); @@ -303,7 +303,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "32MB", + "cpuRam": "250.5MB", "gpuVram": "1.78GB", } `); @@ -314,8 +314,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "16MB", - "gpuVram": "1.02GB", + "cpuRam": "250.5MB", + "gpuVram": "1.03GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -325,8 +325,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "8MB", - "gpuVram": "650.69MB", + "cpuRam": "250.5MB", + "gpuVram": "667.52MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -336,8 +336,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "4MB", - "gpuVram": "454.67MB", + "cpuRam": "250.5MB", + "gpuVram": "475.52MB", } `); @@ -348,7 +348,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", + "cpuRam": "250.5MB", "gpuVram": "1.78GB", } `); @@ -359,8 +359,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "1.02GB", + "cpuRam": "250.5MB", + "gpuVram": "1.03GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -370,8 +370,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "650.69MB", + "cpuRam": "250.5MB", + "gpuVram": "667.52MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -381,8 +381,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "454.67MB", + "cpuRam": "250.5MB", + "gpuVram": "475.52MB", } `); }); diff --git a/test/modelDependent/llama3.2/promptCompletion.test.ts b/test/modelDependent/llama3.2/promptCompletion.test.ts index d667f31c..574524d9 100644 --- a/test/modelDependent/llama3.2/promptCompletion.test.ts +++ b/test/modelDependent/llama3.2/promptCompletion.test.ts @@ -1,5 +1,5 @@ import {describe, expect, test} from "vitest"; -import {LlamaChatSession} from "../../../src/index.js"; +import {LlamaChatSession, resolveChatWrapper} from "../../../src/index.js"; import {getModelFile} from "../../utils/modelFiles.js"; import {getTestLlama} from "../../utils/getTestLlama.js"; import {LlamaText} from 
"../../../src/utils/LlamaText.js"; @@ -20,16 +20,30 @@ describe("llama 3.2", () => { contextSize: 4096 }); const chatSession = new LlamaChatSession({ - contextSequence: context.getSequence() + contextSequence: context.getSequence(), + chatWrapper: resolveChatWrapper(model, { + customWrapperSettings: { + "llama3.2-lightweight": { + todayDate: new Date("2025-01-01T00:00:00Z") + } + } + }) }); const chatSession2 = new LlamaChatSession({ - contextSequence: context2.getSequence() + contextSequence: context2.getSequence(), + chatWrapper: resolveChatWrapper(model, { + customWrapperSettings: { + "llama3.2-lightweight": { + todayDate: new Date("2025-01-01T00:00:00Z") + } + } + }) }); const promptCompletion = await chatSession.completePrompt("Hi there!", { maxTokens: 50 }); - expect(promptCompletion).toMatchInlineSnapshot("\" I're looking for a new phone case. I want one that is waterproof and has a good camera.\""); + expect(promptCompletion).toMatchInlineSnapshot("\" I'm looking for a new phone case. I need a case that can protect your phone from scratches and drops.\""); expect(LlamaText.fromTokens(model.tokenizer, chatSession.sequence.contextTokens)).toMatchInlineSnapshot(` LlamaText([ new SpecialToken("BOS"), @@ -40,7 +54,7 @@ describe("llama 3.2", () => { Cutting Knowledge Date: December 2023", new SpecialToken("NL"), - "Today Date: 29 May 2025 + "Today Date: 1 Jan 2025 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", @@ -50,7 +64,7 @@ describe("llama 3.2", () => { new SpecialTokensText("<|end_header_id|>"), " - Hi there! I're looking for a new phone case. I want one that is waterproof and has a good camera.", + Hi there! I'm looking for a new phone case. I need a case that can protect your phone from scratches and drops.", ]) `); @@ -68,7 +82,7 @@ describe("llama 3.2", () => { Cutting Knowledge Date: December 2023", new SpecialToken("NL"), - "Today Date: 29 May 2025 + "Today Date: 1 Jan 2025 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. 
If you don't know the answer to a question, don't share false information.", diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index 43145a6d..c2ad773f 100644 --- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -111,7 +111,7 @@ describe("stableCode", () => { freeVram: s1GB * 3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("7177"); + expect(res.contextSize).to.toMatchInlineSnapshot("8064"); } try { await resolveGpuLayers(16, { @@ -137,12 +137,12 @@ describe("stableCode", () => { // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left // to create a context - freeVram: s1GB * 0.2, + freeVram: s1GB * 1.4, ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("133"); + expect(res.contextSize).to.toMatchInlineSnapshot("138"); } @@ -174,7 +174,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } try { await resolveGpuLayers(32, { @@ -192,7 +192,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { @@ -223,7 +223,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } try { await resolveGpuLayers(33, { @@ -241,7 +241,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { @@ -303,7 +303,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { const res = await resolveGpuLayers("max", { @@ -311,7 +311,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers("max", { @@ -319,7 +319,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6866"); + expect(res.contextSize).to.toMatchInlineSnapshot("6979"); } { const res = await resolveGpuLayers("max", { @@ -327,7 +327,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7931"); + expect(res.contextSize).to.toMatchInlineSnapshot("8072"); } }); @@ -345,24 +345,24 @@ describe("stableCode", () => { totalVram: s1GB * 6, freeVram: s1GB * 0.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + expect(res.contextSize).to.toMatchInlineSnapshot("10864"); } { const res = await 
resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 0.8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); - expect(res.contextSize).to.toMatchInlineSnapshot("8724"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 1.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("6203"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("5"); + expect(res.contextSize).to.toMatchInlineSnapshot("8368"); } { const res = await resolveGpuLayers("auto", { @@ -370,7 +370,7 @@ describe("stableCode", () => { freeVram: s1GB * 2.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("1544"); + expect(res.contextSize).to.toMatchInlineSnapshot("1518"); } { const res = await resolveGpuLayers("auto", { @@ -378,7 +378,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3407"); + expect(res.contextSize).to.toMatchInlineSnapshot("3429"); } { const res = await resolveGpuLayers("auto", { @@ -386,7 +386,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3939"); + expect(res.contextSize).to.toMatchInlineSnapshot("3976"); } { const res = await resolveGpuLayers("auto", { @@ -394,7 +394,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4471"); + expect(res.contextSize).to.toMatchInlineSnapshot("4522"); } { const res = await resolveGpuLayers("auto", { @@ -402,7 +402,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5270"); + expect(res.contextSize).to.toMatchInlineSnapshot("5341"); } { const res = await resolveGpuLayers("auto", { @@ -410,7 +410,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers("auto", { @@ -418,7 +418,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6600"); + expect(res.contextSize).to.toMatchInlineSnapshot("6706"); } { const res = await resolveGpuLayers("auto", { @@ -426,7 +426,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7133"); + expect(res.contextSize).to.toMatchInlineSnapshot("7252"); } { const res = await resolveGpuLayers("auto", { @@ -434,7 +434,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7931"); + expect(res.contextSize).to.toMatchInlineSnapshot("8072"); } { const res = await resolveGpuLayers("auto", { @@ -442,7 +442,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.2 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - 
expect(res.contextSize).to.toMatchInlineSnapshot("8995"); + expect(res.contextSize).to.toMatchInlineSnapshot("9164"); } { const res = await resolveGpuLayers("auto", { @@ -450,7 +450,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("10592"); + expect(res.contextSize).to.toMatchInlineSnapshot("10802"); } { const res = await resolveGpuLayers("auto", { @@ -458,7 +458,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } }); @@ -504,7 +504,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("11658"); + expect(res.contextSize).to.toMatchInlineSnapshot("13255"); } try { await resolveGpuLayers({min: 16}, { @@ -522,7 +522,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -531,8 +531,8 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); - expect(res.contextSize).to.toMatchInlineSnapshot("8160"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); + expect(res.contextSize).to.toMatchInlineSnapshot("8249"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -542,7 +542,7 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("7177"); + expect(res.contextSize).to.toMatchInlineSnapshot("8064"); } }); @@ -565,7 +565,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -574,8 +574,8 @@ describe("stableCode", () => { totalVram: s1GB * 2, freeVram: s1GB * 1 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("9426"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("3"); + expect(res.contextSize).to.toMatchInlineSnapshot("5933"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -584,8 +584,8 @@ describe("stableCode", () => { totalVram: s1GB * 6, freeVram: s1GB * 4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); - expect(res.contextSize).to.toMatchInlineSnapshot("9167"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); + expect(res.contextSize).to.toMatchInlineSnapshot("9208"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -595,7 +595,7 @@ describe("stableCode", () => { freeVram: s1GB * 1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("9426"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); expect(res.contextSize).to.be.gte(contextSize); } { From e382413c532bdbad077b6cd3c5234a08f7efd029 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 2 Jun 2025 03:02:45 +0300 Subject: [PATCH 06/14] 
docs: generate a `llms.txt` file --- .vitepress/config.ts | 149 ++++-------------------------- .vitepress/config/getBlogPosts.ts | 46 +++++++++ .vitepress/config/sidebar.ts | 134 +++++++++++++++++++++++++++ package.json | 1 + 4 files changed, 201 insertions(+), 129 deletions(-) create mode 100644 .vitepress/config/getBlogPosts.ts create mode 100644 .vitepress/config/sidebar.ts diff --git a/.vitepress/config.ts b/.vitepress/config.ts index f3b7fab2..e6161519 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -12,11 +12,13 @@ import {rehype} from "rehype"; import sharp from "sharp"; import {GitChangelog, GitChangelogMarkdownSection} from "@nolebase/vitepress-plugin-git-changelog/vite"; import {buildEndGenerateOpenGraphImages} from "@nolebase/vitepress-plugin-og-image/vitepress"; +import llmstxt from "vitepress-plugin-llms"; import {Resvg, initWasm as initResvgWasm, type ResvgRenderOptions} from "@resvg/resvg-wasm"; import {BlogPageInfoPlugin} from "./config/BlogPageInfoPlugin.js"; -import {getApiReferenceSidebar} from "./config/apiReferenceSidebar.js"; import {ensureLocalImage} from "./utils/ensureLocalImage.js"; import {getExcerptFromMarkdownFile} from "./utils/getExcerptFromMarkdownFile.js"; +import {getVitepressSidebar, getVitepressSidebarWithBlog} from "./config/sidebar.js"; +import {getBlogPosts} from "./config/getBlogPosts.js"; import type {Element as HastElement, Parent} from "hast"; import type {Node as UnistNode} from "unist"; @@ -365,6 +367,12 @@ export default defineConfig({ }) as VitepressPlugin, BlogPageInfoPlugin({ include: (id) => id.includes(path.sep + "blog" + path.sep) && !id.endsWith(path.sep + "blog" + path.sep + "index.md") + }), + llmstxt({ + ignoreFiles: ["index.md"], + domain: resolveHref("/test").slice(0, -"/test".length) || undefined, + excludeBlog: false, + sidebar: () => getVitepressSidebarWithBlog(true, false) }) ], build: { @@ -434,6 +442,9 @@ export default defineConfig({ }, { text: "GitHub Discussions", link: "https://github.com/withcatai/node-llama-cpp/discussions" + }, { + text: "Awesome List", + link: "/guide/awesome" }, { text: "Contribute", link: "/guide/contributing" @@ -469,100 +480,14 @@ export default defineConfig({ } } }, - sidebar: { - "/guide/": [{ - text: "Guide", - base: "/guide", - items: [ - {text: "Getting Started", link: "/"}, - {text: "Chat Session", link: "/chat-session"}, - {text: "Chat Wrapper", link: "/chat-wrapper"}, - {text: "Grammar", link: "/grammar"}, - {text: "Function Calling", link: "/function-calling"}, - {text: "Embedding", link: "/embedding"}, - {text: "Text Completion", link: "/text-completion"}, - {text: "Choosing a Model", link: "/choosing-a-model"}, - {text: "Downloading Models", link: "/downloading-models"} - ] - }, { - text: "Advanced", - base: "/guide", - items: [ - {text: "Building From Source", link: "/building-from-source"}, - {text: "Metal Support", link: "/Metal"}, - {text: "CUDA Support", link: "/CUDA"}, - {text: "Vulkan Support", link: "/Vulkan"}, - {text: "Electron Support", link: "/electron"}, - {text: "Using in Docker", link: "/docker"}, - {text: "Using Tokens", link: "/tokens"}, - {text: "LlamaText", link: "/llama-text"}, - {text: "External Chat State", link: "/external-chat-state"}, - {text: "Token Bias", link: "/token-bias"}, - {text: "Objects Lifecycle", link: "/objects-lifecycle"}, - {text: "Chat Context Shift", link: "/chat-context-shift"}, - {text: "Batching", link: "/batching"}, - {text: "Token Prediction", link: "/token-prediction"}, - {text: "Low Level API", link: "/low-level-api"}, 
- {text: "Awesome List", link: "/awesome"}, - {text: "Troubleshooting", link: "/troubleshooting"}, - {text: "Tips and Tricks", link: "/tips-and-tricks"} - ] - }, { - text: "Contributing", - base: "/guide", - items: [ - {text: "Setting Up a Dev Environment", link: "/development"}, - {text: "Pull Request Guidelines", link: "/contributing"} - ] - }], - - "/cli/": [{ - text: "CLI", - base: "/cli", - link: "/", - items: [ - {text: "Init", link: "/init"}, - {text: "Chat", link: "/chat"}, - {text: "Pull", link: "/pull"}, - { - text: "Source", - link: "/source", - collapsed: true, - items: [ - {text: "Download", link: "/source/download"}, - {text: "Build", link: "/source/build"}, - {text: "Clear", link: "/source/clear"} - ] - }, - {text: "Complete", link: "/complete"}, - {text: "Infill", link: "/infill"}, - { - text: "Inspect", - link: "/inspect", - collapsed: true, - items: [ - {text: "GPU", link: "/inspect/gpu"}, - {text: "GGUF", link: "/inspect/gguf"}, - {text: "Measure", link: "/inspect/measure"}, - {text: "Estimate", link: "/inspect/estimate"} - ] - } - ] - }], - - "/api/": getApiReferenceSidebar() - }, + sidebar: getVitepressSidebar(), socialLinks: [ {icon: "npm", link: "https://www.npmjs.com/package/node-llama-cpp"}, {icon: "github", link: "https://github.com/withcatai/node-llama-cpp"} ] }, async buildEnd(siteConfig) { - const blogPosts = await createContentLoader("blog/*.md", { - excerpt: true, - render: true - }) - .load(); + const blogPosts = await getBlogPosts(false); async function loadSvgFontBuffers() { const interFontFilesDirectoryPath = path.join(require.resolve("@fontsource/inter"), "..", "files"); @@ -699,24 +624,7 @@ export default defineConfig({ ...siteConfig.site, themeConfig: { ...siteConfig.site.themeConfig, - sidebar: { - ...siteConfig.site.themeConfig.sidebar, - "/_blog/": { - text: "Blog", - link: "/blog/", - items: blogPosts - .filter((post) => { - const hasCoverImage = typeof post.frontmatter?.image === "string" || - typeof post.frontmatter?.image?.url === "string"; - - return !hasCoverImage; - }) - .map((post) => ({ - text: post.frontmatter.title, - link: post.url - })) - } - } + sidebar: await getVitepressSidebarWithBlog(true, true) } } }); @@ -744,22 +652,6 @@ export default defineConfig({ hub: "https://pubsubhubbub.appspot.com/" }); - blogPosts.sort((a, b) => { - const aDate = a.frontmatter.date - ? new Date(a.frontmatter.date) - : null; - const bDate = b.frontmatter.date - ? 
new Date(b.frontmatter.date) - : null; - - if (aDate == null) - return -1; - if (bDate == null) - return 1; - - return bDate.getTime() - aDate.getTime(); - }); - for (const {url, frontmatter, html, src, excerpt: originalExcerpt} of blogPosts) { const ogImageElement = findElementInHtml(html, (element) => ( element.tagName === "meta" && (element.properties?.name === "og:image" || element.properties?.property === "og:image") @@ -819,12 +711,6 @@ export default defineConfig({ await addOgImages(); - const indexPageIndex = blogPosts.findIndex((post) => post.url === "/blog/"); - if (indexPageIndex < 0) - throw new Error("Blog index page not found"); - - blogPosts.splice(indexPageIndex, 1); - await addBlogRssFeed(); try { @@ -853,6 +739,11 @@ export default defineConfig({ path.join(siteConfig.outDir, "logo.preview.avif"), 24 ); + + await Promise.all([ + fs.copy(path.join(siteConfig.outDir, "llms.txt"), path.join(siteConfig.outDir, "llms.md")), + fs.copy(path.join(siteConfig.outDir, "llms-full.txt"), path.join(siteConfig.outDir, "llms-full.md")) + ]); } }); diff --git a/.vitepress/config/getBlogPosts.ts b/.vitepress/config/getBlogPosts.ts new file mode 100644 index 00000000..1d4cb6a5 --- /dev/null +++ b/.vitepress/config/getBlogPosts.ts @@ -0,0 +1,46 @@ +import {ContentData, createContentLoader} from "vitepress"; + +let blogPosts: ContentData[] | undefined = undefined; +export async function getBlogPosts(includeIndex: boolean = false) { + if (includeIndex) + return await _getBlogPosts(); + + const blogPosts = (await _getBlogPosts()).slice(); + + const indexPageIndex = blogPosts.findIndex((post) => post.url === "/blog/"); + if (indexPageIndex < 0) + throw new Error("Blog index page not found"); + + blogPosts.splice(indexPageIndex, 1); + + return blogPosts; +} + +async function _getBlogPosts() { + if (blogPosts != null) + return blogPosts; + + blogPosts = await createContentLoader("blog/*.md", { + excerpt: true, + render: true + }) + .load(); + + blogPosts.sort((a, b) => { + const aDate = a.frontmatter.date + ? new Date(a.frontmatter.date) + : null; + const bDate = b.frontmatter.date + ? 
new Date(b.frontmatter.date) + : null; + + if (aDate == null) + return -1; + if (bDate == null) + return 1; + + return bDate.getTime() - aDate.getTime(); + }); + + return blogPosts; +} diff --git a/.vitepress/config/sidebar.ts b/.vitepress/config/sidebar.ts new file mode 100644 index 00000000..b151a56c --- /dev/null +++ b/.vitepress/config/sidebar.ts @@ -0,0 +1,134 @@ +import {DefaultTheme} from "vitepress"; +import {getApiReferenceSidebar} from "./apiReferenceSidebar.js"; +import {getBlogPosts} from "./getBlogPosts.js"; + +const apiReferenceSidebar = getApiReferenceSidebar(); + +export function getVitepressSidebar(blog?: DefaultTheme.SidebarItem[]): DefaultTheme.Sidebar { + return { + "/guide/": [{ + text: "Guide", + base: "/guide", + items: [ + {text: "Getting Started", link: "/"}, + {text: "Chat Session", link: "/chat-session"}, + {text: "Chat Wrapper", link: "/chat-wrapper"}, + {text: "Grammar", link: "/grammar"}, + {text: "Function Calling", link: "/function-calling"}, + {text: "Embedding", link: "/embedding"}, + {text: "Text Completion", link: "/text-completion"}, + {text: "Choosing a Model", link: "/choosing-a-model"}, + {text: "Downloading Models", link: "/downloading-models"} + ] + }, { + text: "Advanced", + base: "/guide", + items: [ + {text: "Building From Source", link: "/building-from-source"}, + {text: "Metal Support", link: "/Metal"}, + {text: "CUDA Support", link: "/CUDA"}, + {text: "Vulkan Support", link: "/Vulkan"}, + {text: "Electron Support", link: "/electron"}, + {text: "Using in Docker", link: "/docker"}, + {text: "Using Tokens", link: "/tokens"}, + {text: "LlamaText", link: "/llama-text"}, + {text: "External Chat State", link: "/external-chat-state"}, + {text: "Token Bias", link: "/token-bias"}, + {text: "Objects Lifecycle", link: "/objects-lifecycle"}, + {text: "Chat Context Shift", link: "/chat-context-shift"}, + {text: "Batching", link: "/batching"}, + {text: "Token Prediction", link: "/token-prediction"}, + {text: "Low Level API", link: "/low-level-api"}, + {text: "Awesome List", link: "/awesome"}, + {text: "Troubleshooting", link: "/troubleshooting"}, + {text: "Tips and Tricks", link: "/tips-and-tricks"} + ] + }, { + text: "Contributing", + base: "/guide", + items: [ + {text: "Setting Up a Dev Environment", link: "/development"}, + {text: "Pull Request Guidelines", link: "/contributing"} + ] + }], + + ...( + blog != null + ? 
{ + "/_blog/": [{ + text: "Blog", + link: "/blog/", + items: blog + }] + } + : {} + ), + + "/cli/": [{ + text: "CLI", + base: "/cli", + link: "/", + items: [ + {text: "Init", link: "/init"}, + {text: "Chat", link: "/chat"}, + {text: "Pull", link: "/pull"}, + { + text: "Source", + link: "/source", + collapsed: true, + items: [ + {text: "Download", link: "/source/download"}, + {text: "Build", link: "/source/build"}, + {text: "Clear", link: "/source/clear"} + ] + }, + {text: "Complete", link: "/complete"}, + {text: "Infill", link: "/infill"}, + { + text: "Inspect", + link: "/inspect", + collapsed: true, + items: [ + {text: "GPU", link: "/inspect/gpu"}, + {text: "GGUF", link: "/inspect/gguf"}, + {text: "Measure", link: "/inspect/measure"}, + {text: "Estimate", link: "/inspect/estimate"} + ] + } + ] + }], + + "/api/": structuredClone(apiReferenceSidebar) + }; +} + +export async function getSidebarBlogPostItems( + includeIndex: boolean = false, + onlyItemsWithoutCoverImage: boolean = false +): Promise { + const blogPosts = await getBlogPosts(includeIndex); + + return blogPosts + .filter((post) => { + if (!onlyItemsWithoutCoverImage) + return true; + + const hasCoverImage = typeof post.frontmatter?.image === "string" || + typeof post.frontmatter?.image?.url === "string"; + + return !hasCoverImage; + }) + .map((post) => ({ + text: post.frontmatter.title, + link: post.url + })); +} + +export async function getVitepressSidebarWithBlog( + includeIndex: boolean = false, + onlyItemsWithoutCoverImage: boolean = false +) { + const blogItems = await getSidebarBlogPostItems(includeIndex, onlyItemsWithoutCoverImage); + + return getVitepressSidebar(blogItems); +} diff --git a/package.json b/package.json index 79f9834f..0636af08 100644 --- a/package.json +++ b/package.json @@ -210,6 +210,7 @@ "stdout-update": "^4.0.1", "strip-ansi": "^7.1.0", "validate-npm-package-name": "^6.0.0", + "vitepress-plugin-llms": "^1.3.4", "which": "^5.0.0", "yargs": "^17.7.2" }, From 262999f84498d3b5c8895720f5084a943a2821c7 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 2 Jun 2025 03:05:02 +0300 Subject: [PATCH 07/14] fix: update `ipull` --- package-lock.json | 308 ++++++++++++++++++++++++++++++++++++++++++++-- package.json | 6 +- 2 files changed, 298 insertions(+), 16 deletions(-) diff --git a/package-lock.json b/package-lock.json index 72578ac6..ad8c83c4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,7 +22,7 @@ "filenamify": "^6.0.0", "fs-extra": "^11.3.0", "ignore": "^7.0.4", - "ipull": "^3.9.2", + "ipull": "^4.0.3", "is-unicode-supported": "^2.1.0", "lifecycle-utils": "^2.0.0", "log-symbols": "^7.0.0", @@ -62,7 +62,7 @@ "@types/bytes": "^3.1.5", "@types/cross-spawn": "^6.0.6", "@types/fs-extra": "^11.0.4", - "@types/node": "^22.15.17", + "@types/node": "^20.17.50", "@types/proper-lockfile": "^4.1.4", "@types/semver": "^7.7.0", "@types/validate-npm-package-name": "^4.0.2", @@ -91,6 +91,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", + "vitepress-plugin-llms": "^1.3.4", "vitest": "^3.1.3", "zx": "^8.5.4" }, @@ -2231,6 +2232,15 @@ "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==", "license": "MIT" }, + "node_modules/@lukeed/csprng": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@lukeed/csprng/-/csprng-1.1.0.tgz", + "integrity": "sha512-Z7C/xXCiGWsg0KuKsHTKJxbWhpI3Vs5GwLfOean7MGyVFGqdRgBbAjOCh6u4bbjPc/8MJ2pZmK/0DLdCbivLDA==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, 
"node_modules/@modelcontextprotocol/sdk": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.11.1.tgz", @@ -4420,13 +4430,13 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.15.17", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.17.tgz", - "integrity": "sha512-wIX2aSZL5FE+MR0JlvF87BNVrtFWf6AE6rxSE9X7OwnVvoyCQjpzSRJ+M87se/4QCkCiebQAqrJ0y6fwIyi7nw==", + "version": "20.17.50", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.17.50.tgz", + "integrity": "sha512-Mxiq0ULv/zo1OzOhwPqOA13I81CV/W3nvd3ChtQZRT5Cwz3cr0FKo/wMSsbTqL3EXpaBAEQhva2B8ByRkOIh9A==", "dev": true, "license": "MIT", "dependencies": { - "undici-types": "~6.21.0" + "undici-types": "~6.19.2" } }, "node_modules/@types/normalize-package-data": { @@ -5996,6 +6006,24 @@ "node": "*" } }, + "node_modules/byte-size": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/byte-size/-/byte-size-9.0.1.tgz", + "integrity": "sha512-YLe9x3rabBrcI0cueCdLS2l5ONUKywcRpTs02B8KP9/Cimhj7o3ZccGrPnRvcbyHMbb7W79/3MUJl7iGgTXKEw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.17" + }, + "peerDependencies": { + "@75lb/nature": "latest" + }, + "peerDependenciesMeta": { + "@75lb/nature": { + "optional": true + } + } + }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -7500,6 +7528,23 @@ "node": ">= 12.20.55" } }, + "node_modules/electron/node_modules/@types/node": { + "version": "22.15.21", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.21.tgz", + "integrity": "sha512-EV/37Td6c+MgKAbkcLG6vqZ2zEYHD7bvSrzqqs2RIhbA6w3x+Dqz8MZM3sP6kGTeLrdoOgKZe+Xja7tUB2DNkQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/electron/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, "node_modules/emoji-regex": { "version": "10.4.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", @@ -8945,6 +8990,20 @@ "reusify": "^1.0.4" } }, + "node_modules/fault": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/fault/-/fault-2.0.1.tgz", + "integrity": "sha512-WtySTkS4OKev5JtpHXnib4Gxiurzh5NCGvWrFaZ34m6JehfTUhKZvn9njTfw48t6JumVQOmrKqpmGcdwxnhqBQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "format": "^0.2.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/fd-slicer": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz", @@ -9238,6 +9297,15 @@ "node": ">= 6" } }, + "node_modules/format": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", + "integrity": "sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww==", + "dev": true, + "engines": { + "node": ">=0.4.x" + } + }, "node_modules/forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -10550,9 +10618,9 @@ } }, "node_modules/ipull": { - "version": "3.9.2", - "resolved": "https://registry.npmjs.org/ipull/-/ipull-3.9.2.tgz", - "integrity": 
"sha512-YbCDsqcf0ytc3b8304ygBlvRtKJTvyygkQX2xcmPkih6vdVKbRw13pDdtSR+vEqLql3owyuPj9m6iT6IfwFaCg==", + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/ipull/-/ipull-4.0.3.tgz", + "integrity": "sha512-mPcOnm1hX1GTL4/f1C5IQFbo1uxqKihZX8KbaHWWnJ7NW4SKQaelRAVy9iVb8XgugMnlEo6TQVBrzCbOvswbsA==", "license": "MIT", "dependencies": { "@tinyhttp/content-disposition": "^2.2.0", @@ -10573,7 +10641,8 @@ "sleep-promise": "^9.1.0", "slice-ansi": "^7.1.0", "stdout-update": "^4.0.1", - "strip-ansi": "^7.1.0" + "strip-ansi": "^7.1.0", + "uid": "^2.0.2" }, "bin": { "ipull": "dist/cli/cli.js" @@ -11808,6 +11877,16 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/markdown-title": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/markdown-title/-/markdown-title-1.0.2.tgz", + "integrity": "sha512-MqIQVVkz+uGEHi3TsHx/czcxxCbRIL7sv5K5DnYw/tI+apY54IbPefV/cmgxp6LoJSEx/TqcHdLs/298afG5QQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/marked": { "version": "12.0.2", "resolved": "https://registry.npmjs.org/marked/-/marked-12.0.2.tgz", @@ -11922,6 +12001,38 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/mdast-util-frontmatter": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-frontmatter/-/mdast-util-frontmatter-2.0.1.tgz", + "integrity": "sha512-LRqI9+wdgC25P0URIJY9vwocIzCcksduHQ9OF2joxQoyTNVduwLAFUzjoopuRJbJAReaKrNQKAZKL3uCMugWJA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "escape-string-regexp": "^5.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "micromark-extension-frontmatter": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-frontmatter/node_modules/escape-string-regexp": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", + "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/mdast-util-gfm": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", @@ -12242,6 +12353,23 @@ "micromark-util-types": "^2.0.0" } }, + "node_modules/micromark-extension-frontmatter": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-frontmatter/-/micromark-extension-frontmatter-2.0.0.tgz", + "integrity": "sha512-C4AkuM3dA58cgZha7zVnuVxBhDsbttIMiytjgsM2XbHAB2faRVaHRle40558FBN+DJcrLNCoqG5mlrpdU4cRtg==", + "dev": true, + "license": "MIT", + "dependencies": { + "fault": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/micromark-factory-destination": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", @@ -12661,6 +12789,19 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, + "node_modules/millify": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/millify/-/millify-6.1.0.tgz", + "integrity": 
"sha512-H/E3J6t+DQs/F2YgfDhxUVZz/dF8JXPPKTLHL/yHCcLZLtCXJDUaqvhJXQwqOVBvbyNn4T0WjLpIHd7PAw7fBA==", + "dev": true, + "license": "MIT", + "dependencies": { + "yargs": "^17.0.1" + }, + "bin": { + "millify": "bin/millify" + } + }, "node_modules/mime": { "version": "4.0.6", "resolved": "https://registry.npmjs.org/mime/-/mime-4.0.6.tgz", @@ -17122,6 +17263,73 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/remark": { + "version": "15.0.1", + "resolved": "https://registry.npmjs.org/remark/-/remark-15.0.1.tgz", + "integrity": "sha512-Eht5w30ruCXgFmxVUSlNWQ9iiimq07URKeFS3hNc8cUWy1llX4KDWfyEDZRycMc+znsN9Ux5/tJ/BFdgdOwA3A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-frontmatter": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/remark-frontmatter/-/remark-frontmatter-5.0.0.tgz", + "integrity": "sha512-XTFYvNASMe5iPN0719nPrdItC9aU0ssC4v14mH1BCi1u0n1gAocqcujWUrByftZTbLhRtiKRyjYTSIOcr69UVQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-frontmatter": "^2.0.0", + "micromark-extension-frontmatter": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-parse": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", + "integrity": "sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-from-markdown": "^2.0.0", + "micromark-util-types": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-stringify": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz", + "integrity": "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-to-markdown": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -19266,6 +19474,13 @@ "node": ">=0.6" } }, + "node_modules/tokenx": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/tokenx/-/tokenx-0.4.1.tgz", + "integrity": "sha512-LCMniis0WsHel07xh3K9OIt5c9Xla1awtOoWBmUHZBQR7pvTvgGFuYpLiCZWohXPC1YuZORnN0+fCVYI/ie8Jg==", + "dev": true, + "license": "MIT" + }, "node_modules/totalist": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/totalist/-/totalist-3.0.1.tgz", @@ -19661,6 +19876,18 @@ "node": ">=0.8.0" } }, + "node_modules/uid": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/uid/-/uid-2.0.2.tgz", + "integrity": "sha512-u3xV3X7uzvi5b1MncmZo3i2Aw222Zk1keqLA1YkHldREkAhAqi65wuPfe7lHx8H/Wzy+8CE7S7uS3jekIM5s8g==", + "license": "MIT", + "dependencies": { + "@lukeed/csprng": "^1.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/unbox-primitive": { "version": "1.1.0", 
"resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", @@ -19688,9 +19915,9 @@ "license": "MIT" }, "node_modules/undici-types": { - "version": "6.21.0", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", - "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "version": "6.19.8", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.19.8.tgz", + "integrity": "sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==", "dev": true, "license": "MIT" }, @@ -19781,6 +20008,22 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/unist-util-remove": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/unist-util-remove/-/unist-util-remove-4.0.0.tgz", + "integrity": "sha512-b4gokeGId57UVRX/eVKej5gXqGlc9+trkORhFJpu9raqZkZhU0zm8Doi05+HaiBsMEIJowL+2WtQ5ItjsngPXg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/unist-util-stringify-position": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", @@ -20127,6 +20370,45 @@ } } }, + "node_modules/vitepress-plugin-llms": { + "version": "1.3.4", + "resolved": "https://registry.npmjs.org/vitepress-plugin-llms/-/vitepress-plugin-llms-1.3.4.tgz", + "integrity": "sha512-owEPumKy5syjRRG0OSA2635NoeR/U+eiLIjurLTUMXxdmtJ0h6OrTLqvIFPYTV0gSQfaWY/owKdTxjZxv2n2bQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "byte-size": "^9.0.1", + "gray-matter": "^4.0.3", + "markdown-title": "^1.0.2", + "millify": "^6.1.0", + "minimatch": "^10.0.1", + "picocolors": "^1.1.1", + "remark": "^15.0.1", + "remark-frontmatter": "^5.0.0", + "tokenx": "^0.4.1", + "unist-util-remove": "^4.0.0", + "unist-util-visit": "^5.0.0" + }, + "funding": { + "url": "https://github.com/okineadev/vitepress-plugin-llms?sponsor=1" + } + }, + "node_modules/vitepress-plugin-llms/node_modules/minimatch": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz", + "integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": "20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/vitepress/node_modules/@shikijs/core": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/@shikijs/core/-/core-2.2.0.tgz", diff --git a/package.json b/package.json index 0636af08..b6b2493e 100644 --- a/package.json +++ b/package.json @@ -149,7 +149,7 @@ "@types/bytes": "^3.1.5", "@types/cross-spawn": "^6.0.6", "@types/fs-extra": "^11.0.4", - "@types/node": "^22.15.17", + "@types/node": "^20.17.50", "@types/proper-lockfile": "^4.1.4", "@types/semver": "^7.7.0", "@types/validate-npm-package-name": "^4.0.2", @@ -178,6 +178,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", + "vitepress-plugin-llms": "^1.3.4", "vitest": "^3.1.3", "zx": "^8.5.4" }, @@ -194,7 +195,7 @@ "filenamify": "^6.0.0", "fs-extra": "^11.3.0", "ignore": "^7.0.4", - "ipull": "^3.9.2", + "ipull": "^4.0.3", "is-unicode-supported": "^2.1.0", 
"lifecycle-utils": "^2.0.0", "log-symbols": "^7.0.0", @@ -210,7 +211,6 @@ "stdout-update": "^4.0.1", "strip-ansi": "^7.1.0", "validate-npm-package-name": "^6.0.0", - "vitepress-plugin-llms": "^1.3.4", "which": "^5.0.0", "yargs": "^17.7.2" }, From 39373a6997a2ea0c6cb69b4f89de39d8fea116e9 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 01:04:13 +0300 Subject: [PATCH 08/14] docs: generate a `llms.txt` file --- package-lock.json | 14 +++++++------- package.json | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/package-lock.json b/package-lock.json index ad8c83c4..871a8067 100644 --- a/package-lock.json +++ b/package-lock.json @@ -91,7 +91,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", - "vitepress-plugin-llms": "^1.3.4", + "vitepress-plugin-llms": "https://pkg.pr.new/vitepress-plugin-llms@51", "vitest": "^3.1.3", "zx": "^8.5.4" }, @@ -19475,9 +19475,9 @@ } }, "node_modules/tokenx": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/tokenx/-/tokenx-0.4.1.tgz", - "integrity": "sha512-LCMniis0WsHel07xh3K9OIt5c9Xla1awtOoWBmUHZBQR7pvTvgGFuYpLiCZWohXPC1YuZORnN0+fCVYI/ie8Jg==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/tokenx/-/tokenx-1.0.1.tgz", + "integrity": "sha512-MhOngUHRuVE0CHP4cNEZ/XpdXETFL65nJpEvoTW+VYPuXsT/MTeNj+UNnekNsnxecmj2DEvUYPebqz+CsPTUSg==", "dev": true, "license": "MIT" }, @@ -20372,8 +20372,8 @@ }, "node_modules/vitepress-plugin-llms": { "version": "1.3.4", - "resolved": "https://registry.npmjs.org/vitepress-plugin-llms/-/vitepress-plugin-llms-1.3.4.tgz", - "integrity": "sha512-owEPumKy5syjRRG0OSA2635NoeR/U+eiLIjurLTUMXxdmtJ0h6OrTLqvIFPYTV0gSQfaWY/owKdTxjZxv2n2bQ==", + "resolved": "https://pkg.pr.new/vitepress-plugin-llms@51", + "integrity": "sha512-FTyNYyx1jVbKae/raJLgDTgMaHSmY51B1nbokeC4KAhXMe413eGSexNIdvnCHXf9U1t92VlLajJ5S9E7adDoOQ==", "dev": true, "license": "MIT", "dependencies": { @@ -20385,7 +20385,7 @@ "picocolors": "^1.1.1", "remark": "^15.0.1", "remark-frontmatter": "^5.0.0", - "tokenx": "^0.4.1", + "tokenx": "^1.0.0", "unist-util-remove": "^4.0.0", "unist-util-visit": "^5.0.0" }, diff --git a/package.json b/package.json index b6b2493e..4c1a6865 100644 --- a/package.json +++ b/package.json @@ -178,7 +178,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", - "vitepress-plugin-llms": "^1.3.4", + "vitepress-plugin-llms": "https://pkg.pr.new/vitepress-plugin-llms@51", "vitest": "^3.1.3", "zx": "^8.5.4" }, From 06c0d60c984662ce84835073bd3f8807121abdcf Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 01:14:30 +0300 Subject: [PATCH 09/14] fix: naming consistency --- docs/guide/chat-session.md | 4 ++-- src/cli/commands/ChatCommand.ts | 16 ++++++++-------- ...ingBudget.test.ts => reasoningBudget.test.ts} | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 deletions(-) rename test/modelDependent/qwen3-0.6b/{thinkingBudget.test.ts => reasoningBudget.test.ts} (88%) diff --git a/docs/guide/chat-session.md b/docs/guide/chat-session.md index ec9c8541..a6a1a097 100644 --- a/docs/guide/chat-session.md +++ b/docs/guide/chat-session.md @@ -899,8 +899,8 @@ const fullResponse = a1.response console.log("Full response: " + fullResponse); ``` -## Set Thinking Budget {#thinking-budget} -You can set a thinking budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments). 
+## Set Reasoning Budget {#reasoning-budget} +You can set a reasoning budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments). ```typescript import { getLlama, LlamaChatSession, resolveModelFile, Token diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index d1ebc8e1..a23e58e5 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -62,7 +62,7 @@ type ChatCommand = { repeatFrequencyPenalty?: number, repeatPresencePenalty?: number, maxTokens: number, - thoughtBudget?: number, + reasoningBudget?: number, noHistory: boolean, environmentFunctions: boolean, tokenPredictionDraftModel?: string, @@ -263,8 +263,8 @@ export const ChatCommand: CommandModule = { default: 0, description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size" }) - .option("thoughtBudget", { - alias: ["tb", "thinkingBudget", "reasoningBudget"], + .option("reasoningBudget", { + alias: ["tb", "thinkingBudget", "thoughtsBudget"], type: "number", default: -1, defaultDescription: "Unlimited", @@ -326,7 +326,7 @@ export const ChatCommand: CommandModule = { promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }) { try { @@ -335,7 +335,7 @@ export const ChatCommand: CommandModule = { batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, + maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }); } catch (err) { @@ -352,12 +352,12 @@ async function RunChat({ contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; - if (thoughtBudget === -1) thoughtBudget = undefined; + if (reasoningBudget === -1) reasoningBudget = undefined; const headers = resolveHeaderFlag(headerArg); const trimWhitespace = !noTrimWhitespace; @@ -696,7 +696,7 @@ async function RunChat({ signal: abortController.signal, stopOnAbortSignal: true, budgets: { - thoughtTokens: thoughtBudget + thoughtTokens: 
reasoningBudget }, repeatPenalty: { penalty: repeatPenalty, diff --git a/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts b/test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts similarity index 88% rename from test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts rename to test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts index 35522794..78cf5480 100644 --- a/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts +++ b/test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts @@ -4,8 +4,8 @@ import {getModelFile} from "../../utils/modelFiles.js"; import {getTestLlama} from "../../utils/getTestLlama.js"; describe("qwen3 0.6b", () => { - describe("thinking budget", () => { - test("doesn't exceed thinking budget", {timeout: 1000 * 60 * 60 * 2}, async () => { + describe("reasoning budget", () => { + test("doesn't exceed reasoning budget", {timeout: 1000 * 60 * 60 * 2}, async () => { const modelPath = await getModelFile("Qwen3-0.6B-Q8_0.gguf"); const llama = await getTestLlama(); @@ -22,9 +22,9 @@ describe("qwen3 0.6b", () => { const initialChatHistory = chatSession.getChatHistory(); async function promptWithBudget({ - prompt, maxTokens, thinkingBudget + prompt, maxTokens, reasoningBudget }: { - prompt: string, maxTokens: number, thinkingBudget?: number + prompt: string, maxTokens: number, reasoningBudget?: number }) { let thoughtTokens = 0; let totalTokens = 0; @@ -33,7 +33,7 @@ describe("qwen3 0.6b", () => { const {responseText, response} = await chatSession.promptWithMeta(prompt, { maxTokens, budgets: { - thoughtTokens: thinkingBudget + thoughtTokens: reasoningBudget }, onResponseChunk(chunk) { if (chunk.type === "segment" && chunk.segmentType === "thought") { @@ -57,7 +57,7 @@ describe("qwen3 0.6b", () => { const res1 = await promptWithBudget({ prompt: "Where do llamas come from?", - thinkingBudget: 10, + reasoningBudget: 10, maxTokens: 20 }); expect(res1.thoughtTokens).to.be.gt(1); @@ -67,7 +67,7 @@ describe("qwen3 0.6b", () => { const res2 = await promptWithBudget({ prompt: "Where do llamas come from?", - thinkingBudget: 0, + reasoningBudget: 0, maxTokens: 20 }); expect(res2.thoughtTokens).to.be.eq(0); @@ -76,7 +76,7 @@ describe("qwen3 0.6b", () => { const res3 = await promptWithBudget({ prompt: "Where do llamas come from?", - thinkingBudget: 20, + reasoningBudget: 20, maxTokens: 20 }); expect(res3.thoughtTokens).to.be.eq(res3.totalTokens); From 119652618b0f575f04749988bfd5b9c4c6afb1b3 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 03:36:08 +0300 Subject: [PATCH 10/14] fix: bugs --- llama/addon/AddonContext.cpp | 2 +- src/evaluator/LlamaContext/LlamaContext.ts | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index a64e3ada..775c2053 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -403,7 +403,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Uint32Value(); + context_params.n_batch = options.Get("batchSize").As().Uint32Value() + 1; // +1 to handle edge cases with SWA KV cache context_params.n_ubatch = context_params.n_batch; // the batch queue is managed in the JS side, so there's no need for managing it on the C++ side } diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index f1c263be..d3af97e0 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -1055,7 +1055,9 @@ export class LlamaContextSequence { * * 
This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models). * - * When SWA is used, this index will usually be `Math.max(0, .nextTokenIndex - .model.fileInsights.swaSize)` or larger. + * When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger. + * + * When the KV cache is empty, this index will be `-1`. * * You can disable SWA by setting the `swaFullCache` option to `true` when creating a context. */ @@ -1207,6 +1209,8 @@ export class LlamaContextSequence { ) { this._ensureNotDisposed(); + let awaitPromise: Promise | undefined; + await withLock(this._context, "context", async () => { this._ensureNotDisposed(); @@ -1250,7 +1254,7 @@ export class LlamaContextSequence { const minKvCachePosition = (this._contextTokens.length === 0 && this._loadedTokenPredictions.length === 0) ? 0 - : this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId); + : Math.max(0, this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId)); if (resolvedRanges[0] != null && resolvedRanges[0].start <= minKvCachePosition) // we have to drop the cache and reevaluate the sequence due to missing KV cache deletionSuccessful = false; @@ -1310,8 +1314,12 @@ export class LlamaContextSequence { this._nextTokenIndex = 0; this._context._ctx.disposeSequence(this._sequenceId); - await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock}); + // wait for the evaluation outside the "context" lock to avoid deadlocks + awaitPromise = this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock}); }); + + if (awaitPromise != null) + await awaitPromise; } /** From 2a0c539a90cf51906fe70b8cec1745a221678517 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 03:42:48 +0300 Subject: [PATCH 11/14] test: fix test --- test/modelDependent/llama3.2/sequenceState.test.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index 151fc4f3..e6267045 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -34,10 +34,10 @@ describe("llama 3.2", () => { res1, res2 ] = await Promise.all([ - chatSession1.prompt("Remember: locks are not doors", {maxTokens: 6}), + chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}), chatSession2.prompt("Remember: giraffes are not elephants", {maxTokens: 5}) ]); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever phrase.\""); + expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); expect(res2).to.toMatchInlineSnapshot('"I appreciate the reminder."'); @@ -47,8 +47,8 @@ describe("llama 3.2", () => { test.onTestFinished(() => fs.remove(stateFile1Path)); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); - expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.49MB"'); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); + expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot("\"11.27MB\""); const stateFile2Path = await getTempTestFilePath("state2"); @@ -68,7 +68,7 @@ describe("llama 3.2", () => { expect(contextSequence1TokensState1).toMatchInlineSnapshot(` { "usedInputTokens": 99, - "usedOutputTokens": 6, + "usedOutputTokens": 4, } `); @@ -91,7 +91,7 @@ 
describe("llama 3.2", () => { await contextSequence1.loadStateFromFile(stateFile1Path, {acceptRisk: true}); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); const contextSequence1TokensState3 = contextSequence1.tokenMeter.getState(); expect(TokenMeter.diff(contextSequence1TokensState3, contextSequence1TokensState2)).toMatchInlineSnapshot(` From 8d85100bf190153fa35f24d8e4186b665c5203fc Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 03:51:58 +0300 Subject: [PATCH 12/14] fix: bugs --- llama/addon/AddonContext.cpp | 2 +- src/evaluator/LlamaContext/LlamaContext.ts | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 775c2053..a64e3ada 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -403,7 +403,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Uint32Value() + 1; // +1 to handle edge cases with SWA KV cache + context_params.n_batch = options.Get("batchSize").As().Uint32Value(); context_params.n_ubatch = context_params.n_batch; // the batch queue is managed in the JS side, so there's no need for managing it on the C++ side } diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index d3af97e0..8a5cff98 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -125,7 +125,11 @@ export class LlamaContext { this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells - batchSize: this._batchSize, + batchSize: this._batchSize + ( + (!this._swaFullCache && this.model.fileInsights.swaSize != null && this.model.fileInsights.swaSize > 0) + ? 
1 // +1 to handle edge cases with SWA KV cache + : 0 + ), sequences: this._totalSequences, flashAttention: this._flashAttention, threads: this._idealThreads, From 5c95321e447e25cbdd4ff9b46a2de02005b67db2 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 23:33:06 +0300 Subject: [PATCH 13/14] chore: add internal debug method --- llama/addon/AddonContext.cpp | 14 ++++++++++++++ llama/addon/AddonContext.h | 1 + src/bindings/AddonTypes.ts | 1 + src/bindings/getLlama.ts | 1 + 4 files changed, 17 insertions(+) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index a64e3ada..574dd79f 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -638,6 +638,19 @@ Napi::Value AddonContext::GetSequenceKvCacheMinPosition(const Napi::CallbackInfo return Napi::Number::New(info.Env(), minPosition); } +Napi::Value AddonContext::GetSequenceKvCacheMaxPosition(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + int32_t sequenceId = info[0].As().Int32Value(); + + + const auto maxPosition = llama_kv_self_seq_pos_max(ctx, sequenceId); + + return Napi::Number::New(info.Env(), maxPosition); +} Napi::Value AddonContext::DecodeBatch(const Napi::CallbackInfo& info) { AddonContextDecodeBatchWorker* worker = new AddonContextDecodeBatchWorker(info.Env(), this); worker->Queue(); @@ -945,6 +958,7 @@ void AddonContext::init(Napi::Object exports) { InstanceMethod("removeTokenCellsFromSequence", &AddonContext::RemoveTokenCellsFromSequence), InstanceMethod("shiftSequenceTokenCells", &AddonContext::ShiftSequenceTokenCells), InstanceMethod("getSequenceKvCacheMinPosition", &AddonContext::GetSequenceKvCacheMinPosition), + InstanceMethod("getSequenceKvCacheMaxPosition", &AddonContext::GetSequenceKvCacheMaxPosition), InstanceMethod("decodeBatch", &AddonContext::DecodeBatch), InstanceMethod("sampleToken", &AddonContext::SampleToken), InstanceMethod("getEmbedding", &AddonContext::GetEmbedding), diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h index 0edbedc7..7e661f12 100644 --- a/llama/addon/AddonContext.h +++ b/llama/addon/AddonContext.h @@ -37,6 +37,7 @@ class AddonContext : public Napi::ObjectWrap { Napi::Value RemoveTokenCellsFromSequence(const Napi::CallbackInfo& info); Napi::Value ShiftSequenceTokenCells(const Napi::CallbackInfo& info); Napi::Value GetSequenceKvCacheMinPosition(const Napi::CallbackInfo& info); + Napi::Value GetSequenceKvCacheMaxPosition(const Napi::CallbackInfo& info); Napi::Value DecodeBatch(const Napi::CallbackInfo& info); Napi::Value SampleToken(const Napi::CallbackInfo& info); diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index e74a1132..a2f06ae9 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -146,6 +146,7 @@ export type AddonContext = { shiftSequenceTokenCells(sequenceId: number, startPos: number, endPos: number, shiftDelta: number): void, getSequenceKvCacheMinPosition(sequenceId: number): number, + getSequenceKvCacheMaxPosition(sequenceId: number): number, getEmbedding(inputTokensLength: number, maxVectorSize?: number): Float64Array, getStateSize(): number, getThreads(): number, diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index faf626b4..8ba71a22 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -365,6 +365,7 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp return 
getLlamaForOptions(options ?? {}); } +// internal export async function getLlamaForOptions({ gpu = defaultLlamaCppGpuSupport, logLevel = defaultLlamaCppLogLevel, From 83dfa5e2d420b61a2dcf21950e76b6046314760e Mon Sep 17 00:00:00 2001 From: Gilad S Date: Thu, 5 Jun 2025 00:28:04 +0300 Subject: [PATCH 14/14] chore: module versions --- package-lock.json | 32 +++++--------------------------- package.json | 2 +- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/package-lock.json b/package-lock.json index 871a8067..2a79518a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,7 +22,7 @@ "filenamify": "^6.0.0", "fs-extra": "^11.3.0", "ignore": "^7.0.4", - "ipull": "^4.0.3", + "ipull": "^3.9.2", "is-unicode-supported": "^2.1.0", "lifecycle-utils": "^2.0.0", "log-symbols": "^7.0.0", @@ -2232,15 +2232,6 @@ "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==", "license": "MIT" }, - "node_modules/@lukeed/csprng": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@lukeed/csprng/-/csprng-1.1.0.tgz", - "integrity": "sha512-Z7C/xXCiGWsg0KuKsHTKJxbWhpI3Vs5GwLfOean7MGyVFGqdRgBbAjOCh6u4bbjPc/8MJ2pZmK/0DLdCbivLDA==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, "node_modules/@modelcontextprotocol/sdk": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.11.1.tgz", @@ -10618,9 +10609,9 @@ } }, "node_modules/ipull": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/ipull/-/ipull-4.0.3.tgz", - "integrity": "sha512-mPcOnm1hX1GTL4/f1C5IQFbo1uxqKihZX8KbaHWWnJ7NW4SKQaelRAVy9iVb8XgugMnlEo6TQVBrzCbOvswbsA==", + "version": "3.9.2", + "resolved": "https://registry.npmjs.org/ipull/-/ipull-3.9.2.tgz", + "integrity": "sha512-YbCDsqcf0ytc3b8304ygBlvRtKJTvyygkQX2xcmPkih6vdVKbRw13pDdtSR+vEqLql3owyuPj9m6iT6IfwFaCg==", "license": "MIT", "dependencies": { "@tinyhttp/content-disposition": "^2.2.0", @@ -10641,8 +10632,7 @@ "sleep-promise": "^9.1.0", "slice-ansi": "^7.1.0", "stdout-update": "^4.0.1", - "strip-ansi": "^7.1.0", - "uid": "^2.0.2" + "strip-ansi": "^7.1.0" }, "bin": { "ipull": "dist/cli/cli.js" @@ -19876,18 +19866,6 @@ "node": ">=0.8.0" } }, - "node_modules/uid": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/uid/-/uid-2.0.2.tgz", - "integrity": "sha512-u3xV3X7uzvi5b1MncmZo3i2Aw222Zk1keqLA1YkHldREkAhAqi65wuPfe7lHx8H/Wzy+8CE7S7uS3jekIM5s8g==", - "license": "MIT", - "dependencies": { - "@lukeed/csprng": "^1.0.0" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/unbox-primitive": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", diff --git a/package.json b/package.json index 4c1a6865..cc455955 100644 --- a/package.json +++ b/package.json @@ -195,7 +195,7 @@ "filenamify": "^6.0.0", "fs-extra": "^11.3.0", "ignore": "^7.0.4", - "ipull": "^4.0.3", + "ipull": "^3.9.2", "is-unicode-supported": "^2.1.0", "lifecycle-utils": "^2.0.0", "log-symbols": "^7.0.0",
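
A minimal usage sketch of the SWA support introduced in this patch series (illustrative only, not part of the patches). It assumes the `swaFullCache` option is exposed through `model.createContext()` and that `swaSize` is surfaced on `model.fileInsights`, as the diffs above suggest; the model path is a placeholder.

    import {getLlama} from "node-llama-cpp";

    const llama = await getLlama();
    const model = await llama.loadModel({modelPath: "./model.gguf"}); // placeholder path

    // On supported models, SWA (Sliding Window Attention) is used by default,
    // so only roughly the last `swaSize` positions are kept in the KV cache.
    // Setting `swaFullCache: true` opts out and keeps the full cache.
    const context = await model.createContext({
        contextSize: 8192,
        swaFullCache: false
    });

    const sequence = context.getSequence();

    if (model.fileInsights.swaSize != null && model.fileInsights.swaSize > 0)
        console.log("SWA window size:", model.fileInsights.swaSize);

    // With SWA active, token cells below roughly
    // `Math.max(0, sequence.nextTokenIndex - model.fileInsights.swaSize)`
    // may already have been evicted from the KV cache; the internal
    // `getSequenceKvCacheMinPosition(sequenceId)` binding added in these patches
    // reports the exact minimum position still present for a sequence.

Opting out via `swaFullCache` trades additional KV cache memory for the ability to keep reusing older cache positions, which is why the default leaves SWA enabled on models that support it.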