From cabafeab69900f035a5730e6c12a574ca8774a08 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 02:05:39 +0300 Subject: [PATCH 01/14] feat: SWA support --- llama/addon/AddonContext.cpp | 19 ++ llama/addon/AddonContext.h | 1 + llama/addon/addon.cpp | 14 ++ src/bindings/AddonTypes.ts | 5 +- src/cli/commands/ChatCommand.ts | 19 +- src/cli/commands/CompleteCommand.ts | 17 +- src/cli/commands/InfillCommand.ts | 17 +- .../commands/InspectEstimateCommand.ts | 14 +- .../inspect/commands/InspectMeasureCommand.ts | 37 ++- src/cli/utils/interactivelyAskForModel.ts | 14 +- src/cli/utils/resolveCommandGgufPath.ts | 5 +- src/evaluator/LlamaContext/LlamaContext.ts | 53 +++- src/evaluator/LlamaContext/types.ts | 16 ++ src/evaluator/LlamaModel/LlamaModel.ts | 25 +- src/gguf/insights/GgufInsights.ts | 234 +++++++++++------- .../GgufInsightsConfigurationResolver.ts | 19 +- .../utils/resolveContextContextSizeOption.ts | 6 +- .../utils/resolveModelGpuLayersOption.ts | 26 +- src/gguf/types/GgufMetadataTypes.ts | 1 + 19 files changed, 404 insertions(+), 138 deletions(-) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 1f8a8726..a64e3ada 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -393,6 +393,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap 1 && info[1].IsObject()) { Napi::Object options = info[1].As(); @@ -433,6 +434,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value()); } + + if (options.Has("swaFullCache")) { + context_params.swa_full = options.Get("swaFullCache").As().Value(); + } } } AddonContext::~AddonContext() { @@ -620,6 +625,19 @@ Napi::Value AddonContext::ShiftSequenceTokenCells(const Napi::CallbackInfo& info return info.Env().Undefined(); } +Napi::Value AddonContext::GetSequenceKvCacheMinPosition(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + int32_t sequenceId = info[0].As().Int32Value(); + + + const auto minPosition = llama_kv_self_seq_pos_min(ctx, sequenceId); + + return Napi::Number::New(info.Env(), minPosition); +} Napi::Value AddonContext::DecodeBatch(const Napi::CallbackInfo& info) { AddonContextDecodeBatchWorker* worker = new AddonContextDecodeBatchWorker(info.Env(), this); worker->Queue(); @@ -926,6 +944,7 @@ void AddonContext::init(Napi::Object exports) { InstanceMethod("disposeSequence", &AddonContext::DisposeSequence), InstanceMethod("removeTokenCellsFromSequence", &AddonContext::RemoveTokenCellsFromSequence), InstanceMethod("shiftSequenceTokenCells", &AddonContext::ShiftSequenceTokenCells), + InstanceMethod("getSequenceKvCacheMinPosition", &AddonContext::GetSequenceKvCacheMinPosition), InstanceMethod("decodeBatch", &AddonContext::DecodeBatch), InstanceMethod("sampleToken", &AddonContext::SampleToken), InstanceMethod("getEmbedding", &AddonContext::GetEmbedding), diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h index 933ba8f0..0edbedc7 100644 --- a/llama/addon/AddonContext.h +++ b/llama/addon/AddonContext.h @@ -36,6 +36,7 @@ class AddonContext : public Napi::ObjectWrap { Napi::Value DisposeSequence(const Napi::CallbackInfo& info); Napi::Value RemoveTokenCellsFromSequence(const Napi::CallbackInfo& info); Napi::Value ShiftSequenceTokenCells(const Napi::CallbackInfo& info); + Napi::Value GetSequenceKvCacheMinPosition(const Napi::CallbackInfo& info); Napi::Value DecodeBatch(const 
Napi::CallbackInfo& info); Napi::Value SampleToken(const Napi::CallbackInfo& info); diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index 943866c0..eef81c25 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -73,6 +73,19 @@ Napi::Value addonGetTypeSizeForGgmlType(const Napi::CallbackInfo& info) { return Napi::Number::New(info.Env(), typeSize); } +Napi::Value addonGetGgmlGraphOverheadCustom(const Napi::CallbackInfo& info) { + if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsBoolean()) { + return Napi::Number::New(info.Env(), 0); + } + + const size_t size = info[0].As().Uint32Value(); + const bool grads = info[1].As().Value(); + + const auto graphOverhead = ggml_graph_overhead_custom(size, grads); + + return Napi::Number::New(info.Env(), graphOverhead); +} + Napi::Value addonGetConsts(const Napi::CallbackInfo& info) { Napi::Object consts = Napi::Object::New(info.Env()); consts.Set("ggmlMaxDims", Napi::Number::New(info.Env(), GGML_MAX_DIMS)); @@ -231,6 +244,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { Napi::PropertyDescriptor::Function("getMathCores", addonGetMathCores), Napi::PropertyDescriptor::Function("getBlockSizeForGgmlType", addonGetBlockSizeForGgmlType), Napi::PropertyDescriptor::Function("getTypeSizeForGgmlType", addonGetTypeSizeForGgmlType), + Napi::PropertyDescriptor::Function("getGgmlGraphOverheadCustom", addonGetGgmlGraphOverheadCustom), Napi::PropertyDescriptor::Function("getConsts", addonGetConsts), Napi::PropertyDescriptor::Function("setLogger", setLogger), Napi::PropertyDescriptor::Function("setLoggerLogLevel", setLoggerLogLevel), diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index b1f3ca0b..e74a1132 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -28,7 +28,8 @@ export type BindingModule = { embeddings?: boolean, ranking?: boolean, threads?: number, - performanceTracking?: boolean + performanceTracking?: boolean, + swaFullCache?: boolean }): AddonContext }, AddonGrammar: { @@ -54,6 +55,7 @@ export type BindingModule = { getMathCores(): number, getBlockSizeForGgmlType(ggmlType: number): number | undefined, getTypeSizeForGgmlType(ggmlType: number): number | undefined, + getGgmlGraphOverheadCustom(size: number, grads: boolean): number, getConsts(): { ggmlMaxDims: number, ggmlTypeF16Size: number, @@ -143,6 +145,7 @@ export type AddonContext = { // startPos in inclusive, endPos is exclusive shiftSequenceTokenCells(sequenceId: number, startPos: number, endPos: number, shiftDelta: number): void, + getSequenceKvCacheMinPosition(sequenceId: number): number, getEmbedding(inputTokensLength: number, maxVectorSize?: number): Float64Array, getStateSize(): number, getThreads(): number, diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 79a71c65..26e85cdd 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -45,6 +45,7 @@ type ChatCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], jsonSchemaGrammarFile?: string, @@ -162,6 +163,12 @@ export const ChatCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("noTrimWhitespace", { type: "boolean", alias: ["noTrim"], @@ -308,7 +315,7 @@ export const 
ChatCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, - promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, + promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, @@ -317,7 +324,8 @@ export const ChatCommand: CommandModule = { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, + batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, + temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -333,7 +341,8 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, flashAttention, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, + contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, + jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -363,11 +372,13 @@ async function RunChat({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -413,6 +424,7 @@ async function RunChat({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -446,6 +458,7 @@ async function RunChat({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index f8c7790e..1aae93fd 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -32,6 +32,7 @@ type CompleteCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, threads?: number, temperature: number, minP: number, @@ -119,6 +120,12 @@ export const CompleteCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -235,14 +242,14 @@ export const CompleteCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - flashAttention, threads, temperature, minP, topK, + flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }) { try { await RunCompletion({ - modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -257,7 +264,7 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings @@ -286,11 +293,13 @@ async function RunCompletion({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -329,6 +338,7 @@ async function RunCompletion({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -362,6 +372,7 @@ async function RunCompletion({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 7a4a536b..a47df068 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -34,6 +34,7 @@ type InfillCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, threads?: number, temperature: number, minP: number, @@ -129,6 +130,12 @@ export const InfillCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -245,7 +252,7 @@ export const InfillCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - flashAttention, threads, temperature, minP, topK, + flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -253,7 +260,7 @@ export const InfillCommand: CommandModule = { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, + swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }); @@ -268,7 +275,7 @@ export const InfillCommand: CommandModule = { async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - threads, temperature, minP, topK, topP, seed, gpuLayers, + swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings }: InfillCommand) { @@ -296,11 +303,13 @@ async function RunInfill({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -353,6 +362,7 @@ async function RunInfill({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -386,6 +396,7 @@ async function RunInfill({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index db34de6d..ffd5f65e 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -32,7 +32,8 @@ type InspectEstimateCommand = { gpuLayers?: number | "max", contextSize?: number | "train", embedding?: boolean, - noMmap?: boolean + noMmap?: boolean, + swaFullCache?: boolean }; export const InspectEstimateCommand: CommandModule = { @@ -115,10 +116,16 @@ export const InspectEstimateCommand: CommandModule default: false, description: "Enable flash attention for the context" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("measures", { alias: "n", type: "number", @@ -140,8 +147,8 @@ export const InspectMeasureCommand: CommandModule }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, - memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText + modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, + measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; if (maxContextSize === -1) maxContextSize = undefined; @@ -162,7 +169,7 @@ export const InspectMeasureCommand: CommandModule const useMmap = !noMmap && llama.supportsMmap; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, { - flashAttention, useMmap + flashAttention, swaFullCache, useMmap }); console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`); @@ -216,6 +223,7 @@ export const InspectMeasureCommand: CommandModule maxContextSize, minContextSize, flashAttention, + swaFullCache, tests: measures, evaluateText: evaluateText == null ? 
undefined @@ -286,7 +294,8 @@ export const InspectMeasureCommand: CommandModule : ggufInsights.estimateContextResourceRequirements({ contextSize: previousContextSizeCheck, modelGpuLayers: lastGpuLayers, - flashAttention + flashAttention, + swaFullCache }); const contextVramEstimation = contextResourceEstimation?.gpuVram; @@ -496,7 +505,7 @@ const expectedFileName = "InspectMeasureCommand"; async function measureModel({ modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, - evaluateText, exitAfterMeasurement = false, onInfo + swaFullCache, evaluateText, exitAfterMeasurement = false, onInfo }: { modelPath: string, useMmap?: boolean, @@ -508,6 +517,7 @@ async function measureModel({ maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + swaFullCache?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean, onInfo(data: { @@ -615,6 +625,7 @@ async function measureModel({ maxGpuLayers, minGpuLayers, flashAttention, + swaFullCache, evaluateText, exitAfterMeasurement } satisfies ParentToChildMessage); @@ -716,11 +727,12 @@ async function runTestWorkerLogic() { } async function testContextSizes({ - model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText, - exitAfterMeasurement = false + model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, swaFullCache, + evaluateText, exitAfterMeasurement = false }: { model: LlamaModel, modelVramUsage: number, modelRamUsage: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, tests: number, flashAttention?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean + minContextSize?: number, tests: number, flashAttention?: boolean, swaFullCache?: boolean, evaluateText?: string, + exitAfterMeasurement?: boolean }) { let measurementsDone: number = 0; const contextSizeCheckPlan = getContextSizesCheckPlan( @@ -750,6 +762,7 @@ async function runTestWorkerLogic() { ), ignoreMemorySafetyChecks: currentContextSizeCheck != null, flashAttention, + swaFullCache, failedCreationRemedy: false }); @@ -803,11 +816,11 @@ async function runTestWorkerLogic() { } async function testWithGpuLayers({ - modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText, + modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, evaluateText, exitAfterMeasurement = false }: { modelPath: string, useMmap?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, flashAttention?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean + minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean }) { try { const preModelVramUsage = (await llama.getVramState()).used; @@ -817,6 +830,7 @@ async function runTestWorkerLogic() { useMmap, gpuLayers, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: true }); const postModelVramUsage = (await llama.getVramState()).used; @@ -839,6 +853,7 @@ async function runTestWorkerLogic() { maxContextSize, minContextSize, flashAttention, + swaFullCache, tests, evaluateText, exitAfterMeasurement @@ -887,6 +902,7 @@ async function runTestWorkerLogic() { maxContextSize: 
message.maxContextSize, minContextSize: message.minContextSize, flashAttention: message.flashAttention, + swaFullCache: message.swaFullCache, evaluateText: message.evaluateText, exitAfterMeasurement: message.exitAfterMeasurement }); @@ -976,6 +992,7 @@ type ParentToChildMessage = { maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + swaFullCache?: boolean, initialMaxContextSize?: number, maxContextSize?: number, minContextSize?: number, diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index 7ceb9773..a896a5ce 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -60,6 +60,7 @@ export async function interactivelyAskForModel({ allowLocalModels = true, downloadIntent = true, flashAttention = false, + swaFullCache = false, useMmap }: { llama: Llama, @@ -67,6 +68,7 @@ export async function interactivelyAskForModel({ allowLocalModels?: boolean, downloadIntent?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, useMmap?: boolean }): Promise { let localModelFileOptions: (ModelOption & {type: "localModel"})[] = []; @@ -120,6 +122,7 @@ export async function interactivelyAskForModel({ const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, + swaFullCache: swaFullCache, useMmap }); @@ -292,7 +295,9 @@ export async function interactivelyAskForModel({ }, items: options, renderItem(item, focused, rerender) { - return renderSelectionItem(item, focused, rerender, activeInteractionController.signal, llama, flashAttention, useMmap); + return renderSelectionItem( + item, focused, rerender, activeInteractionController.signal, llama, flashAttention, swaFullCache, useMmap + ); }, canFocusItem(item) { return item.type === "recommendedModel" || item.type === "localModel" || item.type === "action"; @@ -408,7 +413,7 @@ async function askForModelUriOrPath(allowLocalModels: boolean): Promise void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean, - useMmap?: boolean + swaFullCache: boolean, useMmap?: boolean ) { if (item.type === "localModel") { let modelText = item.title instanceof Function @@ -435,6 +440,7 @@ function renderSelectionItem( rerenderOption: rerender, llama, flashAttention, + swaFullCache, useMmap }); } @@ -557,13 +563,14 @@ function renderRecommendedModelTechnicalInfo( } async function selectFileForModelRecommendation({ - recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, useMmap + recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, swaFullCache, useMmap }: { recommendedModelOption: ModelOption & {type: "recommendedModel"}, llama: Llama, abortSignal: AbortSignal, rerenderOption(): void, flashAttention: boolean, + swaFullCache: boolean, useMmap?: boolean }) { try { @@ -586,6 +593,7 @@ async function selectFileForModelRecommendation({ const compatibilityScore = await ggufInsights.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention, + swaFullCache, useMmap }); diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts index 7b04b0ce..219d1808 100644 --- a/src/cli/utils/resolveCommandGgufPath.ts +++ b/src/cli/utils/resolveCommandGgufPath.ts @@ -13,9 +13,9 @@ import {getReadablePath} from "./getReadablePath.js"; import {interactivelyAskForModel} from "./interactivelyAskForModel.js"; export async function 
resolveCommandGgufPath(ggufPath: string | undefined, llama: Llama, fetchHeaders?: Record, { - targetDirectory = cliModelsDirectory, flashAttention = false, useMmap, consoleTitle = "File" + targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File" }: { - targetDirectory?: string, flashAttention?: boolean, useMmap?: boolean, consoleTitle?: string + targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string } = {}) { if (ggufPath == null) ggufPath = await interactivelyAskForModel({ @@ -24,6 +24,7 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama allowLocalModels: true, downloadIntent: true, flashAttention, + swaFullCache, useMmap }); diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index e5797a4f..f1c263be 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -53,6 +53,7 @@ export class LlamaContext { /** @internal */ private readonly _totalSequences: number; /** @internal */ private readonly _unusedSequenceIds: number[] = []; /** @internal */ private readonly _batchingOptions: Required; + /** @internal */ private readonly _swaFullCache: boolean = false; /** @internal */ private readonly _queuedDecodeSequenceIds = new Set(); /** @internal */ private readonly _queuedDecodes: InternalQueuedDecode[] = []; /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); @@ -84,6 +85,7 @@ export class LlamaContext { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, + swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, _embeddings, _ranking @@ -120,6 +122,7 @@ export class LlamaContext { : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 1) ); this._performanceTracking = !!performanceTracking; + this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells batchSize: this._batchSize, @@ -128,7 +131,8 @@ export class LlamaContext { threads: this._idealThreads, embeddings: _embeddings, ranking: _ranking, - performanceTracking: this._performanceTracking + performanceTracking: this._performanceTracking, + swaFullCache: this._swaFullCache })); this._batchingOptions = { dispatchSchedule: batchingDispatchSchedule, @@ -783,6 +787,7 @@ export class LlamaContext { const flashAttention = _model.flashAttentionSupported ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) : false; + const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache; const loraOptions = typeof options.lora === "string" ? 
{adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"] : options.lora satisfies LlamaContextOptions["lora"]; @@ -799,6 +804,7 @@ export class LlamaContext { modelGpuLayers: _model.gpuLayers, modelTrainContextSize: _model.trainContextSize, flashAttention, + swaFullCache, getVramState: () => _model._llama._vramOrchestrator.getMemoryState(), llamaGpu: _model._llama.gpu, ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks, @@ -821,10 +827,11 @@ export class LlamaContext { isEmbeddingContext: options._embeddings, modelGpuLayers: _model.gpuLayers, batchSize, - flashAttention + flashAttention, + swaFullCache }); - const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention}); + const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention, swaFullCache}); const contextCreationVramReservation = options.ignoreMemorySafetyChecks ? null : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); @@ -1035,6 +1042,29 @@ export class LlamaContextSequence { return this._tokenPredictor; } + /** + * Get the index of the first token in the KV cache. + * + * If you remove any tokens from the state that come before this index, + * no cached prefix tokens evaluation state will be used for the next evaluation. + * + * For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}` + * then the cached state for range `0-10` will be used in the next evaluation, + * but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all + * and will be re-evaluated in the next evaluation. + * + * This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models). + * + * When SWA is used, this index will usually be `Math.max(0, .nextTokenIndex - .model.fileInsights.swaSize)` or larger. + * + * You can disable SWA by setting the `swaFullCache` option to `true` when creating a context. + */ + public get stateCellsStartIndex() { + this._ensureNotDisposed(); + + return this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId); + } + /** * Statistics of token predictions using the sequence's `tokenPredictor`. * @@ -1218,6 +1248,13 @@ export class LlamaContextSequence { return ranges; }, [] as ContextTokensDeleteRange[]); + const minKvCachePosition = (this._contextTokens.length === 0 && this._loadedTokenPredictions.length === 0) + ? 0 + : this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId); + if (resolvedRanges[0] != null && resolvedRanges[0].start <= minKvCachePosition) + // we have to drop the cache and reevaluate the sequence due to missing KV cache + deletionSuccessful = false; + const tokenPredictionsToRemove = (resolvedRanges.length > 0 && canRemovePredictionTokens) ? this._loadedTokenPredictions.length : 0; @@ -1578,12 +1615,13 @@ export class LlamaContextSequence { } } + /* eslint-disable @stylistic/max-len */ /** * Save the current context sequence evaluation state to a file. 
- * @see [Saving and restoring a context sequence evaluation state - * ](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) + * @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) */ public async saveStateToFile(filePath: string) { + /* eslint-enable @stylistic/max-len */ this._ensureNotDisposed(); const resolvedPath = path.resolve(process.cwd(), filePath); @@ -1606,14 +1644,14 @@ export class LlamaContextSequence { } } + /* eslint-disable @stylistic/max-len */ /** * Load a context sequence evaluation state from a file. * * Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error. * * You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process. - * @see [Saving and restoring a context sequence evaluation state - * ](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) + * @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) */ public async loadStateFromFile(filePath: string, acceptRisk: { /** @@ -1623,6 +1661,7 @@ export class LlamaContextSequence { */ acceptRisk: true }) { + /* eslint-enable @stylistic/max-len */ if (!acceptRisk.acceptRisk) throw new Error("The `acceptRisk` option must be set to `true` to use this feature"); diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 16d17bce..52a18bf9 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -99,6 +99,22 @@ export type LlamaContextOptions = { */ batching?: BatchingOptions, + /** + * When using SWA (Sliding Window Attention) on a supported model, + * extend the sliding window size to the current context size (meaning practically disabling SWA). + * + * Enabling this option will consume more memory on models that support SWA (Sliding Window Attention), + * but will allow reusing the evaluation cache of any prefix length of the context sequence state + * (instead of just the size of the sliding window when SWA is used). + * + * This option has no effect on models that do not support SWA (Sliding Window Attention). + * + * > **Note:** you can check the SWA size using `model.fileInsights.swaSize`. + * + * Defaults to `false` (inherited from the model option `defaultContextSwaFullCache`); + */ + swaFullCache?: boolean, + /** * Load the provided LoRA adapters onto the context. * LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 0be0bddc..f53ab21a 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -111,6 +111,17 @@ export type LlamaModelOptions = { */ defaultContextFlashAttention?: boolean, + /** + * When using SWA (Sliding Window Attention) on a supported model, + * extend the sliding window size to the current context size (meaning practically disabling SWA) + * by default for contexts created with this model. + * + * See the `swaFullCache` option of the `.createContext()` method for more information. + * + * Defaults to `false`. 
+ */ + defaultContextSwaFullCache?: boolean, + /** * Called with the load percentage when the model is being loaded. * @param loadProgress - a number between 0 (exclusive) and 1 (inclusive). @@ -140,6 +151,7 @@ export type LlamaModelOptions = { const defaultUseMmap = true; const defaultContextFlashAttentionEnabled = false; +const defaultContextSwaFullCache = false; export class LlamaModel { /** @internal */ public readonly _llama: Llama; @@ -157,6 +169,7 @@ export class LlamaModel { /** @internal */ private readonly _llamaPreventDisposalHandle: DisposalPreventionHandle; /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; /** @internal */ private readonly _defaultContextFlashAttention: boolean; + /** @internal */ private readonly _defaultContextSwaFullCache: boolean; /** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private readonly _loraAdapters = new Map(); /** @internal */ private _typeDescription?: ModelTypeDescription; @@ -177,6 +190,7 @@ export class LlamaModel { _fileInsights, _defaultContextFlashAttentionOptionEnabled, _defaultContextFlashAttention, + _defaultContextSwaFullCache, _flashAttentionSupported }: { _llama: Llama, @@ -184,6 +198,7 @@ export class LlamaModel { _fileInsights: GgufInsights, _defaultContextFlashAttentionOptionEnabled: boolean, _defaultContextFlashAttention: boolean, + _defaultContextSwaFullCache: boolean, _flashAttentionSupported: boolean }) { this._llama = _llama; @@ -196,6 +211,7 @@ export class LlamaModel { this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle(); this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled; this._defaultContextFlashAttention = _defaultContextFlashAttention; + this._defaultContextSwaFullCache = _defaultContextSwaFullCache; this._flashAttentionSupported = _flashAttentionSupported; const overridesList = ggufMetadataOverridesToList(metadataOverrides); this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({ @@ -321,6 +337,10 @@ export class LlamaModel { return this._defaultContextFlashAttention; } + public get defaultContextSwaFullCache() { + return this._defaultContextSwaFullCache; + } + /** * Transform text into tokens that can be fed to the model * @param text - the text to tokenize @@ -700,9 +720,11 @@ export class LlamaModel { const resolvedDefaultContextFlashAttention = flashAttentionSupported ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) : false; + const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention, + defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, useMmap }); const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({ @@ -716,7 +738,8 @@ export class LlamaModel { _llama, _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? 
false, _flashAttentionSupported: flashAttentionSupported, - _defaultContextFlashAttention: resolvedDefaultContextFlashAttention + _defaultContextFlashAttention: resolvedDefaultContextFlashAttention, + _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache }); const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks ? null diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 8b0f85e9..eb8330a8 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -15,7 +15,7 @@ export type GgufInsightsResourceRequirements = { export class GgufInsights { /** @internal */ public readonly _llama: Llama; /** @internal */ private readonly _modelSize: number; - /** @internal */ private _totalLayers: number | null = null; + /** @internal */ private _totalFileLayers: number | null = null; /** @internal */ private readonly _ggufFileInfo: GgufFileInfo; /** @internal */ private readonly _configurationResolver: GgufInsightsConfigurationResolver; @@ -71,13 +71,8 @@ export class GgufInsights { } public get totalLayers() { - if (this._totalLayers != null) - return this._totalLayers; - const outputLayers = 1; - this._totalLayers = this._getFileLayers() + outputLayers; - - return this._totalLayers; + return this._getTotalFileLayers() + outputLayers; } public get modelSize() { @@ -133,6 +128,23 @@ export class GgufInsights { return false; } + /** + * The size of the SWA (Sliding Window Attention). + * + * When `undefined`, the model does not use sliding window attention. + */ + public get swaSize() { + const slidingWindow = this._ggufFileInfo?.architectureMetadata?.attention?.sliding_window; + if (slidingWindow == null || slidingWindow <= 0) + return undefined; + + const trainContextSize = this.trainContextSize; + if (trainContextSize != null && slidingWindow >= trainContextSize) + return undefined; + + return slidingWindow; + } + public estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }: { @@ -152,72 +164,72 @@ export class GgufInsights { * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now. */ public estimateContextResourceRequirements({ - contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, + swaFullCache = false }: { contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, - flashAttention?: boolean, includeGraphOverhead?: boolean + flashAttention?: boolean, includeGraphOverhead?: boolean, swaFullCache?: boolean }): GgufInsightsResourceRequirements { if (sequences == null) sequences = getDefaultContextSequences(); if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); - const actualContextSize = contextSize * sequences; - - const totalLayers = this.totalLayers; - const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalLayers, totalLayers)); - const finalCpuLayers = totalLayers - finalGpuLayers; const llmData = this._ggufFileInfo.architectureMetadata; + const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; + const slidingWindow = this.swaSize ?? 
0; + const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize && + (this.trainContextSize == null || slidingWindow < this.trainContextSize); + const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture); + const nonSwaPercent = swaPattern <= 1 + ? 1 + : (1 / (swaPattern + (flashAttention ? -0.5 : -1))); + + // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp` + const kvCachePadding = flashAttention + ? 256 + : 32; + const actualContextSize = sequences * contextSize; + const kvSize = usingSWA + ? ( + (1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) + + nonSwaPercent * actualContextSize + ) + : actualContextSize; + + const totalFileLayers = this._getTotalFileLayers(); + const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalFileLayers, totalFileLayers)); + const finalCpuLayers = totalFileLayers - finalGpuLayers; + const usingGpu = finalGpuLayers !== 0; const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0; - const logitsSize = vocabularySize * batchSize; - const embedSize = isEmbeddingContext - ? (llmData.embedding_length ?? 0) * batchSize - : 0; + const embeddingSize = llmData.embedding_length ?? 0; const sizeTBytes = 8; // sizeof(size_t) const floatBytes = 4; // sizeof(float) const uint32TBytes = 4; // sizeof(uint32_t) const int32TBytes = 4; // sizeof(int32_t) - // source: `llama_state_get_size` in `llama.cpp` - const sRngSize = sizeTBytes; - const sRng = 64 * 1024; // LLAMA_MAX_RNG_STATE - const sNOutputs = sizeTBytes; - const sNOutputPos = batchSize * int32TBytes; - const sLogitsSize = sizeTBytes; - const sLogits = logitsSize * floatBytes; - const sEmbeddingSize = sizeTBytes; - const sEmbedding = embedSize * floatBytes; - const sKvBufSize = sizeTBytes; - const sKvHead = uint32TBytes; - const sKvSize = uint32TBytes; - const sKvUsed = uint32TBytes; - const sKv = 2 * int32TBytes * modelGpuLayers * this._llama._consts.ggmlTensorOverhead; - const sKvCell = this._llama._consts.llamaPosSize + sizeTBytes + this._llama._consts.llamaSeqIdSize; - const kvSelfLength = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba - ? Math.max(1, sequences) - : actualContextSize; - const sKvCells = kvSelfLength * sKvCell; - - const overheadMemory = ( - sRngSize + - sRng + - sNOutputs + - sNOutputPos + - sLogitsSize + - sLogits + - sEmbeddingSize + - sEmbedding + - sKvBufSize + - sKvHead + - sKvSize + - sKvUsed + - sKv + - sKvCells - ); + const estimateOutput = (nOutputs: number) => { + // source: `llama_context::output_reserve` in `llama-context.cpp` + const nOutputsMax = Math.max(batchSize, nOutputs); + + const isT5 = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.t5; + const hasLogits = isT5 || !isEmbeddingContext; + const hasEmbd = isT5 || isEmbeddingContext; + + const logitsSize = hasLogits + ? (vocabularySize * nOutputsMax) + : 0; + const embdSize = hasEmbd + ? (embeddingSize * nOutputsMax) + : 0; + const outputBufferSize = (logitsSize + embdSize) * floatBytes; + + const outputIdsArr = int32TBytes * batchSize; + + return outputBufferSize + outputIdsArr; + }; - // Estimates the memory allocated by `ggml_backend_sched_reserve` in `llama_new_context_with_model` in `llama.cpp`. 
- // If you read this line and have better insights on how to estimate this memory, please open a PR to improve it :) - const estimateGraphOverheadMemory = () => { + const estimateGraphOverheadMemory = (): number => { const s1MB = Math.pow(1024, 2); const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; @@ -234,23 +246,23 @@ export class GgufInsights { if (expertCount > 0) { const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2; - return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (actualContextSize * headCount)); + return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount)); } - return int32TBytes * batchSize * (embeddingLength + (actualContextSize * headCount)); + return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount)); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) { if (modelGpuLayers === this.totalLayers) { defaultCalculationAdjustment -= (s1MB * 340) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ); } else { defaultCalculationAdjustment -= (s1MB * 250) + ( (s1MB * 50) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ) ); } @@ -263,7 +275,7 @@ export class GgufInsights { (s1MB * 270) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ) ); } else { @@ -271,21 +283,21 @@ export class GgufInsights { (s1MB * 150) * ( this.trainContextSize == null ? 1 - : Math.max(0, (1 - (actualContextSize / this.trainContextSize))) + : Math.max(0, (1 - (kvSize / this.trainContextSize))) ) ); } } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) { const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0; - return (int32TBytes * batchSize * actualContextSize * headCount) - (50 * s1MB); + return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB); // if (modelGpuLayers === this.totalLayers) { // defaultCalculationAdjustment += -(s1MB * 20) + ( // (s1MB * 250) * ( // this.trainContextSize == null // ? 1 - // : actualContextSize / this.trainContextSize + // : kvSize / this.trainContextSize // ) // ); // } else { @@ -293,7 +305,7 @@ export class GgufInsights { // (s1MB * 300) * ( // this.trainContextSize == null // ? 1 - // : actualContextSize / this.trainContextSize + // : kvSize / this.trainContextSize // ) // ); // } @@ -312,37 +324,49 @@ export class GgufInsights { if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) { // magic numbers for estimation. will be improved in the future - return (totalElements * 123 * (actualContextSize / 4096)) + defaultCalculationAdjustment; + return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment; } // magic numbers for estimation. will be improved in the future - return (totalElements * 77.655 * (actualContextSize / 4096)) + defaultCalculationAdjustment; + return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment; }; + const gpuKVCacheSize = usingGpu + ? this._estimateKvMemorySizeInBytes( + kvSize, + finalGpuLayers < totalFileLayers + ? 
(finalGpuLayers + 1) + : finalGpuLayers + ) + : 0; + const cpuKVCacheSize = this._estimateKvMemorySizeInBytes(kvSize, finalCpuLayers); + + // source: `llama_context::graph_max_nodes` in `llama-context.cpp` + const maxNodes = Math.max(65536, 5 * tensorInfo.length); + const cpuNodes = 5 * (tensorInfo.length * (finalCpuLayers / totalFileLayers)); + const gpuNodes = maxNodes - cpuNodes; + + const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) + + this._llama._bindings.getGgmlGraphOverheadCustom(gpuNodes, false); + const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) + + this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false); + const graphOverheadMemory = (flashAttention || !includeGraphOverhead) ? 0 : estimateGraphOverheadMemory(); + const graphOverheadGpuSize = usingGpu + ? Math.round(graphOverheadMemory * (finalGpuLayers / totalFileLayers)) + : 0; + const graphOverheadCpuSize = graphOverheadMemory - graphOverheadGpuSize; - const usingGpu = finalGpuLayers !== 0; + const outputBufferSize = estimateOutput(sequences); - const cpuRam = ( - !usingGpu - ? (overheadMemory + graphOverheadMemory) + const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + ( + usingGpu + ? outputBufferSize : 0 - ) + - this._estimateKvMemorySizeInBytes(actualContextSize, finalCpuLayers); - const gpuVram = usingGpu - ? ( - overheadMemory + - graphOverheadMemory + - this._estimateKvMemorySizeInBytes( - actualContextSize, - finalGpuLayers < totalLayers - ? (finalGpuLayers + 1) - : finalGpuLayers - ) - ) - : 0; + ); + const cpuRam = cpuKVCacheSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize; return { cpuRam, @@ -449,7 +473,7 @@ export class GgufInsights { } /** @internal */ - public _estimateKvMemorySizeInBytes(contextSize: number, layers: number) { + public _estimateKvMemorySizeInBytes(kvSize: number, layers: number) { // source: `llama_kv_cache_init` in `llama.cpp` const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0; const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 0; @@ -483,8 +507,8 @@ export class GgufInsights { const totalNEmbdKGqa = nEmbdKGqa + modelNEmbdKS; const totalNEmbdVGqa = nEmbdVGqa + modelNEmbdVS; - totalElementsK += totalNEmbdKGqa * contextSize; - totalElementsV += totalNEmbdVGqa * contextSize; + totalElementsK += totalNEmbdKGqa * kvSize; + totalElementsV += totalNEmbdVGqa * kvSize; } const keyTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba @@ -504,6 +528,16 @@ export class GgufInsights { ); } + /** @internal */ + private _getTotalFileLayers() { + if (this._totalFileLayers != null) + return this._totalFileLayers; + + this._totalFileLayers = this._getFileLayers(); + + return this._totalFileLayers; + } + /** * @param ggufFileInfo * @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance. 
@@ -718,3 +752,25 @@ function isTokenEmbedLayer(layerName: string) { return firstPart === "token_embd"; } + +function ggmlPad(value: number, padding: number): number { + return ((value + padding - 1) & ~(padding - 1)); +} + +function getSwaPatternForArchitecture(architecture?: GgufArchitectureType): number { + // source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern` + switch (architecture) { + case GgufArchitectureType.llama4: + return 4; + case GgufArchitectureType.phi3: + return 1; + case GgufArchitectureType.gemma2: + return 2; + case GgufArchitectureType.gemma3: + return 6; + case GgufArchitectureType.cohere2: + return 4; + } + + return 1; +} diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index 05595c98..cbae45d5 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -39,12 +39,14 @@ export class GgufInsightsConfigurationResolver { targetContextSize, embeddingContext = false, flashAttention = false, + swaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { targetGpuLayers?: number | "max", targetContextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, useMmap?: boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), @@ -63,6 +65,7 @@ export class GgufInsightsConfigurationResolver { } = {}) { const compatibilityScore = await this.scoreModelConfigurationCompatibility({ flashAttention, + swaFullCache, contextSize: targetContextSize, embeddingContext, forceGpuLayers: targetGpuLayers, @@ -105,6 +108,7 @@ export class GgufInsightsConfigurationResolver { contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), embeddingContext = false, flashAttention = false, + swaFullCache = false, maximumFittedContextSizeMultiplier = 100, maximumUnfitConfigurationResourceMultiplier = 100, forceStrictContextSize = false, @@ -114,6 +118,7 @@ export class GgufInsightsConfigurationResolver { contextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, maximumFittedContextSizeMultiplier?: number, maximumUnfitConfigurationResourceMultiplier?: number, @@ -209,6 +214,7 @@ export class GgufInsightsConfigurationResolver { llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: forceGpuLayers != null, useMmap } @@ -263,7 +269,8 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers: resolvedGpuLayers, modelTrainContextSize: this._ggufInsights.trainContextSize ?? 
defaultTrainContextSizeForEstimationPurposes, ignoreMemorySafetyChecks: forceStrictContextSize, - flashAttention + flashAttention, + swaFullCache }); contextFitsMemory = true; } catch (err) { @@ -275,7 +282,8 @@ export class GgufInsightsConfigurationResolver { contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, modelGpuLayers: resolvedGpuLayers, - flashAttention + flashAttention, + swaFullCache }); const rankPoints = { @@ -371,11 +379,12 @@ export class GgufInsightsConfigurationResolver { llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, defaultContextFlashAttention = false, + defaultContextSwaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean, - useMmap?: boolean + defaultContextSwaFullCache?: boolean, useMmap?: boolean } = {}) { return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, @@ -385,6 +394,7 @@ export class GgufInsightsConfigurationResolver { llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); } @@ -399,6 +409,7 @@ export class GgufInsightsConfigurationResolver { batchSize, modelTrainContextSize, flashAttention = false, + swaFullCache = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), @@ -410,6 +421,7 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers: number, modelTrainContextSize: number, flashAttention?: boolean, + swaFullCache?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, @@ -427,6 +439,7 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers, modelTrainContextSize, flashAttention, + swaFullCache, getVramState, getRamState, getSwapState, diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index f800f712..49ace603 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -9,7 +9,7 @@ import {getRamUsageFromUnifiedVram} from "./getRamUsageFromUnifiedVram.js"; const defaultMaxContextSizeSwapUse = 2048; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, swaFullCache, getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, maxContextSizeSwapUse = defaultMaxContextSizeSwapUse }: { @@ -20,6 +20,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: number, modelTrainContextSize: number, flashAttention: boolean, + swaFullCache: boolean, getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState(): Promise<{total: number, free: number}>, getSwapState(): Promise<{total: number, free: 
number}>, @@ -52,6 +53,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); @@ -97,6 +99,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); @@ -145,6 +148,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 1edae352..62d58141 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -11,11 +11,11 @@ const fitContextExtraMemoryPaddingPercentage = 0.5; export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, useMmap + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap }: { ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, useMmap?: boolean + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean }): Promise { if (gpuLayers == null) gpuLayers = "auto"; @@ -37,6 +37,7 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ggufInsights, currentVram: vramState.free, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -73,6 +74,7 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ? 
gpuLayers.max : undefined, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -95,6 +97,7 @@ function getBestGpuLayersForFreeVram({ minGpuLayers, maxGpuLayers, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }: { ggufInsights: GgufInsights, @@ -103,6 +106,7 @@ function getBestGpuLayersForFreeVram({ minGpuLayers?: number, maxGpuLayers?: number, defaultContextFlashAttention: boolean, + defaultContextSwaFullCache: boolean, useMmap?: boolean }) { return findBestOption({ @@ -123,6 +127,7 @@ function getBestGpuLayersForFreeVram({ currentVram: freeVram, fitContext, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -182,10 +187,10 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer } function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, useMmap + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, defaultContextSwaFullCache = false, useMmap }: { gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, - defaultContextFlashAttention: boolean, useMmap?: boolean + defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean }) { const modelVram = ggufInsights.estimateModelResourceRequirements({ gpuLayers, @@ -202,7 +207,8 @@ function getVramRequiredForGpuLayers({ modelGpuLayers: gpuLayers, sequences: 1, isEmbeddingContext: fitContext.embeddingContext ?? false, - flashAttention: defaultContextFlashAttention + flashAttention: defaultContextFlashAttention, + swaFullCache: defaultContextSwaFullCache }).gpuVram; const totalVram = modelVram + contextVram; @@ -221,7 +227,8 @@ function getVramRequiredForGpuLayers({ ggufInsights, vram: currentVram - modelVram, isEmbeddingContext: fitContext?.embeddingContext ?? 
false, - flashAttention: defaultContextFlashAttention + flashAttention: defaultContextFlashAttention, + swaFullCache: defaultContextSwaFullCache }); if (maxContext == null || modelVram + maxContext.vram > currentVram) @@ -234,8 +241,8 @@ function getVramRequiredForGpuLayers({ }; } -function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention}: { - gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean +function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, swaFullCache}: { + gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, swaFullCache: boolean }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -250,7 +257,8 @@ function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmb modelGpuLayers: gpuLayers, sequences: 1, isEmbeddingContext, - flashAttention + flashAttention, + swaFullCache }).gpuVram; if (contextVram <= vram) diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 827493fc..c665b958 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -316,6 +316,7 @@ export type GgufMetadataDefaultArchitectureType = { readonly layer_norm_rms_epsilon?: number, readonly key_length?: number, readonly value_length?: number, + readonly sliding_window?: number, readonly causal?: boolean }, From 2e4877ad8880e96823d263210fc24f7b0c048666 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 02:12:13 +0300 Subject: [PATCH 02/14] fix: bugs and types --- src/cli/commands/DebugCommand.ts | 1 + src/cli/utils/interactivelyAskForModel.ts | 2 +- src/gguf/types/GgufMetadataTypes.ts | 4 ++-- src/gguf/types/GgufTensorInfoTypes.ts | 14 ++++++++++++-- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/cli/commands/DebugCommand.ts b/src/cli/commands/DebugCommand.ts index 149de90d..d2ee7117 100644 --- a/src/cli/commands/DebugCommand.ts +++ b/src/cli/commands/DebugCommand.ts @@ -65,5 +65,6 @@ async function DebugCmakeOptionsFunction() { console.info(); console.info(`${chalk.yellow("CMake options:")} ${prettyPrintObject(llama.cmakeOptions)}`); + console.info(`${chalk.yellow("Release:")} ${prettyPrintObject(llama.llamaCppRelease)}`); } diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index a896a5ce..8238daec 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -122,7 +122,7 @@ export async function interactivelyAskForModel({ const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, - swaFullCache: swaFullCache, + swaFullCache, useMmap }); diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index c665b958..5f8a48e1 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -135,8 +135,8 @@ export enum GgufFileType { MOSTLY_Q4_0_4_4 = 33, // deprecated MOSTLY_Q4_0_4_8 = 34, // deprecated MOSTLY_Q4_0_8_8 = 35, // deprecated - MOSTLY_TQ1_0 = 36, // deprecated - MOSTLY_TQ2_0 = 37 // deprecated + MOSTLY_TQ1_0 = 36, + MOSTLY_TQ2_0 = 37 } diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index 
28ae45c3..b23bf1f8 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -6,7 +6,7 @@ export type GgufTensorInfo = { /** * Adjusted offset relative to the file. - * + * * Added by the GGUF parser - not part of the file's metadata. */ readonly fileOffset: number | bigint, @@ -49,5 +49,15 @@ export const enum GgmlType { I16 = 25, I32 = 26, I64 = 27, - F64 = 28 + F64 = 28, + IQ1_M = 29, + BF16 = 30, + Q4_0_4_4 = 31, + Q4_0_4_8 = 32, + Q4_0_8_8 = 33, + TQ1_0 = 34, + TQ2_0 = 35, + IQ4_NL_4_4 = 36, + IQ4_NL_4_8 = 37, + IQ4_NL_8_8 = 38, } From 25f016df7dde0787cdd652687aae9ccad3b51d66 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 03:24:43 +0300 Subject: [PATCH 03/14] feat: thought budget, improve prompt completion --- docs/guide/chat-session.md | 55 ++++++ ...entSettingsFromTokenizerAndChatTemplate.ts | 3 +- src/cli/commands/ChatCommand.ts | 20 ++- src/evaluator/LlamaChat/LlamaChat.ts | 166 +++++++++++++++--- .../LlamaChatSession/LlamaChatSession.ts | 22 ++- .../llama3.2/promptCompletion.test.ts | 98 +++++++++++ .../qwen3-0.6b/thinkingBudget.test.ts | 95 ++++++++++ test/utils/modelFiles.ts | 3 +- 8 files changed, 431 insertions(+), 31 deletions(-) create mode 100644 test/modelDependent/llama3.2/promptCompletion.test.ts create mode 100644 test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts diff --git a/docs/guide/chat-session.md b/docs/guide/chat-session.md index 992a6487..ec9c8541 100644 --- a/docs/guide/chat-session.md +++ b/docs/guide/chat-session.md @@ -898,3 +898,58 @@ const fullResponse = a1.response console.log("Full response: " + fullResponse); ``` + +## Set Thinking Budget {#thinking-budget} +You can set a thinking budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments). 
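For example, setting the budget to `0` disables thought output entirely. Below is a minimal sketch of that case (same setup as the full walkthrough that follows, which also streams the response and counts thought tokens):

```typescript
import {getLlama, LlamaChatSession, resolveModelFile} from "node-llama-cpp";

const modelPath = await resolveModelFile("hf:Qwen/Qwen3-14B-GGUF:Q4_K_M");

const llama = await getLlama();
const model = await llama.loadModel({modelPath});
const context = await model.createContext();
const session = new LlamaChatSession({
    contextSequence: context.getSequence()
});

const answer = await session.prompt("Where do llamas come from?", {
    budgets: {
        // a budget of 0 means no tokens may be spent on thought segments
        thoughtTokens: 0
    }
});
console.log("AI: " + answer);
```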
+```typescript +import { + getLlama, LlamaChatSession, resolveModelFile, Token +} from "node-llama-cpp"; + +const modelPath = await resolveModelFile("hf:Qwen/Qwen3-14B-GGUF:Q4_K_M"); + +const llama = await getLlama(); +const model = await llama.loadModel({modelPath}); +const context = await model.createContext(); +const session = new LlamaChatSession({ + contextSequence: context.getSequence() +}); + + +const q1 = "Where do llamas come from?"; +console.log("User: " + q1); + +const maxThoughtTokens = 100; + +let responseTokens = 0; +let thoughtTokens = 0; + +process.stdout.write("AI: "); +const response = await session.prompt(q1, { + budgets: { + thoughtTokens: maxThoughtTokens + }, + onResponseChunk(chunk) { + const isThoughtSegment = chunk.type === "segment" && + chunk.segmentType === "thought"; + + if (chunk.type === "segment" && chunk.segmentStartTime != null) + process.stdout.write(` [segment start: ${chunk.segmentType}] `); + + process.stdout.write(chunk.text); + + if (chunk.type === "segment" && chunk.segmentEndTime != null) + process.stdout.write(` [segment end: ${chunk.segmentType}] `); + + if (isThoughtSegment) + thoughtTokens += chunk.tokens.length; + else + responseTokens += chunk.tokens.length; + } +}); + +console.log("Response: " + response); + +console.log("Response tokens: " + responseTokens); +console.log("Thought tokens: " + thoughtTokens); +``` diff --git a/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts b/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts index 57fc4ceb..30f434a0 100644 --- a/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts +++ b/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts @@ -41,7 +41,8 @@ export function extractSegmentSettingsFromTokenizerAndChatTemplate( return removeUndefinedFields({ thought: tryMatchPrefixSuffixPair([ ["", ""], // DeepSeek, QwQ - ["", ""] // EXAONE Deep + ["", ""], // EXAONE Deep + ["<|START_THINKING|>", "<|END_THINKING|>"] // Command R7B ]) }); } diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 26e85cdd..d1ebc8e1 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -62,6 +62,7 @@ type ChatCommand = { repeatFrequencyPenalty?: number, repeatPresencePenalty?: number, maxTokens: number, + thoughtBudget?: number, noHistory: boolean, environmentFunctions: boolean, tokenPredictionDraftModel?: string, @@ -262,6 +263,13 @@ export const ChatCommand: CommandModule = { default: 0, description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size" }) + .option("thoughtBudget", { + alias: ["tb", "thinkingBudget", "reasoningBudget"], + type: "number", + default: -1, + defaultDescription: "Unlimited", + description: "Maximum number of tokens the model can use for thoughts. 
Set to `0` to disable reasoning" + }) .option("noHistory", { alias: "nh", type: "boolean", @@ -318,7 +326,7 @@ export const ChatCommand: CommandModule = { promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }) { try { @@ -327,8 +335,8 @@ export const ChatCommand: CommandModule = { batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, - timing, noMmap, printTimings + maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, + debug, meter, timing, noMmap, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -344,11 +352,12 @@ async function RunChat({ contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; + if (thoughtBudget === -1) thoughtBudget = undefined; const headers = resolveHeaderFlag(headerArg); const trimWhitespace = !noTrimWhitespace; @@ -686,6 +695,9 @@ async function RunChat({ seed: seed ?? undefined, signal: abortController.signal, stopOnAbortSignal: true, + budgets: { + thoughtTokens: thoughtBudget + }, repeatPenalty: { penalty: repeatPenalty, frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined, diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index da15b1c0..77a171d9 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -294,7 +294,26 @@ export type LLamaChatGenerateResponseOptions void + onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void, + + /** + * Set the maximum number of tokens the model is allowed to spend on various segmented responses. + */ + budgets?: { + /** + * Whether to include the tokens already consumed by the current model response being completed in the budget. + * + * Defaults to `true`. + */ + includeCurrentResponse?: boolean, + + /** + * Budget for thought tokens. + * + * Defaults to `Infinity`. 
+ */ + thoughtTokens?: number + } } & ({ grammar?: LlamaGrammar, functions?: never, @@ -515,6 +534,7 @@ export class LlamaChat { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -552,6 +572,7 @@ export class LlamaChat { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal, maxTokens, @@ -595,6 +616,7 @@ export class LlamaChat { ); }; const loadContextWindowForFunctionCallingLoop = async () => loadContextWindow(true); + const loadContextWindowForBudgetTriggers = async () => loadContextWindow(false); while (true) { generateResponseState.startTokenLoop(); @@ -657,6 +679,15 @@ export class LlamaChat { if (maxTokensTriggerRes != null) return maxTokensTriggerRes; + if (generateResponseState.updateShouldContextShift()) + break; + + if (await generateResponseState.handleBudgetTriggers()) { + await loadContextWindowForBudgetTriggers(); + await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); + await generateResponseState.createNewEvaluationIterator(); + } + if (generateResponseState.updateShouldContextShift()) break; @@ -797,6 +828,17 @@ export class LlamaChat { StopGenerationDetector.resolveLlamaTextTrigger(userTextSuffix, this.model.tokenizer) ); + allSegmentTypes + .map((segmentType) => getChatWrapperSegmentDefinition(this._chatWrapper.settings, segmentType)) + .filter((segmentDefinition) => segmentDefinition != null) + .flatMap((segmentDefinition) => [segmentDefinition?.prefix, segmentDefinition?.suffix]) + .filter((trigger) => trigger != null) + .forEach((trigger) => ( + generateResponseState.stopGenerationDetector.addStopTrigger( + StopGenerationDetector.resolveLlamaTextTrigger(LlamaText(trigger), this.model.tokenizer) + ) + )); + await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); if (generateResponseState.maxTokens === 0) { @@ -827,7 +869,15 @@ export class LlamaChat { generateResponseState.popStreamRegulatorFreeTokens(); - const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger("user"); + const someOfCurrentTokensAreSpecial = generateResponseState.currentTokens.some((token) => ( + this.model.isSpecialToken(token) + )); + const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger( + "user", + someOfCurrentTokensAreSpecial + ? 
"eogToken" + : undefined + ); if (stopGenerationTriggerRes != null) return { completion: stopGenerationTriggerRes.response, @@ -1251,10 +1301,9 @@ function generateContextTextThatEndsWithUserText( ...options, chatHistory: setLastUserTextInChatHistory(options.chatHistory, lastUserText + randomId) }); - let newContextText = contextText; - for (let i = 0; i < newContextText.values.length; i++) { - const item = newContextText.values[i]; + for (let i = 0; i < contextText.values.length; i++) { + const item = contextText.values[i]; if (typeof item !== "string") continue; @@ -1263,15 +1312,14 @@ function generateContextTextThatEndsWithUserText( continue; const newValue = item.slice(0, randomTextIndex); - newContextText = LlamaText([ - ...newContextText.values.slice(0, i), - newValue - ]); return { - contextText: newContextText, + contextText: LlamaText([ + ...contextText.values.slice(0, i), + newValue + ]), userTextSuffix: LlamaText([ item.slice(randomTextIndex + randomId.length), - ...newContextText.values.slice(i + 1) + ...contextText.values.slice(i + 1) ]), ...rest }; @@ -1485,6 +1533,7 @@ class GenerateResponseState["onToken"]; private readonly onResponseChunk: LLamaChatGenerateResponseOptions["onResponseChunk"]; private readonly onFunctionCallParamsChunk: LLamaChatGenerateResponseOptions["onFunctionCallParamsChunk"]; + private readonly budgets: LLamaChatGenerateResponseOptions["budgets"]; private readonly signal: LLamaChatGenerateResponseOptions["signal"]; private readonly stopOnAbortSignal: LLamaChatGenerateResponseOptions["stopOnAbortSignal"]; public readonly maxTokens: LLamaChatGenerateResponseOptions["maxTokens"]; @@ -1584,6 +1633,7 @@ class GenerateResponseState budget != null && budget !== Infinity; + + const hasBudgetTriggers = this.budgets != null && hasBudget(this.budgets.thoughtTokens); + if (!hasBudgetTriggers) + return shouldReloadEvaluationState; + + if (hasBudget(this.budgets.thoughtTokens) && this.segmentHandler.isSegmentTypeOpen("thought")) { + const usedThoughtTokens = this.segmentHandler.getSegmentTokensCount("thought"); + if (usedThoughtTokens >= this.budgets.thoughtTokens) { + this.segmentHandler.closeSegment("thought"); + shouldReloadEvaluationState = true; + } + } + + return shouldReloadEvaluationState; + } + public updateShouldContextShift() { this.shouldContextShift = this.llamaChat.sequence.nextTokenIndex >= this.llamaChat.context.contextSize - 1; return this.shouldContextShift; @@ -2946,6 +3019,7 @@ class SegmentHandler[] = []; private readonly _segmentsStartTokenTrail: Token[] = []; + private readonly _segmentTokenCounts: Map; private readonly _contextWindowSegments: RawSegment[] = []; private readonly _contextWindowStartTokenTrail: Token[] = []; private readonly _initialTokensTrail: Token[]; @@ -2958,7 +3032,7 @@ class SegmentHandler, closeAllSegments?: string | LlamaText, initialSegmentStack: S[], + initialTokenCounts: Map, previousTokens: Token[] }) { this.model = model; @@ -2990,6 +3065,7 @@ class SegmentHandler(); + + for (const item of modelResponse) { + if (typeof item === "string") { + segmentTokenCounts.set( + undefined, + (segmentTokenCounts.get(undefined) ?? 0) + tokenizer(item, false, "trimLeadingSpace").length + ); + continue; + } else if (isChatModelResponseFunctionCall(item)) + continue; + + void (item.type satisfies "segment"); + + segmentTokenCounts.set( + item.segmentType, + (segmentTokenCounts.get(item.segmentType) ?? 
0) + tokenizer(item.text, false, "trimLeadingSpace").length + ); + } + + return segmentTokenCounts; + } } diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts index cb64518d..f0a0ba77 100644 --- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts +++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts @@ -209,7 +209,19 @@ export type LLamaChatPromptOptions void + onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void, + + /** + * Set the maximum number of tokens that the model is allowed to spend on various segmented responses. + */ + budgets?: { + /** + * Budget for thought tokens. + * + * Defaults to `Infinity`. + */ + thoughtTokens?: number + } } & ({ grammar?: LlamaGrammar, functions?: never, @@ -445,6 +457,7 @@ export class LlamaChatSession { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -469,7 +482,7 @@ export class LlamaChatSession { maxParallelFunctionCalls: maxParallelFunctionCalls as undefined, onFunctionCallParamsChunk: onFunctionCallParamsChunk as undefined, - onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal, maxTokens, + onTextChunk, onToken, onResponseChunk, budgets, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers }); @@ -489,6 +502,7 @@ export class LlamaChatSession { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -589,6 +603,10 @@ export class LlamaChatSession { paramsChunk: chunk.paramsChunk, done: chunk.done })), + budgets: { + includeCurrentResponse: true, + thoughtTokens: budgets?.thoughtTokens + }, signal: abortController.signal, stopOnAbortSignal, repeatPenalty, diff --git a/test/modelDependent/llama3.2/promptCompletion.test.ts b/test/modelDependent/llama3.2/promptCompletion.test.ts new file mode 100644 index 00000000..d667f31c --- /dev/null +++ b/test/modelDependent/llama3.2/promptCompletion.test.ts @@ -0,0 +1,98 @@ +import {describe, expect, test} from "vitest"; +import {LlamaChatSession} from "../../../src/index.js"; +import {getModelFile} from "../../utils/modelFiles.js"; +import {getTestLlama} from "../../utils/getTestLlama.js"; +import {LlamaText} from "../../../src/utils/LlamaText.js"; + +describe("llama 3.2", () => { + describe("prompt completion", () => { + test("prompt completion isn't kept in the next evaluation", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Llama-3.2-3B-Instruct.Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 4096 + }); + const context2 = await model.createContext({ + contextSize: 4096 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + const chatSession2 = new LlamaChatSession({ + contextSequence: context2.getSequence() + }); + + const promptCompletion = await chatSession.completePrompt("Hi there!", { + maxTokens: 50 + }); + expect(promptCompletion).toMatchInlineSnapshot("\" I're looking for a new phone case. 
I want one that is waterproof and has a good camera.\""); + expect(LlamaText.fromTokens(model.tokenizer, chatSession.sequence.contextTokens)).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|start_header_id|>"), + "system", + new SpecialTokensText("<|end_header_id|>"), + " + + Cutting Knowledge Date: December 2023", + new SpecialToken("NL"), + "Today Date: 29 May 2025 + + You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "user", + new SpecialTokensText("<|end_header_id|>"), + " + + Hi there! I're looking for a new phone case. I want one that is waterproof and has a good camera.", + ]) + `); + + const res = await chatSession.prompt("Hi there!", { + maxTokens: 50 + }); + expect(res).toMatchInlineSnapshot("\"Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat for a bit?\""); + expect(LlamaText.fromTokens(model.tokenizer, chatSession.sequence.contextTokens)).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|start_header_id|>"), + "system", + new SpecialTokensText("<|end_header_id|>"), + " + + Cutting Knowledge Date: December 2023", + new SpecialToken("NL"), + "Today Date: 29 May 2025 + + You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "user", + new SpecialTokensText("<|end_header_id|>"), + " + + Hi there!", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "assistant", + new SpecialTokensText("<|end_header_id|>"), + " + + Hello! It's nice to meet you. 
Is there something I can help you with, or would you like to chat for a bit?", + ]) + `); + + const res2 = await chatSession2.prompt("Hi there!", { + maxTokens: 50 + }); + expect(res2).to.eql(res); + }); + }); +}); diff --git a/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts b/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts new file mode 100644 index 00000000..35522794 --- /dev/null +++ b/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts @@ -0,0 +1,95 @@ +import {describe, expect, test} from "vitest"; +import {LlamaChatSession, isChatModelResponseSegment} from "../../../src/index.js"; +import {getModelFile} from "../../utils/modelFiles.js"; +import {getTestLlama} from "../../utils/getTestLlama.js"; + +describe("qwen3 0.6b", () => { + describe("thinking budget", () => { + test("doesn't exceed thinking budget", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Qwen3-0.6B-Q8_0.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 512 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + const initialChatHistory = chatSession.getChatHistory(); + + async function promptWithBudget({ + prompt, maxTokens, thinkingBudget + }: { + prompt: string, maxTokens: number, thinkingBudget?: number + }) { + let thoughtTokens = 0; + let totalTokens = 0; + + chatSession.setChatHistory(initialChatHistory); + const {responseText, response} = await chatSession.promptWithMeta(prompt, { + maxTokens, + budgets: { + thoughtTokens: thinkingBudget + }, + onResponseChunk(chunk) { + if (chunk.type === "segment" && chunk.segmentType === "thought") { + thoughtTokens += chunk.tokens.length; + } + + totalTokens += chunk.tokens.length; + } + }); + + return { + thoughtTokens, + totalTokens, + responseText, + thoughts: response + .filter((item) => isChatModelResponseSegment(item)) + .filter((item) => item.segmentType === "thought") + .map((item) => item.text) + }; + } + + const res1 = await promptWithBudget({ + prompt: "Where do llamas come from?", + thinkingBudget: 10, + maxTokens: 20 + }); + expect(res1.thoughtTokens).to.be.gt(1); + expect(res1.thoughtTokens).to.be.lte(10); + expect(res1.totalTokens).to.be.gte(16); + expect(res1.totalTokens).to.be.lte(20); + + const res2 = await promptWithBudget({ + prompt: "Where do llamas come from?", + thinkingBudget: 0, + maxTokens: 20 + }); + expect(res2.thoughtTokens).to.be.eq(0); + expect(res2.totalTokens).to.be.gte(16); + expect(res2.totalTokens).to.be.lte(20); + + const res3 = await promptWithBudget({ + prompt: "Where do llamas come from?", + thinkingBudget: 20, + maxTokens: 20 + }); + expect(res3.thoughtTokens).to.be.eq(res3.totalTokens); + expect(res3.totalTokens).to.be.gte(16); + expect(res3.totalTokens).to.be.lte(20); + + const res4 = await promptWithBudget({ + prompt: "Where do llamas come from?", + maxTokens: 20 + }); + expect(res4.thoughtTokens).to.be.eq(res4.totalTokens); + expect(res4.totalTokens).to.be.gte(16); + expect(res4.totalTokens).to.be.lte(20); + }); + }); +}); diff --git a/test/utils/modelFiles.ts b/test/utils/modelFiles.ts index bcc6a6c0..fa307dc6 100644 --- a/test/utils/modelFiles.ts +++ b/test/utils/modelFiles.ts @@ -20,7 +20,8 @@ const supportedModels = { "codegemma-2b-Q4_K_M.gguf": "https://huggingface.co/bartowski/codegemma-2b-GGUF/resolve/main/codegemma-2b-Q4_K_M.gguf?download=true", "Llama-3.2-3B-Instruct.Q4_K_M.gguf": 
"https://huggingface.co/mradermacher/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct.Q4_K_M.gguf?download=true", "nomic-embed-text-v1.5.Q4_K_M.gguf": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q4_K_M.gguf?download=true", - "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true" + "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true", + "Qwen3-0.6B-Q8_0.gguf": "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true" } as const; export async function getModelFile(modelName: keyof typeof supportedModels) { From d83f1778334215dbd54e4f5b4adf73ff07394ead Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 03:25:46 +0300 Subject: [PATCH 04/14] style: lint --- src/gguf/insights/GgufInsights.ts | 2 -- src/gguf/types/GgufTensorInfoTypes.ts | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index eb8330a8..5b01dd22 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -203,9 +203,7 @@ export class GgufInsights { const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0; const embeddingSize = llmData.embedding_length ?? 0; - const sizeTBytes = 8; // sizeof(size_t) const floatBytes = 4; // sizeof(float) - const uint32TBytes = 4; // sizeof(uint32_t) const int32TBytes = 4; // sizeof(int32_t) const estimateOutput = (nOutputs: number) => { diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index b23bf1f8..8b7f615a 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -59,5 +59,5 @@ export const enum GgmlType { TQ2_0 = 35, IQ4_NL_4_4 = 36, IQ4_NL_4_8 = 37, - IQ4_NL_8_8 = 38, + IQ4_NL_8_8 = 38 } From b3d04fe0a6737aa04a67722975e8bb1d9ce83b02 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Sun, 1 Jun 2025 04:12:10 +0300 Subject: [PATCH 05/14] test: fix tests --- src/gguf/insights/GgufInsights.ts | 10 +- .../functionaryModelGpuLayersOptions.test.ts | 144 +++++++++--------- .../functionary/gguf/ggufInsights.test.ts | 70 ++++----- .../llama3.2/promptCompletion.test.ts | 28 +++- .../stableCodeModelGpuLayersOptions.test.ts | 80 +++++----- 5 files changed, 172 insertions(+), 160 deletions(-) diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 5b01dd22..7758a7de 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -359,16 +359,14 @@ export class GgufInsights { const outputBufferSize = estimateOutput(sequences); - const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + ( - usingGpu - ? outputBufferSize - : 0 - ); + const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + outputBufferSize; const cpuRam = cpuKVCacheSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize; return { cpuRam, - gpuVram + gpuVram: usingGpu + ? 
gpuVram + : 0 }; } diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index 03d5942a..d8247dd9 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -114,7 +114,7 @@ describe("functionary", () => { freeRam: s1GB * 6 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } { const res = await resolveGpuLayers(0, { @@ -151,7 +151,7 @@ describe("functionary", () => { freeSwap: s1GB * 1 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(0, { @@ -255,7 +255,7 @@ describe("functionary", () => { freeRam: s1GB * 4.5 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } try { await resolveGpuLayers(16, { @@ -318,7 +318,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } }); @@ -336,14 +336,14 @@ describe("functionary", () => { } { const res = await resolveGpuLayers(16, { - totalVram: s1GB * 7, - freeVram: s1GB * 7, - totalRam: s1GB * 7, + totalVram: s1GB * 7.5, + freeVram: s1GB * 7.5, + totalRam: s1GB * 7.5, freeRam: s1GB * 5.5, - unifiedMemorySize: s1GB * 7 + unifiedMemorySize: s1GB * 7.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("2086"); + expect(res.contextSize).to.toMatchInlineSnapshot("1760"); } { const res = await resolveGpuLayers(16, { @@ -354,7 +354,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 5.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("6804"); + expect(res.contextSize).to.toMatchInlineSnapshot("5505"); } try { await resolveGpuLayers(16, { @@ -409,7 +409,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.contextSize).to.toMatchInlineSnapshot("4441"); } { const res = await resolveGpuLayers(16, { @@ -422,7 +422,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } }); @@ -608,7 +608,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(32, { @@ -619,7 +619,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + expect(res.contextSize).to.toMatchInlineSnapshot("1164"); } { const res = await resolveGpuLayers(32, { @@ -761,7 +761,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + expect(res.contextSize).to.toMatchInlineSnapshot("1164"); } { const res = await resolveGpuLayers(33, { 
@@ -772,7 +772,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(33, { @@ -783,7 +783,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } { const res = await resolveGpuLayers(33, { @@ -795,7 +795,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } }); @@ -809,7 +809,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("6251"); } { const res = await resolveGpuLayers(33, { @@ -820,18 +820,18 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.contextSize).to.toMatchInlineSnapshot("2974"); } { const res = await resolveGpuLayers(33, { totalVram: s1GB * 6, freeVram: s1GB * 6, totalRam: s1GB * 6, - freeRam: s1GB * 4.8, + freeRam: s1GB * 5.1, unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1142"); + expect(res.contextSize).to.toMatchInlineSnapshot("1336"); } try { await resolveGpuLayers(33, { @@ -908,7 +908,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("502"); + expect(res.contextSize).to.toMatchInlineSnapshot("472"); } { const res = await resolveGpuLayers("max", { @@ -918,7 +918,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1010"); + expect(res.contextSize).to.toMatchInlineSnapshot("898"); } }); @@ -952,7 +952,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("3606"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -961,8 +961,8 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); } { const res = await resolveGpuLayers("auto", { @@ -972,7 +972,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("10"); - expect(res.contextSize).to.toMatchInlineSnapshot("5856"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -981,7 +981,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -991,7 +991,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - 
expect(res.gpuLayers).to.toMatchInlineSnapshot("13"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1001,7 +1001,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1011,7 +1011,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1021,7 +1021,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1031,7 +1031,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1042,7 +1042,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("7977"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1052,7 +1052,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8043"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1062,7 +1062,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4754"); + expect(res.contextSize).to.toMatchInlineSnapshot("4721"); } { const res = await resolveGpuLayers("auto", { @@ -1072,7 +1072,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7964"); + expect(res.contextSize).to.toMatchInlineSnapshot("7998"); } { const res = await resolveGpuLayers("auto", { @@ -1095,7 +1095,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers("auto", { @@ -1105,7 +1105,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers("auto", { @@ -1115,7 +1115,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("3606"); + expect(res.contextSize).to.toMatchInlineSnapshot("5438"); } { const res = await resolveGpuLayers("auto", { @@ -1124,8 +1124,8 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + 
expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); } { const res = await resolveGpuLayers("auto", { @@ -1135,7 +1135,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("10"); - expect(res.contextSize).to.toMatchInlineSnapshot("5856"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1144,7 +1144,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1154,7 +1154,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("13"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1164,7 +1164,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1174,7 +1174,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1184,7 +1184,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1194,7 +1194,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1205,7 +1205,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("7977"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1215,7 +1215,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8043"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1225,7 +1225,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4754"); + expect(res.contextSize).to.toMatchInlineSnapshot("4721"); } { const res = await resolveGpuLayers("auto", { @@ -1235,7 +1235,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7964"); + expect(res.contextSize).to.toMatchInlineSnapshot("7998"); } { const res = await resolveGpuLayers("auto", { @@ -1324,7 +1324,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); 
expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1336,7 +1336,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1349,7 +1349,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } }); @@ -1362,7 +1362,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1372,7 +1372,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } try { await resolveGpuLayers({min: 2}, { @@ -1426,7 +1426,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1438,7 +1438,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1451,7 +1451,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } }); }); @@ -1480,7 +1480,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("5737"); + expect(res.contextSize).to.toMatchInlineSnapshot("6535"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1491,8 +1491,8 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("5246"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1503,7 +1503,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1515,7 +1515,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1569,7 +1569,7 @@ describe("functionary", () => { freeRam: s1GB * 7 }); 
expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("5737"); + expect(res.contextSize).to.toMatchInlineSnapshot("6535"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1580,8 +1580,8 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("5246"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1592,7 +1592,7 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1604,7 +1604,7 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } diff --git a/test/modelDependent/functionary/gguf/ggufInsights.test.ts b/test/modelDependent/functionary/gguf/ggufInsights.test.ts index 33e638d0..ee193e2c 100644 --- a/test/modelDependent/functionary/gguf/ggufInsights.test.ts +++ b/test/modelDependent/functionary/gguf/ggufInsights.test.ts @@ -124,7 +124,7 @@ describe("gguf", async () => { sequences: context.totalSequences, modelGpuLayers: ggufInsights.totalLayers }).gpuVram; - expect(toBytes(estimatedContextVramUsage)).toMatchInlineSnapshot('"1.02GB"'); + expect(toBytes(estimatedContextVramUsage)).toMatchInlineSnapshot("\"1.03GB\""); expect(Math.abs(contextVramUsageDiff - estimatedContextVramUsage)).to.be.lte(s300MB); await model.dispose(); @@ -168,7 +168,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.78GB", + "cpuRam": "1.75GB", "gpuVram": "0B", } `); @@ -179,7 +179,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.02GB", + "cpuRam": "1GB", "gpuVram": "0B", } `); @@ -190,7 +190,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "650.6MB", + "cpuRam": "643.07MB", "gpuVram": "0B", } `); @@ -201,7 +201,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "454.58MB", + "cpuRam": "451.07MB", "gpuVram": "0B", } `); @@ -213,8 +213,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1GB", - "gpuVram": "834.69MB", + "cpuRam": "1.71GB", + "gpuVram": "355.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -224,8 +224,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "512MB", - "gpuVram": "546.63MB", + "cpuRam": "1002.8MB", + "gpuVram": "315.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -235,8 +235,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "256MB", - "gpuVram": "402.6MB", + "cpuRam": "630.8MB", + "gpuVram": "295.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -246,8 +246,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "128MB", - 
"gpuVram": "330.58MB", + "cpuRam": "444.8MB", + "gpuVram": "285.25MB", } `); @@ -258,8 +258,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "544MB", - "gpuVram": "1.28GB", + "cpuRam": "1022.78MB", + "gpuVram": "1.05GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -269,8 +269,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "272MB", - "gpuVram": "786.67MB", + "cpuRam": "638.78MB", + "gpuVram": "679.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -280,8 +280,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "136MB", - "gpuVram": "522.64MB", + "cpuRam": "446.78MB", + "gpuVram": "479.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -291,8 +291,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "68MB", - "gpuVram": "390.63MB", + "cpuRam": "350.78MB", + "gpuVram": "379.25MB", } `); @@ -303,7 +303,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "32MB", + "cpuRam": "250.5MB", "gpuVram": "1.78GB", } `); @@ -314,8 +314,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "16MB", - "gpuVram": "1.02GB", + "cpuRam": "250.5MB", + "gpuVram": "1.03GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -325,8 +325,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "8MB", - "gpuVram": "650.69MB", + "cpuRam": "250.5MB", + "gpuVram": "667.52MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -336,8 +336,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "4MB", - "gpuVram": "454.67MB", + "cpuRam": "250.5MB", + "gpuVram": "475.52MB", } `); @@ -348,7 +348,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", + "cpuRam": "250.5MB", "gpuVram": "1.78GB", } `); @@ -359,8 +359,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "1.02GB", + "cpuRam": "250.5MB", + "gpuVram": "1.03GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -370,8 +370,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "650.69MB", + "cpuRam": "250.5MB", + "gpuVram": "667.52MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -381,8 +381,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "454.67MB", + "cpuRam": "250.5MB", + "gpuVram": "475.52MB", } `); }); diff --git a/test/modelDependent/llama3.2/promptCompletion.test.ts b/test/modelDependent/llama3.2/promptCompletion.test.ts index d667f31c..574524d9 100644 --- a/test/modelDependent/llama3.2/promptCompletion.test.ts +++ b/test/modelDependent/llama3.2/promptCompletion.test.ts @@ -1,5 +1,5 @@ import {describe, expect, test} from "vitest"; -import {LlamaChatSession} from "../../../src/index.js"; +import {LlamaChatSession, resolveChatWrapper} from "../../../src/index.js"; import {getModelFile} from "../../utils/modelFiles.js"; import {getTestLlama} from "../../utils/getTestLlama.js"; import {LlamaText} from 
"../../../src/utils/LlamaText.js"; @@ -20,16 +20,30 @@ describe("llama 3.2", () => { contextSize: 4096 }); const chatSession = new LlamaChatSession({ - contextSequence: context.getSequence() + contextSequence: context.getSequence(), + chatWrapper: resolveChatWrapper(model, { + customWrapperSettings: { + "llama3.2-lightweight": { + todayDate: new Date("2025-01-01T00:00:00Z") + } + } + }) }); const chatSession2 = new LlamaChatSession({ - contextSequence: context2.getSequence() + contextSequence: context2.getSequence(), + chatWrapper: resolveChatWrapper(model, { + customWrapperSettings: { + "llama3.2-lightweight": { + todayDate: new Date("2025-01-01T00:00:00Z") + } + } + }) }); const promptCompletion = await chatSession.completePrompt("Hi there!", { maxTokens: 50 }); - expect(promptCompletion).toMatchInlineSnapshot("\" I're looking for a new phone case. I want one that is waterproof and has a good camera.\""); + expect(promptCompletion).toMatchInlineSnapshot("\" I'm looking for a new phone case. I need a case that can protect your phone from scratches and drops.\""); expect(LlamaText.fromTokens(model.tokenizer, chatSession.sequence.contextTokens)).toMatchInlineSnapshot(` LlamaText([ new SpecialToken("BOS"), @@ -40,7 +54,7 @@ describe("llama 3.2", () => { Cutting Knowledge Date: December 2023", new SpecialToken("NL"), - "Today Date: 29 May 2025 + "Today Date: 1 Jan 2025 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", @@ -50,7 +64,7 @@ describe("llama 3.2", () => { new SpecialTokensText("<|end_header_id|>"), " - Hi there! I're looking for a new phone case. I want one that is waterproof and has a good camera.", + Hi there! I'm looking for a new phone case. I need a case that can protect your phone from scratches and drops.", ]) `); @@ -68,7 +82,7 @@ describe("llama 3.2", () => { Cutting Knowledge Date: December 2023", new SpecialToken("NL"), - "Today Date: 29 May 2025 + "Today Date: 1 Jan 2025 You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. 
If you don't know the answer to a question, don't share false information.", diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index 43145a6d..c2ad773f 100644 --- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -111,7 +111,7 @@ describe("stableCode", () => { freeVram: s1GB * 3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("7177"); + expect(res.contextSize).to.toMatchInlineSnapshot("8064"); } try { await resolveGpuLayers(16, { @@ -137,12 +137,12 @@ describe("stableCode", () => { // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left // to create a context - freeVram: s1GB * 0.2, + freeVram: s1GB * 1.4, ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("133"); + expect(res.contextSize).to.toMatchInlineSnapshot("138"); } @@ -174,7 +174,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } try { await resolveGpuLayers(32, { @@ -192,7 +192,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { @@ -223,7 +223,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } try { await resolveGpuLayers(33, { @@ -241,7 +241,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { @@ -303,7 +303,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { const res = await resolveGpuLayers("max", { @@ -311,7 +311,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers("max", { @@ -319,7 +319,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6866"); + expect(res.contextSize).to.toMatchInlineSnapshot("6979"); } { const res = await resolveGpuLayers("max", { @@ -327,7 +327,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7931"); + expect(res.contextSize).to.toMatchInlineSnapshot("8072"); } }); @@ -345,24 +345,24 @@ describe("stableCode", () => { totalVram: s1GB * 6, freeVram: s1GB * 0.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + expect(res.contextSize).to.toMatchInlineSnapshot("10864"); } { const res = await 
resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 0.8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); - expect(res.contextSize).to.toMatchInlineSnapshot("8724"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 1.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("6203"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("5"); + expect(res.contextSize).to.toMatchInlineSnapshot("8368"); } { const res = await resolveGpuLayers("auto", { @@ -370,7 +370,7 @@ describe("stableCode", () => { freeVram: s1GB * 2.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("1544"); + expect(res.contextSize).to.toMatchInlineSnapshot("1518"); } { const res = await resolveGpuLayers("auto", { @@ -378,7 +378,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3407"); + expect(res.contextSize).to.toMatchInlineSnapshot("3429"); } { const res = await resolveGpuLayers("auto", { @@ -386,7 +386,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3939"); + expect(res.contextSize).to.toMatchInlineSnapshot("3976"); } { const res = await resolveGpuLayers("auto", { @@ -394,7 +394,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4471"); + expect(res.contextSize).to.toMatchInlineSnapshot("4522"); } { const res = await resolveGpuLayers("auto", { @@ -402,7 +402,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5270"); + expect(res.contextSize).to.toMatchInlineSnapshot("5341"); } { const res = await resolveGpuLayers("auto", { @@ -410,7 +410,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers("auto", { @@ -418,7 +418,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6600"); + expect(res.contextSize).to.toMatchInlineSnapshot("6706"); } { const res = await resolveGpuLayers("auto", { @@ -426,7 +426,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7133"); + expect(res.contextSize).to.toMatchInlineSnapshot("7252"); } { const res = await resolveGpuLayers("auto", { @@ -434,7 +434,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7931"); + expect(res.contextSize).to.toMatchInlineSnapshot("8072"); } { const res = await resolveGpuLayers("auto", { @@ -442,7 +442,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.2 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - 
expect(res.contextSize).to.toMatchInlineSnapshot("8995"); + expect(res.contextSize).to.toMatchInlineSnapshot("9164"); } { const res = await resolveGpuLayers("auto", { @@ -450,7 +450,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("10592"); + expect(res.contextSize).to.toMatchInlineSnapshot("10802"); } { const res = await resolveGpuLayers("auto", { @@ -458,7 +458,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } }); @@ -504,7 +504,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("11658"); + expect(res.contextSize).to.toMatchInlineSnapshot("13255"); } try { await resolveGpuLayers({min: 16}, { @@ -522,7 +522,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -531,8 +531,8 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); - expect(res.contextSize).to.toMatchInlineSnapshot("8160"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); + expect(res.contextSize).to.toMatchInlineSnapshot("8249"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -542,7 +542,7 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("7177"); + expect(res.contextSize).to.toMatchInlineSnapshot("8064"); } }); @@ -565,7 +565,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -574,8 +574,8 @@ describe("stableCode", () => { totalVram: s1GB * 2, freeVram: s1GB * 1 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("9426"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("3"); + expect(res.contextSize).to.toMatchInlineSnapshot("5933"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -584,8 +584,8 @@ describe("stableCode", () => { totalVram: s1GB * 6, freeVram: s1GB * 4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); - expect(res.contextSize).to.toMatchInlineSnapshot("9167"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); + expect(res.contextSize).to.toMatchInlineSnapshot("9208"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -595,7 +595,7 @@ describe("stableCode", () => { freeVram: s1GB * 1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("9426"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); expect(res.contextSize).to.be.gte(contextSize); } { From e382413c532bdbad077b6cd3c5234a08f7efd029 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 2 Jun 2025 03:02:45 +0300 Subject: [PATCH 06/14] 
docs: generate a `llms.txt` file --- .vitepress/config.ts | 149 ++++-------------------------- .vitepress/config/getBlogPosts.ts | 46 +++++++++ .vitepress/config/sidebar.ts | 134 +++++++++++++++++++++++++++ package.json | 1 + 4 files changed, 201 insertions(+), 129 deletions(-) create mode 100644 .vitepress/config/getBlogPosts.ts create mode 100644 .vitepress/config/sidebar.ts diff --git a/.vitepress/config.ts b/.vitepress/config.ts index f3b7fab2..e6161519 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -12,11 +12,13 @@ import {rehype} from "rehype"; import sharp from "sharp"; import {GitChangelog, GitChangelogMarkdownSection} from "@nolebase/vitepress-plugin-git-changelog/vite"; import {buildEndGenerateOpenGraphImages} from "@nolebase/vitepress-plugin-og-image/vitepress"; +import llmstxt from "vitepress-plugin-llms"; import {Resvg, initWasm as initResvgWasm, type ResvgRenderOptions} from "@resvg/resvg-wasm"; import {BlogPageInfoPlugin} from "./config/BlogPageInfoPlugin.js"; -import {getApiReferenceSidebar} from "./config/apiReferenceSidebar.js"; import {ensureLocalImage} from "./utils/ensureLocalImage.js"; import {getExcerptFromMarkdownFile} from "./utils/getExcerptFromMarkdownFile.js"; +import {getVitepressSidebar, getVitepressSidebarWithBlog} from "./config/sidebar.js"; +import {getBlogPosts} from "./config/getBlogPosts.js"; import type {Element as HastElement, Parent} from "hast"; import type {Node as UnistNode} from "unist"; @@ -365,6 +367,12 @@ export default defineConfig({ }) as VitepressPlugin, BlogPageInfoPlugin({ include: (id) => id.includes(path.sep + "blog" + path.sep) && !id.endsWith(path.sep + "blog" + path.sep + "index.md") + }), + llmstxt({ + ignoreFiles: ["index.md"], + domain: resolveHref("/test").slice(0, -"/test".length) || undefined, + excludeBlog: false, + sidebar: () => getVitepressSidebarWithBlog(true, false) }) ], build: { @@ -434,6 +442,9 @@ export default defineConfig({ }, { text: "GitHub Discussions", link: "https://github.com/withcatai/node-llama-cpp/discussions" + }, { + text: "Awesome List", + link: "/guide/awesome" }, { text: "Contribute", link: "/guide/contributing" @@ -469,100 +480,14 @@ export default defineConfig({ } } }, - sidebar: { - "/guide/": [{ - text: "Guide", - base: "/guide", - items: [ - {text: "Getting Started", link: "/"}, - {text: "Chat Session", link: "/chat-session"}, - {text: "Chat Wrapper", link: "/chat-wrapper"}, - {text: "Grammar", link: "/grammar"}, - {text: "Function Calling", link: "/function-calling"}, - {text: "Embedding", link: "/embedding"}, - {text: "Text Completion", link: "/text-completion"}, - {text: "Choosing a Model", link: "/choosing-a-model"}, - {text: "Downloading Models", link: "/downloading-models"} - ] - }, { - text: "Advanced", - base: "/guide", - items: [ - {text: "Building From Source", link: "/building-from-source"}, - {text: "Metal Support", link: "/Metal"}, - {text: "CUDA Support", link: "/CUDA"}, - {text: "Vulkan Support", link: "/Vulkan"}, - {text: "Electron Support", link: "/electron"}, - {text: "Using in Docker", link: "/docker"}, - {text: "Using Tokens", link: "/tokens"}, - {text: "LlamaText", link: "/llama-text"}, - {text: "External Chat State", link: "/external-chat-state"}, - {text: "Token Bias", link: "/token-bias"}, - {text: "Objects Lifecycle", link: "/objects-lifecycle"}, - {text: "Chat Context Shift", link: "/chat-context-shift"}, - {text: "Batching", link: "/batching"}, - {text: "Token Prediction", link: "/token-prediction"}, - {text: "Low Level API", link: "/low-level-api"}, 
- {text: "Awesome List", link: "/awesome"}, - {text: "Troubleshooting", link: "/troubleshooting"}, - {text: "Tips and Tricks", link: "/tips-and-tricks"} - ] - }, { - text: "Contributing", - base: "/guide", - items: [ - {text: "Setting Up a Dev Environment", link: "/development"}, - {text: "Pull Request Guidelines", link: "/contributing"} - ] - }], - - "/cli/": [{ - text: "CLI", - base: "/cli", - link: "/", - items: [ - {text: "Init", link: "/init"}, - {text: "Chat", link: "/chat"}, - {text: "Pull", link: "/pull"}, - { - text: "Source", - link: "/source", - collapsed: true, - items: [ - {text: "Download", link: "/source/download"}, - {text: "Build", link: "/source/build"}, - {text: "Clear", link: "/source/clear"} - ] - }, - {text: "Complete", link: "/complete"}, - {text: "Infill", link: "/infill"}, - { - text: "Inspect", - link: "/inspect", - collapsed: true, - items: [ - {text: "GPU", link: "/inspect/gpu"}, - {text: "GGUF", link: "/inspect/gguf"}, - {text: "Measure", link: "/inspect/measure"}, - {text: "Estimate", link: "/inspect/estimate"} - ] - } - ] - }], - - "/api/": getApiReferenceSidebar() - }, + sidebar: getVitepressSidebar(), socialLinks: [ {icon: "npm", link: "https://www.npmjs.com/package/node-llama-cpp"}, {icon: "github", link: "https://github.com/withcatai/node-llama-cpp"} ] }, async buildEnd(siteConfig) { - const blogPosts = await createContentLoader("blog/*.md", { - excerpt: true, - render: true - }) - .load(); + const blogPosts = await getBlogPosts(false); async function loadSvgFontBuffers() { const interFontFilesDirectoryPath = path.join(require.resolve("@fontsource/inter"), "..", "files"); @@ -699,24 +624,7 @@ export default defineConfig({ ...siteConfig.site, themeConfig: { ...siteConfig.site.themeConfig, - sidebar: { - ...siteConfig.site.themeConfig.sidebar, - "/_blog/": { - text: "Blog", - link: "/blog/", - items: blogPosts - .filter((post) => { - const hasCoverImage = typeof post.frontmatter?.image === "string" || - typeof post.frontmatter?.image?.url === "string"; - - return !hasCoverImage; - }) - .map((post) => ({ - text: post.frontmatter.title, - link: post.url - })) - } - } + sidebar: await getVitepressSidebarWithBlog(true, true) } } }); @@ -744,22 +652,6 @@ export default defineConfig({ hub: "https://pubsubhubbub.appspot.com/" }); - blogPosts.sort((a, b) => { - const aDate = a.frontmatter.date - ? new Date(a.frontmatter.date) - : null; - const bDate = b.frontmatter.date - ? 
new Date(b.frontmatter.date) - : null; - - if (aDate == null) - return -1; - if (bDate == null) - return 1; - - return bDate.getTime() - aDate.getTime(); - }); - for (const {url, frontmatter, html, src, excerpt: originalExcerpt} of blogPosts) { const ogImageElement = findElementInHtml(html, (element) => ( element.tagName === "meta" && (element.properties?.name === "og:image" || element.properties?.property === "og:image") @@ -819,12 +711,6 @@ export default defineConfig({ await addOgImages(); - const indexPageIndex = blogPosts.findIndex((post) => post.url === "/blog/"); - if (indexPageIndex < 0) - throw new Error("Blog index page not found"); - - blogPosts.splice(indexPageIndex, 1); - await addBlogRssFeed(); try { @@ -853,6 +739,11 @@ export default defineConfig({ path.join(siteConfig.outDir, "logo.preview.avif"), 24 ); + + await Promise.all([ + fs.copy(path.join(siteConfig.outDir, "llms.txt"), path.join(siteConfig.outDir, "llms.md")), + fs.copy(path.join(siteConfig.outDir, "llms-full.txt"), path.join(siteConfig.outDir, "llms-full.md")) + ]); } }); diff --git a/.vitepress/config/getBlogPosts.ts b/.vitepress/config/getBlogPosts.ts new file mode 100644 index 00000000..1d4cb6a5 --- /dev/null +++ b/.vitepress/config/getBlogPosts.ts @@ -0,0 +1,46 @@ +import {ContentData, createContentLoader} from "vitepress"; + +let blogPosts: ContentData[] | undefined = undefined; +export async function getBlogPosts(includeIndex: boolean = false) { + if (includeIndex) + return await _getBlogPosts(); + + const blogPosts = (await _getBlogPosts()).slice(); + + const indexPageIndex = blogPosts.findIndex((post) => post.url === "/blog/"); + if (indexPageIndex < 0) + throw new Error("Blog index page not found"); + + blogPosts.splice(indexPageIndex, 1); + + return blogPosts; +} + +async function _getBlogPosts() { + if (blogPosts != null) + return blogPosts; + + blogPosts = await createContentLoader("blog/*.md", { + excerpt: true, + render: true + }) + .load(); + + blogPosts.sort((a, b) => { + const aDate = a.frontmatter.date + ? new Date(a.frontmatter.date) + : null; + const bDate = b.frontmatter.date + ? 
new Date(b.frontmatter.date) + : null; + + if (aDate == null) + return -1; + if (bDate == null) + return 1; + + return bDate.getTime() - aDate.getTime(); + }); + + return blogPosts; +} diff --git a/.vitepress/config/sidebar.ts b/.vitepress/config/sidebar.ts new file mode 100644 index 00000000..b151a56c --- /dev/null +++ b/.vitepress/config/sidebar.ts @@ -0,0 +1,134 @@ +import {DefaultTheme} from "vitepress"; +import {getApiReferenceSidebar} from "./apiReferenceSidebar.js"; +import {getBlogPosts} from "./getBlogPosts.js"; + +const apiReferenceSidebar = getApiReferenceSidebar(); + +export function getVitepressSidebar(blog?: DefaultTheme.SidebarItem[]): DefaultTheme.Sidebar { + return { + "/guide/": [{ + text: "Guide", + base: "/guide", + items: [ + {text: "Getting Started", link: "/"}, + {text: "Chat Session", link: "/chat-session"}, + {text: "Chat Wrapper", link: "/chat-wrapper"}, + {text: "Grammar", link: "/grammar"}, + {text: "Function Calling", link: "/function-calling"}, + {text: "Embedding", link: "/embedding"}, + {text: "Text Completion", link: "/text-completion"}, + {text: "Choosing a Model", link: "/choosing-a-model"}, + {text: "Downloading Models", link: "/downloading-models"} + ] + }, { + text: "Advanced", + base: "/guide", + items: [ + {text: "Building From Source", link: "/building-from-source"}, + {text: "Metal Support", link: "/Metal"}, + {text: "CUDA Support", link: "/CUDA"}, + {text: "Vulkan Support", link: "/Vulkan"}, + {text: "Electron Support", link: "/electron"}, + {text: "Using in Docker", link: "/docker"}, + {text: "Using Tokens", link: "/tokens"}, + {text: "LlamaText", link: "/llama-text"}, + {text: "External Chat State", link: "/external-chat-state"}, + {text: "Token Bias", link: "/token-bias"}, + {text: "Objects Lifecycle", link: "/objects-lifecycle"}, + {text: "Chat Context Shift", link: "/chat-context-shift"}, + {text: "Batching", link: "/batching"}, + {text: "Token Prediction", link: "/token-prediction"}, + {text: "Low Level API", link: "/low-level-api"}, + {text: "Awesome List", link: "/awesome"}, + {text: "Troubleshooting", link: "/troubleshooting"}, + {text: "Tips and Tricks", link: "/tips-and-tricks"} + ] + }, { + text: "Contributing", + base: "/guide", + items: [ + {text: "Setting Up a Dev Environment", link: "/development"}, + {text: "Pull Request Guidelines", link: "/contributing"} + ] + }], + + ...( + blog != null + ? 
{ + "/_blog/": [{ + text: "Blog", + link: "/blog/", + items: blog + }] + } + : {} + ), + + "/cli/": [{ + text: "CLI", + base: "/cli", + link: "/", + items: [ + {text: "Init", link: "/init"}, + {text: "Chat", link: "/chat"}, + {text: "Pull", link: "/pull"}, + { + text: "Source", + link: "/source", + collapsed: true, + items: [ + {text: "Download", link: "/source/download"}, + {text: "Build", link: "/source/build"}, + {text: "Clear", link: "/source/clear"} + ] + }, + {text: "Complete", link: "/complete"}, + {text: "Infill", link: "/infill"}, + { + text: "Inspect", + link: "/inspect", + collapsed: true, + items: [ + {text: "GPU", link: "/inspect/gpu"}, + {text: "GGUF", link: "/inspect/gguf"}, + {text: "Measure", link: "/inspect/measure"}, + {text: "Estimate", link: "/inspect/estimate"} + ] + } + ] + }], + + "/api/": structuredClone(apiReferenceSidebar) + }; +} + +export async function getSidebarBlogPostItems( + includeIndex: boolean = false, + onlyItemsWithoutCoverImage: boolean = false +): Promise { + const blogPosts = await getBlogPosts(includeIndex); + + return blogPosts + .filter((post) => { + if (!onlyItemsWithoutCoverImage) + return true; + + const hasCoverImage = typeof post.frontmatter?.image === "string" || + typeof post.frontmatter?.image?.url === "string"; + + return !hasCoverImage; + }) + .map((post) => ({ + text: post.frontmatter.title, + link: post.url + })); +} + +export async function getVitepressSidebarWithBlog( + includeIndex: boolean = false, + onlyItemsWithoutCoverImage: boolean = false +) { + const blogItems = await getSidebarBlogPostItems(includeIndex, onlyItemsWithoutCoverImage); + + return getVitepressSidebar(blogItems); +} diff --git a/package.json b/package.json index 79f9834f..0636af08 100644 --- a/package.json +++ b/package.json @@ -210,6 +210,7 @@ "stdout-update": "^4.0.1", "strip-ansi": "^7.1.0", "validate-npm-package-name": "^6.0.0", + "vitepress-plugin-llms": "^1.3.4", "which": "^5.0.0", "yargs": "^17.7.2" }, From 262999f84498d3b5c8895720f5084a943a2821c7 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 2 Jun 2025 03:05:02 +0300 Subject: [PATCH 07/14] fix: update `ipull` --- package-lock.json | 308 ++++++++++++++++++++++++++++++++++++++++++++-- package.json | 6 +- 2 files changed, 298 insertions(+), 16 deletions(-) diff --git a/package-lock.json b/package-lock.json index 72578ac6..ad8c83c4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,7 +22,7 @@ "filenamify": "^6.0.0", "fs-extra": "^11.3.0", "ignore": "^7.0.4", - "ipull": "^3.9.2", + "ipull": "^4.0.3", "is-unicode-supported": "^2.1.0", "lifecycle-utils": "^2.0.0", "log-symbols": "^7.0.0", @@ -62,7 +62,7 @@ "@types/bytes": "^3.1.5", "@types/cross-spawn": "^6.0.6", "@types/fs-extra": "^11.0.4", - "@types/node": "^22.15.17", + "@types/node": "^20.17.50", "@types/proper-lockfile": "^4.1.4", "@types/semver": "^7.7.0", "@types/validate-npm-package-name": "^4.0.2", @@ -91,6 +91,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", + "vitepress-plugin-llms": "^1.3.4", "vitest": "^3.1.3", "zx": "^8.5.4" }, @@ -2231,6 +2232,15 @@ "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==", "license": "MIT" }, + "node_modules/@lukeed/csprng": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@lukeed/csprng/-/csprng-1.1.0.tgz", + "integrity": "sha512-Z7C/xXCiGWsg0KuKsHTKJxbWhpI3Vs5GwLfOean7MGyVFGqdRgBbAjOCh6u4bbjPc/8MJ2pZmK/0DLdCbivLDA==", + "license": "MIT", + "engines": { + "node": ">=8" + } + }, 
"node_modules/@modelcontextprotocol/sdk": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.11.1.tgz", @@ -4420,13 +4430,13 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.15.17", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.17.tgz", - "integrity": "sha512-wIX2aSZL5FE+MR0JlvF87BNVrtFWf6AE6rxSE9X7OwnVvoyCQjpzSRJ+M87se/4QCkCiebQAqrJ0y6fwIyi7nw==", + "version": "20.17.50", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.17.50.tgz", + "integrity": "sha512-Mxiq0ULv/zo1OzOhwPqOA13I81CV/W3nvd3ChtQZRT5Cwz3cr0FKo/wMSsbTqL3EXpaBAEQhva2B8ByRkOIh9A==", "dev": true, "license": "MIT", "dependencies": { - "undici-types": "~6.21.0" + "undici-types": "~6.19.2" } }, "node_modules/@types/normalize-package-data": { @@ -5996,6 +6006,24 @@ "node": "*" } }, + "node_modules/byte-size": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/byte-size/-/byte-size-9.0.1.tgz", + "integrity": "sha512-YLe9x3rabBrcI0cueCdLS2l5ONUKywcRpTs02B8KP9/Cimhj7o3ZccGrPnRvcbyHMbb7W79/3MUJl7iGgTXKEw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.17" + }, + "peerDependencies": { + "@75lb/nature": "latest" + }, + "peerDependenciesMeta": { + "@75lb/nature": { + "optional": true + } + } + }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -7500,6 +7528,23 @@ "node": ">= 12.20.55" } }, + "node_modules/electron/node_modules/@types/node": { + "version": "22.15.21", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.21.tgz", + "integrity": "sha512-EV/37Td6c+MgKAbkcLG6vqZ2zEYHD7bvSrzqqs2RIhbA6w3x+Dqz8MZM3sP6kGTeLrdoOgKZe+Xja7tUB2DNkQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/electron/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, "node_modules/emoji-regex": { "version": "10.4.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", @@ -8945,6 +8990,20 @@ "reusify": "^1.0.4" } }, + "node_modules/fault": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/fault/-/fault-2.0.1.tgz", + "integrity": "sha512-WtySTkS4OKev5JtpHXnib4Gxiurzh5NCGvWrFaZ34m6JehfTUhKZvn9njTfw48t6JumVQOmrKqpmGcdwxnhqBQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "format": "^0.2.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/fd-slicer": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz", @@ -9238,6 +9297,15 @@ "node": ">= 6" } }, + "node_modules/format": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", + "integrity": "sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww==", + "dev": true, + "engines": { + "node": ">=0.4.x" + } + }, "node_modules/forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -10550,9 +10618,9 @@ } }, "node_modules/ipull": { - "version": "3.9.2", - "resolved": "https://registry.npmjs.org/ipull/-/ipull-3.9.2.tgz", - "integrity": 
"sha512-YbCDsqcf0ytc3b8304ygBlvRtKJTvyygkQX2xcmPkih6vdVKbRw13pDdtSR+vEqLql3owyuPj9m6iT6IfwFaCg==", + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/ipull/-/ipull-4.0.3.tgz", + "integrity": "sha512-mPcOnm1hX1GTL4/f1C5IQFbo1uxqKihZX8KbaHWWnJ7NW4SKQaelRAVy9iVb8XgugMnlEo6TQVBrzCbOvswbsA==", "license": "MIT", "dependencies": { "@tinyhttp/content-disposition": "^2.2.0", @@ -10573,7 +10641,8 @@ "sleep-promise": "^9.1.0", "slice-ansi": "^7.1.0", "stdout-update": "^4.0.1", - "strip-ansi": "^7.1.0" + "strip-ansi": "^7.1.0", + "uid": "^2.0.2" }, "bin": { "ipull": "dist/cli/cli.js" @@ -11808,6 +11877,16 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/markdown-title": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/markdown-title/-/markdown-title-1.0.2.tgz", + "integrity": "sha512-MqIQVVkz+uGEHi3TsHx/czcxxCbRIL7sv5K5DnYw/tI+apY54IbPefV/cmgxp6LoJSEx/TqcHdLs/298afG5QQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/marked": { "version": "12.0.2", "resolved": "https://registry.npmjs.org/marked/-/marked-12.0.2.tgz", @@ -11922,6 +12001,38 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/mdast-util-frontmatter": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-frontmatter/-/mdast-util-frontmatter-2.0.1.tgz", + "integrity": "sha512-LRqI9+wdgC25P0URIJY9vwocIzCcksduHQ9OF2joxQoyTNVduwLAFUzjoopuRJbJAReaKrNQKAZKL3uCMugWJA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "escape-string-regexp": "^5.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "micromark-extension-frontmatter": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-frontmatter/node_modules/escape-string-regexp": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", + "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/mdast-util-gfm": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", @@ -12242,6 +12353,23 @@ "micromark-util-types": "^2.0.0" } }, + "node_modules/micromark-extension-frontmatter": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-frontmatter/-/micromark-extension-frontmatter-2.0.0.tgz", + "integrity": "sha512-C4AkuM3dA58cgZha7zVnuVxBhDsbttIMiytjgsM2XbHAB2faRVaHRle40558FBN+DJcrLNCoqG5mlrpdU4cRtg==", + "dev": true, + "license": "MIT", + "dependencies": { + "fault": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/micromark-factory-destination": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", @@ -12661,6 +12789,19 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, + "node_modules/millify": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/millify/-/millify-6.1.0.tgz", + "integrity": 
"sha512-H/E3J6t+DQs/F2YgfDhxUVZz/dF8JXPPKTLHL/yHCcLZLtCXJDUaqvhJXQwqOVBvbyNn4T0WjLpIHd7PAw7fBA==", + "dev": true, + "license": "MIT", + "dependencies": { + "yargs": "^17.0.1" + }, + "bin": { + "millify": "bin/millify" + } + }, "node_modules/mime": { "version": "4.0.6", "resolved": "https://registry.npmjs.org/mime/-/mime-4.0.6.tgz", @@ -17122,6 +17263,73 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/remark": { + "version": "15.0.1", + "resolved": "https://registry.npmjs.org/remark/-/remark-15.0.1.tgz", + "integrity": "sha512-Eht5w30ruCXgFmxVUSlNWQ9iiimq07URKeFS3hNc8cUWy1llX4KDWfyEDZRycMc+znsN9Ux5/tJ/BFdgdOwA3A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-frontmatter": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/remark-frontmatter/-/remark-frontmatter-5.0.0.tgz", + "integrity": "sha512-XTFYvNASMe5iPN0719nPrdItC9aU0ssC4v14mH1BCi1u0n1gAocqcujWUrByftZTbLhRtiKRyjYTSIOcr69UVQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-frontmatter": "^2.0.0", + "micromark-extension-frontmatter": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-parse": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", + "integrity": "sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-from-markdown": "^2.0.0", + "micromark-util-types": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-stringify": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz", + "integrity": "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-to-markdown": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -19266,6 +19474,13 @@ "node": ">=0.6" } }, + "node_modules/tokenx": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/tokenx/-/tokenx-0.4.1.tgz", + "integrity": "sha512-LCMniis0WsHel07xh3K9OIt5c9Xla1awtOoWBmUHZBQR7pvTvgGFuYpLiCZWohXPC1YuZORnN0+fCVYI/ie8Jg==", + "dev": true, + "license": "MIT" + }, "node_modules/totalist": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/totalist/-/totalist-3.0.1.tgz", @@ -19661,6 +19876,18 @@ "node": ">=0.8.0" } }, + "node_modules/uid": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/uid/-/uid-2.0.2.tgz", + "integrity": "sha512-u3xV3X7uzvi5b1MncmZo3i2Aw222Zk1keqLA1YkHldREkAhAqi65wuPfe7lHx8H/Wzy+8CE7S7uS3jekIM5s8g==", + "license": "MIT", + "dependencies": { + "@lukeed/csprng": "^1.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/unbox-primitive": { "version": "1.1.0", 
"resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", @@ -19688,9 +19915,9 @@ "license": "MIT" }, "node_modules/undici-types": { - "version": "6.21.0", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", - "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "version": "6.19.8", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.19.8.tgz", + "integrity": "sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==", "dev": true, "license": "MIT" }, @@ -19781,6 +20008,22 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/unist-util-remove": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/unist-util-remove/-/unist-util-remove-4.0.0.tgz", + "integrity": "sha512-b4gokeGId57UVRX/eVKej5gXqGlc9+trkORhFJpu9raqZkZhU0zm8Doi05+HaiBsMEIJowL+2WtQ5ItjsngPXg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/unist-util-stringify-position": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", @@ -20127,6 +20370,45 @@ } } }, + "node_modules/vitepress-plugin-llms": { + "version": "1.3.4", + "resolved": "https://registry.npmjs.org/vitepress-plugin-llms/-/vitepress-plugin-llms-1.3.4.tgz", + "integrity": "sha512-owEPumKy5syjRRG0OSA2635NoeR/U+eiLIjurLTUMXxdmtJ0h6OrTLqvIFPYTV0gSQfaWY/owKdTxjZxv2n2bQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "byte-size": "^9.0.1", + "gray-matter": "^4.0.3", + "markdown-title": "^1.0.2", + "millify": "^6.1.0", + "minimatch": "^10.0.1", + "picocolors": "^1.1.1", + "remark": "^15.0.1", + "remark-frontmatter": "^5.0.0", + "tokenx": "^0.4.1", + "unist-util-remove": "^4.0.0", + "unist-util-visit": "^5.0.0" + }, + "funding": { + "url": "https://github.com/okineadev/vitepress-plugin-llms?sponsor=1" + } + }, + "node_modules/vitepress-plugin-llms/node_modules/minimatch": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz", + "integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": "20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/vitepress/node_modules/@shikijs/core": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/@shikijs/core/-/core-2.2.0.tgz", diff --git a/package.json b/package.json index 0636af08..b6b2493e 100644 --- a/package.json +++ b/package.json @@ -149,7 +149,7 @@ "@types/bytes": "^3.1.5", "@types/cross-spawn": "^6.0.6", "@types/fs-extra": "^11.0.4", - "@types/node": "^22.15.17", + "@types/node": "^20.17.50", "@types/proper-lockfile": "^4.1.4", "@types/semver": "^7.7.0", "@types/validate-npm-package-name": "^4.0.2", @@ -178,6 +178,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", + "vitepress-plugin-llms": "^1.3.4", "vitest": "^3.1.3", "zx": "^8.5.4" }, @@ -194,7 +195,7 @@ "filenamify": "^6.0.0", "fs-extra": "^11.3.0", "ignore": "^7.0.4", - "ipull": "^3.9.2", + "ipull": "^4.0.3", "is-unicode-supported": "^2.1.0", 
"lifecycle-utils": "^2.0.0", "log-symbols": "^7.0.0", @@ -210,7 +211,6 @@ "stdout-update": "^4.0.1", "strip-ansi": "^7.1.0", "validate-npm-package-name": "^6.0.0", - "vitepress-plugin-llms": "^1.3.4", "which": "^5.0.0", "yargs": "^17.7.2" }, From 39373a6997a2ea0c6cb69b4f89de39d8fea116e9 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 01:04:13 +0300 Subject: [PATCH 08/14] docs: generate a `llms.txt` file --- package-lock.json | 14 +++++++------- package.json | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/package-lock.json b/package-lock.json index ad8c83c4..871a8067 100644 --- a/package-lock.json +++ b/package-lock.json @@ -91,7 +91,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", - "vitepress-plugin-llms": "^1.3.4", + "vitepress-plugin-llms": "https://pkg.pr.new/vitepress-plugin-llms@51", "vitest": "^3.1.3", "zx": "^8.5.4" }, @@ -19475,9 +19475,9 @@ } }, "node_modules/tokenx": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/tokenx/-/tokenx-0.4.1.tgz", - "integrity": "sha512-LCMniis0WsHel07xh3K9OIt5c9Xla1awtOoWBmUHZBQR7pvTvgGFuYpLiCZWohXPC1YuZORnN0+fCVYI/ie8Jg==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/tokenx/-/tokenx-1.0.1.tgz", + "integrity": "sha512-MhOngUHRuVE0CHP4cNEZ/XpdXETFL65nJpEvoTW+VYPuXsT/MTeNj+UNnekNsnxecmj2DEvUYPebqz+CsPTUSg==", "dev": true, "license": "MIT" }, @@ -20372,8 +20372,8 @@ }, "node_modules/vitepress-plugin-llms": { "version": "1.3.4", - "resolved": "https://registry.npmjs.org/vitepress-plugin-llms/-/vitepress-plugin-llms-1.3.4.tgz", - "integrity": "sha512-owEPumKy5syjRRG0OSA2635NoeR/U+eiLIjurLTUMXxdmtJ0h6OrTLqvIFPYTV0gSQfaWY/owKdTxjZxv2n2bQ==", + "resolved": "https://pkg.pr.new/vitepress-plugin-llms@51", + "integrity": "sha512-FTyNYyx1jVbKae/raJLgDTgMaHSmY51B1nbokeC4KAhXMe413eGSexNIdvnCHXf9U1t92VlLajJ5S9E7adDoOQ==", "dev": true, "license": "MIT", "dependencies": { @@ -20385,7 +20385,7 @@ "picocolors": "^1.1.1", "remark": "^15.0.1", "remark-frontmatter": "^5.0.0", - "tokenx": "^0.4.1", + "tokenx": "^1.0.0", "unist-util-remove": "^4.0.0", "unist-util-visit": "^5.0.0" }, diff --git a/package.json b/package.json index b6b2493e..4c1a6865 100644 --- a/package.json +++ b/package.json @@ -178,7 +178,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", - "vitepress-plugin-llms": "^1.3.4", + "vitepress-plugin-llms": "https://pkg.pr.new/vitepress-plugin-llms@51", "vitest": "^3.1.3", "zx": "^8.5.4" }, From 06c0d60c984662ce84835073bd3f8807121abdcf Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 01:14:30 +0300 Subject: [PATCH 09/14] fix: naming consistency --- docs/guide/chat-session.md | 4 ++-- src/cli/commands/ChatCommand.ts | 16 ++++++++-------- ...ingBudget.test.ts => reasoningBudget.test.ts} | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 deletions(-) rename test/modelDependent/qwen3-0.6b/{thinkingBudget.test.ts => reasoningBudget.test.ts} (88%) diff --git a/docs/guide/chat-session.md b/docs/guide/chat-session.md index ec9c8541..a6a1a097 100644 --- a/docs/guide/chat-session.md +++ b/docs/guide/chat-session.md @@ -899,8 +899,8 @@ const fullResponse = a1.response console.log("Full response: " + fullResponse); ``` -## Set Thinking Budget {#thinking-budget} -You can set a thinking budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments). 
+## Set Reasoning Budget {#reasoning-budget} +You can set a reasoning budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments). ```typescript import { getLlama, LlamaChatSession, resolveModelFile, Token diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index d1ebc8e1..a23e58e5 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -62,7 +62,7 @@ type ChatCommand = { repeatFrequencyPenalty?: number, repeatPresencePenalty?: number, maxTokens: number, - thoughtBudget?: number, + reasoningBudget?: number, noHistory: boolean, environmentFunctions: boolean, tokenPredictionDraftModel?: string, @@ -263,8 +263,8 @@ export const ChatCommand: CommandModule = { default: 0, description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size" }) - .option("thoughtBudget", { - alias: ["tb", "thinkingBudget", "reasoningBudget"], + .option("reasoningBudget", { + alias: ["tb", "thinkingBudget", "thoughtsBudget"], type: "number", default: -1, defaultDescription: "Unlimited", @@ -326,7 +326,7 @@ export const ChatCommand: CommandModule = { promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }) { try { @@ -335,7 +335,7 @@ export const ChatCommand: CommandModule = { batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, + maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }); } catch (err) { @@ -352,12 +352,12 @@ async function RunChat({ contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, thoughtBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; - if (thoughtBudget === -1) thoughtBudget = undefined; + if (reasoningBudget === -1) reasoningBudget = undefined; const headers = resolveHeaderFlag(headerArg); const trimWhitespace = !noTrimWhitespace; @@ -696,7 +696,7 @@ async function RunChat({ signal: abortController.signal, stopOnAbortSignal: true, budgets: { - thoughtTokens: thoughtBudget + thoughtTokens: 
reasoningBudget }, repeatPenalty: { penalty: repeatPenalty, diff --git a/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts b/test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts similarity index 88% rename from test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts rename to test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts index 35522794..78cf5480 100644 --- a/test/modelDependent/qwen3-0.6b/thinkingBudget.test.ts +++ b/test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts @@ -4,8 +4,8 @@ import {getModelFile} from "../../utils/modelFiles.js"; import {getTestLlama} from "../../utils/getTestLlama.js"; describe("qwen3 0.6b", () => { - describe("thinking budget", () => { - test("doesn't exceed thinking budget", {timeout: 1000 * 60 * 60 * 2}, async () => { + describe("reasoning budget", () => { + test("doesn't exceed reasoning budget", {timeout: 1000 * 60 * 60 * 2}, async () => { const modelPath = await getModelFile("Qwen3-0.6B-Q8_0.gguf"); const llama = await getTestLlama(); @@ -22,9 +22,9 @@ describe("qwen3 0.6b", () => { const initialChatHistory = chatSession.getChatHistory(); async function promptWithBudget({ - prompt, maxTokens, thinkingBudget + prompt, maxTokens, reasoningBudget }: { - prompt: string, maxTokens: number, thinkingBudget?: number + prompt: string, maxTokens: number, reasoningBudget?: number }) { let thoughtTokens = 0; let totalTokens = 0; @@ -33,7 +33,7 @@ describe("qwen3 0.6b", () => { const {responseText, response} = await chatSession.promptWithMeta(prompt, { maxTokens, budgets: { - thoughtTokens: thinkingBudget + thoughtTokens: reasoningBudget }, onResponseChunk(chunk) { if (chunk.type === "segment" && chunk.segmentType === "thought") { @@ -57,7 +57,7 @@ describe("qwen3 0.6b", () => { const res1 = await promptWithBudget({ prompt: "Where do llamas come from?", - thinkingBudget: 10, + reasoningBudget: 10, maxTokens: 20 }); expect(res1.thoughtTokens).to.be.gt(1); @@ -67,7 +67,7 @@ describe("qwen3 0.6b", () => { const res2 = await promptWithBudget({ prompt: "Where do llamas come from?", - thinkingBudget: 0, + reasoningBudget: 0, maxTokens: 20 }); expect(res2.thoughtTokens).to.be.eq(0); @@ -76,7 +76,7 @@ describe("qwen3 0.6b", () => { const res3 = await promptWithBudget({ prompt: "Where do llamas come from?", - thinkingBudget: 20, + reasoningBudget: 20, maxTokens: 20 }); expect(res3.thoughtTokens).to.be.eq(res3.totalTokens); From 119652618b0f575f04749988bfd5b9c4c6afb1b3 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 03:36:08 +0300 Subject: [PATCH 10/14] fix: bugs --- llama/addon/AddonContext.cpp | 2 +- src/evaluator/LlamaContext/LlamaContext.ts | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index a64e3ada..775c2053 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -403,7 +403,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Uint32Value(); + context_params.n_batch = options.Get("batchSize").As().Uint32Value() + 1; // +1 to handle edge cases with SWA KV cache context_params.n_ubatch = context_params.n_batch; // the batch queue is managed in the JS side, so there's no need for managing it on the C++ side } diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index f1c263be..d3af97e0 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -1055,7 +1055,9 @@ export class LlamaContextSequence { * * 
This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models). * - * When SWA is used, this index will usually be `Math.max(0, .nextTokenIndex - .model.fileInsights.swaSize)` or larger. + * When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger. + * + * When the KV cache is empty, this index will be `-1`. * * You can disable SWA by setting the `swaFullCache` option to `true` when creating a context. */ @@ -1207,6 +1209,8 @@ export class LlamaContextSequence { ) { this._ensureNotDisposed(); + let awaitPromise: Promise | undefined; + await withLock(this._context, "context", async () => { this._ensureNotDisposed(); @@ -1250,7 +1254,7 @@ export class LlamaContextSequence { const minKvCachePosition = (this._contextTokens.length === 0 && this._loadedTokenPredictions.length === 0) ? 0 - : this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId); + : Math.max(0, this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId)); if (resolvedRanges[0] != null && resolvedRanges[0].start <= minKvCachePosition) // we have to drop the cache and reevaluate the sequence due to missing KV cache deletionSuccessful = false; @@ -1310,8 +1314,12 @@ export class LlamaContextSequence { this._nextTokenIndex = 0; this._context._ctx.disposeSequence(this._sequenceId); - await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock}); + // wait for the evaluation outside the "context" lock to avoid deadlocks + awaitPromise = this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock}); }); + + if (awaitPromise != null) + await awaitPromise; } /** From 2a0c539a90cf51906fe70b8cec1745a221678517 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 03:42:48 +0300 Subject: [PATCH 11/14] test: fix test --- test/modelDependent/llama3.2/sequenceState.test.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index 151fc4f3..e6267045 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -34,10 +34,10 @@ describe("llama 3.2", () => { res1, res2 ] = await Promise.all([ - chatSession1.prompt("Remember: locks are not doors", {maxTokens: 6}), + chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}), chatSession2.prompt("Remember: giraffes are not elephants", {maxTokens: 5}) ]); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever phrase.\""); + expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); expect(res2).to.toMatchInlineSnapshot('"I appreciate the reminder."'); @@ -47,8 +47,8 @@ describe("llama 3.2", () => { test.onTestFinished(() => fs.remove(stateFile1Path)); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); - expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.49MB"'); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); + expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot("\"11.27MB\""); const stateFile2Path = await getTempTestFilePath("state2"); @@ -68,7 +68,7 @@ describe("llama 3.2", () => { expect(contextSequence1TokensState1).toMatchInlineSnapshot(` { "usedInputTokens": 99, - "usedOutputTokens": 6, + "usedOutputTokens": 4, } `); @@ -91,7 +91,7 @@ 
describe("llama 3.2", () => { await contextSequence1.loadStateFromFile(stateFile1Path, {acceptRisk: true}); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); const contextSequence1TokensState3 = contextSequence1.tokenMeter.getState(); expect(TokenMeter.diff(contextSequence1TokensState3, contextSequence1TokensState2)).toMatchInlineSnapshot(` From 8d85100bf190153fa35f24d8e4186b665c5203fc Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 03:51:58 +0300 Subject: [PATCH 12/14] fix: bugs --- llama/addon/AddonContext.cpp | 2 +- src/evaluator/LlamaContext/LlamaContext.ts | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 775c2053..a64e3ada 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -403,7 +403,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Uint32Value() + 1; // +1 to handle edge cases with SWA KV cache + context_params.n_batch = options.Get("batchSize").As().Uint32Value(); context_params.n_ubatch = context_params.n_batch; // the batch queue is managed in the JS side, so there's no need for managing it on the C++ side } diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index d3af97e0..8a5cff98 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -125,7 +125,11 @@ export class LlamaContext { this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells - batchSize: this._batchSize, + batchSize: this._batchSize + ( + (!this._swaFullCache && this.model.fileInsights.swaSize != null && this.model.fileInsights.swaSize > 0) + ? 
1 // +1 to handle edge cases with SWA KV cache + : 0 + ), sequences: this._totalSequences, flashAttention: this._flashAttention, threads: this._idealThreads, From 5c95321e447e25cbdd4ff9b46a2de02005b67db2 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 4 Jun 2025 23:33:06 +0300 Subject: [PATCH 13/14] chore: add internal debug method --- llama/addon/AddonContext.cpp | 14 ++++++++++++++ llama/addon/AddonContext.h | 1 + src/bindings/AddonTypes.ts | 1 + src/bindings/getLlama.ts | 1 + 4 files changed, 17 insertions(+) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index a64e3ada..574dd79f 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -638,6 +638,19 @@ Napi::Value AddonContext::GetSequenceKvCacheMinPosition(const Napi::CallbackInfo return Napi::Number::New(info.Env(), minPosition); } +Napi::Value AddonContext::GetSequenceKvCacheMaxPosition(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + int32_t sequenceId = info[0].As().Int32Value(); + + + const auto maxPosition = llama_kv_self_seq_pos_max(ctx, sequenceId); + + return Napi::Number::New(info.Env(), maxPosition); +} Napi::Value AddonContext::DecodeBatch(const Napi::CallbackInfo& info) { AddonContextDecodeBatchWorker* worker = new AddonContextDecodeBatchWorker(info.Env(), this); worker->Queue(); @@ -945,6 +958,7 @@ void AddonContext::init(Napi::Object exports) { InstanceMethod("removeTokenCellsFromSequence", &AddonContext::RemoveTokenCellsFromSequence), InstanceMethod("shiftSequenceTokenCells", &AddonContext::ShiftSequenceTokenCells), InstanceMethod("getSequenceKvCacheMinPosition", &AddonContext::GetSequenceKvCacheMinPosition), + InstanceMethod("getSequenceKvCacheMaxPosition", &AddonContext::GetSequenceKvCacheMaxPosition), InstanceMethod("decodeBatch", &AddonContext::DecodeBatch), InstanceMethod("sampleToken", &AddonContext::SampleToken), InstanceMethod("getEmbedding", &AddonContext::GetEmbedding), diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h index 0edbedc7..7e661f12 100644 --- a/llama/addon/AddonContext.h +++ b/llama/addon/AddonContext.h @@ -37,6 +37,7 @@ class AddonContext : public Napi::ObjectWrap { Napi::Value RemoveTokenCellsFromSequence(const Napi::CallbackInfo& info); Napi::Value ShiftSequenceTokenCells(const Napi::CallbackInfo& info); Napi::Value GetSequenceKvCacheMinPosition(const Napi::CallbackInfo& info); + Napi::Value GetSequenceKvCacheMaxPosition(const Napi::CallbackInfo& info); Napi::Value DecodeBatch(const Napi::CallbackInfo& info); Napi::Value SampleToken(const Napi::CallbackInfo& info); diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index e74a1132..a2f06ae9 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -146,6 +146,7 @@ export type AddonContext = { shiftSequenceTokenCells(sequenceId: number, startPos: number, endPos: number, shiftDelta: number): void, getSequenceKvCacheMinPosition(sequenceId: number): number, + getSequenceKvCacheMaxPosition(sequenceId: number): number, getEmbedding(inputTokensLength: number, maxVectorSize?: number): Float64Array, getStateSize(): number, getThreads(): number, diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index faf626b4..8ba71a22 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -365,6 +365,7 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp return 
getLlamaForOptions(options ?? {}); } +// internal export async function getLlamaForOptions({ gpu = defaultLlamaCppGpuSupport, logLevel = defaultLlamaCppLogLevel, From 83dfa5e2d420b61a2dcf21950e76b6046314760e Mon Sep 17 00:00:00 2001 From: Gilad S Date: Thu, 5 Jun 2025 00:28:04 +0300 Subject: [PATCH 14/14] chore: module versions --- package-lock.json | 32 +++++--------------------------- package.json | 2 +- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/package-lock.json b/package-lock.json index 871a8067..2a79518a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,7 +22,7 @@ "filenamify": "^6.0.0", "fs-extra": "^11.3.0", "ignore": "^7.0.4", - "ipull": "^4.0.3", + "ipull": "^3.9.2", "is-unicode-supported": "^2.1.0", "lifecycle-utils": "^2.0.0", "log-symbols": "^7.0.0", @@ -2232,15 +2232,6 @@ "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==", "license": "MIT" }, - "node_modules/@lukeed/csprng": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@lukeed/csprng/-/csprng-1.1.0.tgz", - "integrity": "sha512-Z7C/xXCiGWsg0KuKsHTKJxbWhpI3Vs5GwLfOean7MGyVFGqdRgBbAjOCh6u4bbjPc/8MJ2pZmK/0DLdCbivLDA==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, "node_modules/@modelcontextprotocol/sdk": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.11.1.tgz", @@ -10618,9 +10609,9 @@ } }, "node_modules/ipull": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/ipull/-/ipull-4.0.3.tgz", - "integrity": "sha512-mPcOnm1hX1GTL4/f1C5IQFbo1uxqKihZX8KbaHWWnJ7NW4SKQaelRAVy9iVb8XgugMnlEo6TQVBrzCbOvswbsA==", + "version": "3.9.2", + "resolved": "https://registry.npmjs.org/ipull/-/ipull-3.9.2.tgz", + "integrity": "sha512-YbCDsqcf0ytc3b8304ygBlvRtKJTvyygkQX2xcmPkih6vdVKbRw13pDdtSR+vEqLql3owyuPj9m6iT6IfwFaCg==", "license": "MIT", "dependencies": { "@tinyhttp/content-disposition": "^2.2.0", @@ -10641,8 +10632,7 @@ "sleep-promise": "^9.1.0", "slice-ansi": "^7.1.0", "stdout-update": "^4.0.1", - "strip-ansi": "^7.1.0", - "uid": "^2.0.2" + "strip-ansi": "^7.1.0" }, "bin": { "ipull": "dist/cli/cli.js" @@ -19876,18 +19866,6 @@ "node": ">=0.8.0" } }, - "node_modules/uid": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/uid/-/uid-2.0.2.tgz", - "integrity": "sha512-u3xV3X7uzvi5b1MncmZo3i2Aw222Zk1keqLA1YkHldREkAhAqi65wuPfe7lHx8H/Wzy+8CE7S7uS3jekIM5s8g==", - "license": "MIT", - "dependencies": { - "@lukeed/csprng": "^1.0.0" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/unbox-primitive": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", diff --git a/package.json b/package.json index 4c1a6865..cc455955 100644 --- a/package.json +++ b/package.json @@ -195,7 +195,7 @@ "filenamify": "^6.0.0", "fs-extra": "^11.3.0", "ignore": "^7.0.4", - "ipull": "^4.0.3", + "ipull": "^3.9.2", "is-unicode-supported": "^2.1.0", "lifecycle-utils": "^2.0.0", "log-symbols": "^7.0.0",
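
A minimal usage sketch of the SWA support introduced in this patch series (illustrative only, not part of the patches). It assumes the `swaFullCache` option is exposed through `model.createContext()` and that `swaSize` is surfaced on `model.fileInsights`, as the diffs above suggest; the model path is a placeholder.

    import {getLlama} from "node-llama-cpp";

    const llama = await getLlama();
    const model = await llama.loadModel({modelPath: "./model.gguf"}); // placeholder path

    // On supported models, SWA (Sliding Window Attention) is used by default,
    // so only roughly the last `swaSize` positions are kept in the KV cache.
    // Setting `swaFullCache: true` opts out and keeps the full cache.
    const context = await model.createContext({
        contextSize: 8192,
        swaFullCache: false
    });

    const sequence = context.getSequence();

    if (model.fileInsights.swaSize != null && model.fileInsights.swaSize > 0)
        console.log("SWA window size:", model.fileInsights.swaSize);

    // With SWA active, token cells below roughly
    // `Math.max(0, sequence.nextTokenIndex - model.fileInsights.swaSize)`
    // may already have been evicted from the KV cache; the internal
    // `getSequenceKvCacheMinPosition(sequenceId)` binding added in these patches
    // reports the exact minimum position still present for a sequence.

Opting out via `swaFullCache` trades additional KV cache memory for the ability to keep reusing older cache positions, which is why the default leaves SWA enabled on models that support it.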