diff --git a/.vitepress/config.ts b/.vitepress/config.ts index f3b7fab2..e6161519 100644 --- a/.vitepress/config.ts +++ b/.vitepress/config.ts @@ -12,11 +12,13 @@ import {rehype} from "rehype"; import sharp from "sharp"; import {GitChangelog, GitChangelogMarkdownSection} from "@nolebase/vitepress-plugin-git-changelog/vite"; import {buildEndGenerateOpenGraphImages} from "@nolebase/vitepress-plugin-og-image/vitepress"; +import llmstxt from "vitepress-plugin-llms"; import {Resvg, initWasm as initResvgWasm, type ResvgRenderOptions} from "@resvg/resvg-wasm"; import {BlogPageInfoPlugin} from "./config/BlogPageInfoPlugin.js"; -import {getApiReferenceSidebar} from "./config/apiReferenceSidebar.js"; import {ensureLocalImage} from "./utils/ensureLocalImage.js"; import {getExcerptFromMarkdownFile} from "./utils/getExcerptFromMarkdownFile.js"; +import {getVitepressSidebar, getVitepressSidebarWithBlog} from "./config/sidebar.js"; +import {getBlogPosts} from "./config/getBlogPosts.js"; import type {Element as HastElement, Parent} from "hast"; import type {Node as UnistNode} from "unist"; @@ -365,6 +367,12 @@ export default defineConfig({ }) as VitepressPlugin, BlogPageInfoPlugin({ include: (id) => id.includes(path.sep + "blog" + path.sep) && !id.endsWith(path.sep + "blog" + path.sep + "index.md") + }), + llmstxt({ + ignoreFiles: ["index.md"], + domain: resolveHref("/test").slice(0, -"/test".length) || undefined, + excludeBlog: false, + sidebar: () => getVitepressSidebarWithBlog(true, false) }) ], build: { @@ -434,6 +442,9 @@ export default defineConfig({ }, { text: "GitHub Discussions", link: "https://github.com/withcatai/node-llama-cpp/discussions" + }, { + text: "Awesome List", + link: "/guide/awesome" }, { text: "Contribute", link: "/guide/contributing" @@ -469,100 +480,14 @@ export default defineConfig({ } } }, - sidebar: { - "/guide/": [{ - text: "Guide", - base: "/guide", - items: [ - {text: "Getting Started", link: "/"}, - {text: "Chat Session", link: "/chat-session"}, - {text: "Chat Wrapper", link: "/chat-wrapper"}, - {text: "Grammar", link: "/grammar"}, - {text: "Function Calling", link: "/function-calling"}, - {text: "Embedding", link: "/embedding"}, - {text: "Text Completion", link: "/text-completion"}, - {text: "Choosing a Model", link: "/choosing-a-model"}, - {text: "Downloading Models", link: "/downloading-models"} - ] - }, { - text: "Advanced", - base: "/guide", - items: [ - {text: "Building From Source", link: "/building-from-source"}, - {text: "Metal Support", link: "/Metal"}, - {text: "CUDA Support", link: "/CUDA"}, - {text: "Vulkan Support", link: "/Vulkan"}, - {text: "Electron Support", link: "/electron"}, - {text: "Using in Docker", link: "/docker"}, - {text: "Using Tokens", link: "/tokens"}, - {text: "LlamaText", link: "/llama-text"}, - {text: "External Chat State", link: "/external-chat-state"}, - {text: "Token Bias", link: "/token-bias"}, - {text: "Objects Lifecycle", link: "/objects-lifecycle"}, - {text: "Chat Context Shift", link: "/chat-context-shift"}, - {text: "Batching", link: "/batching"}, - {text: "Token Prediction", link: "/token-prediction"}, - {text: "Low Level API", link: "/low-level-api"}, - {text: "Awesome List", link: "/awesome"}, - {text: "Troubleshooting", link: "/troubleshooting"}, - {text: "Tips and Tricks", link: "/tips-and-tricks"} - ] - }, { - text: "Contributing", - base: "/guide", - items: [ - {text: "Setting Up a Dev Environment", link: "/development"}, - {text: "Pull Request Guidelines", link: "/contributing"} - ] - }], - - "/cli/": [{ - text: 
"CLI", - base: "/cli", - link: "/", - items: [ - {text: "Init", link: "/init"}, - {text: "Chat", link: "/chat"}, - {text: "Pull", link: "/pull"}, - { - text: "Source", - link: "/source", - collapsed: true, - items: [ - {text: "Download", link: "/source/download"}, - {text: "Build", link: "/source/build"}, - {text: "Clear", link: "/source/clear"} - ] - }, - {text: "Complete", link: "/complete"}, - {text: "Infill", link: "/infill"}, - { - text: "Inspect", - link: "/inspect", - collapsed: true, - items: [ - {text: "GPU", link: "/inspect/gpu"}, - {text: "GGUF", link: "/inspect/gguf"}, - {text: "Measure", link: "/inspect/measure"}, - {text: "Estimate", link: "/inspect/estimate"} - ] - } - ] - }], - - "/api/": getApiReferenceSidebar() - }, + sidebar: getVitepressSidebar(), socialLinks: [ {icon: "npm", link: "https://www.npmjs.com/package/node-llama-cpp"}, {icon: "github", link: "https://github.com/withcatai/node-llama-cpp"} ] }, async buildEnd(siteConfig) { - const blogPosts = await createContentLoader("blog/*.md", { - excerpt: true, - render: true - }) - .load(); + const blogPosts = await getBlogPosts(false); async function loadSvgFontBuffers() { const interFontFilesDirectoryPath = path.join(require.resolve("@fontsource/inter"), "..", "files"); @@ -699,24 +624,7 @@ export default defineConfig({ ...siteConfig.site, themeConfig: { ...siteConfig.site.themeConfig, - sidebar: { - ...siteConfig.site.themeConfig.sidebar, - "/_blog/": { - text: "Blog", - link: "/blog/", - items: blogPosts - .filter((post) => { - const hasCoverImage = typeof post.frontmatter?.image === "string" || - typeof post.frontmatter?.image?.url === "string"; - - return !hasCoverImage; - }) - .map((post) => ({ - text: post.frontmatter.title, - link: post.url - })) - } - } + sidebar: await getVitepressSidebarWithBlog(true, true) } } }); @@ -744,22 +652,6 @@ export default defineConfig({ hub: "https://pubsubhubbub.appspot.com/" }); - blogPosts.sort((a, b) => { - const aDate = a.frontmatter.date - ? new Date(a.frontmatter.date) - : null; - const bDate = b.frontmatter.date - ? 
new Date(b.frontmatter.date) - : null; - - if (aDate == null) - return -1; - if (bDate == null) - return 1; - - return bDate.getTime() - aDate.getTime(); - }); - for (const {url, frontmatter, html, src, excerpt: originalExcerpt} of blogPosts) { const ogImageElement = findElementInHtml(html, (element) => ( element.tagName === "meta" && (element.properties?.name === "og:image" || element.properties?.property === "og:image") @@ -819,12 +711,6 @@ export default defineConfig({ await addOgImages(); - const indexPageIndex = blogPosts.findIndex((post) => post.url === "/blog/"); - if (indexPageIndex < 0) - throw new Error("Blog index page not found"); - - blogPosts.splice(indexPageIndex, 1); - await addBlogRssFeed(); try { @@ -853,6 +739,11 @@ export default defineConfig({ path.join(siteConfig.outDir, "logo.preview.avif"), 24 ); + + await Promise.all([ + fs.copy(path.join(siteConfig.outDir, "llms.txt"), path.join(siteConfig.outDir, "llms.md")), + fs.copy(path.join(siteConfig.outDir, "llms-full.txt"), path.join(siteConfig.outDir, "llms-full.md")) + ]); } }); diff --git a/.vitepress/config/getBlogPosts.ts b/.vitepress/config/getBlogPosts.ts new file mode 100644 index 00000000..1d4cb6a5 --- /dev/null +++ b/.vitepress/config/getBlogPosts.ts @@ -0,0 +1,46 @@ +import {ContentData, createContentLoader} from "vitepress"; + +let blogPosts: ContentData[] | undefined = undefined; +export async function getBlogPosts(includeIndex: boolean = false) { + if (includeIndex) + return await _getBlogPosts(); + + const blogPosts = (await _getBlogPosts()).slice(); + + const indexPageIndex = blogPosts.findIndex((post) => post.url === "/blog/"); + if (indexPageIndex < 0) + throw new Error("Blog index page not found"); + + blogPosts.splice(indexPageIndex, 1); + + return blogPosts; +} + +async function _getBlogPosts() { + if (blogPosts != null) + return blogPosts; + + blogPosts = await createContentLoader("blog/*.md", { + excerpt: true, + render: true + }) + .load(); + + blogPosts.sort((a, b) => { + const aDate = a.frontmatter.date + ? new Date(a.frontmatter.date) + : null; + const bDate = b.frontmatter.date + ? 
new Date(b.frontmatter.date) + : null; + + if (aDate == null) + return -1; + if (bDate == null) + return 1; + + return bDate.getTime() - aDate.getTime(); + }); + + return blogPosts; +} diff --git a/.vitepress/config/sidebar.ts b/.vitepress/config/sidebar.ts new file mode 100644 index 00000000..b151a56c --- /dev/null +++ b/.vitepress/config/sidebar.ts @@ -0,0 +1,134 @@ +import {DefaultTheme} from "vitepress"; +import {getApiReferenceSidebar} from "./apiReferenceSidebar.js"; +import {getBlogPosts} from "./getBlogPosts.js"; + +const apiReferenceSidebar = getApiReferenceSidebar(); + +export function getVitepressSidebar(blog?: DefaultTheme.SidebarItem[]): DefaultTheme.Sidebar { + return { + "/guide/": [{ + text: "Guide", + base: "/guide", + items: [ + {text: "Getting Started", link: "/"}, + {text: "Chat Session", link: "/chat-session"}, + {text: "Chat Wrapper", link: "/chat-wrapper"}, + {text: "Grammar", link: "/grammar"}, + {text: "Function Calling", link: "/function-calling"}, + {text: "Embedding", link: "/embedding"}, + {text: "Text Completion", link: "/text-completion"}, + {text: "Choosing a Model", link: "/choosing-a-model"}, + {text: "Downloading Models", link: "/downloading-models"} + ] + }, { + text: "Advanced", + base: "/guide", + items: [ + {text: "Building From Source", link: "/building-from-source"}, + {text: "Metal Support", link: "/Metal"}, + {text: "CUDA Support", link: "/CUDA"}, + {text: "Vulkan Support", link: "/Vulkan"}, + {text: "Electron Support", link: "/electron"}, + {text: "Using in Docker", link: "/docker"}, + {text: "Using Tokens", link: "/tokens"}, + {text: "LlamaText", link: "/llama-text"}, + {text: "External Chat State", link: "/external-chat-state"}, + {text: "Token Bias", link: "/token-bias"}, + {text: "Objects Lifecycle", link: "/objects-lifecycle"}, + {text: "Chat Context Shift", link: "/chat-context-shift"}, + {text: "Batching", link: "/batching"}, + {text: "Token Prediction", link: "/token-prediction"}, + {text: "Low Level API", link: "/low-level-api"}, + {text: "Awesome List", link: "/awesome"}, + {text: "Troubleshooting", link: "/troubleshooting"}, + {text: "Tips and Tricks", link: "/tips-and-tricks"} + ] + }, { + text: "Contributing", + base: "/guide", + items: [ + {text: "Setting Up a Dev Environment", link: "/development"}, + {text: "Pull Request Guidelines", link: "/contributing"} + ] + }], + + ...( + blog != null + ? 
{ + "/_blog/": [{ + text: "Blog", + link: "/blog/", + items: blog + }] + } + : {} + ), + + "/cli/": [{ + text: "CLI", + base: "/cli", + link: "/", + items: [ + {text: "Init", link: "/init"}, + {text: "Chat", link: "/chat"}, + {text: "Pull", link: "/pull"}, + { + text: "Source", + link: "/source", + collapsed: true, + items: [ + {text: "Download", link: "/source/download"}, + {text: "Build", link: "/source/build"}, + {text: "Clear", link: "/source/clear"} + ] + }, + {text: "Complete", link: "/complete"}, + {text: "Infill", link: "/infill"}, + { + text: "Inspect", + link: "/inspect", + collapsed: true, + items: [ + {text: "GPU", link: "/inspect/gpu"}, + {text: "GGUF", link: "/inspect/gguf"}, + {text: "Measure", link: "/inspect/measure"}, + {text: "Estimate", link: "/inspect/estimate"} + ] + } + ] + }], + + "/api/": structuredClone(apiReferenceSidebar) + }; +} + +export async function getSidebarBlogPostItems( + includeIndex: boolean = false, + onlyItemsWithoutCoverImage: boolean = false +): Promise { + const blogPosts = await getBlogPosts(includeIndex); + + return blogPosts + .filter((post) => { + if (!onlyItemsWithoutCoverImage) + return true; + + const hasCoverImage = typeof post.frontmatter?.image === "string" || + typeof post.frontmatter?.image?.url === "string"; + + return !hasCoverImage; + }) + .map((post) => ({ + text: post.frontmatter.title, + link: post.url + })); +} + +export async function getVitepressSidebarWithBlog( + includeIndex: boolean = false, + onlyItemsWithoutCoverImage: boolean = false +) { + const blogItems = await getSidebarBlogPostItems(includeIndex, onlyItemsWithoutCoverImage); + + return getVitepressSidebar(blogItems); +} diff --git a/docs/guide/chat-session.md b/docs/guide/chat-session.md index 992a6487..a6a1a097 100644 --- a/docs/guide/chat-session.md +++ b/docs/guide/chat-session.md @@ -898,3 +898,58 @@ const fullResponse = a1.response console.log("Full response: " + fullResponse); ``` + +## Set Reasoning Budget {#reasoning-budget} +You can set a reasoning budget to limit the number of tokens a thinking model can spend on [thought segments](#stream-response-segments). 
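+
+As a minimal sketch (assuming `session` is an existing `LlamaChatSession`, as in the full example below), the budget is passed as part of the prompt options:
+```typescript
+// limit this prompt to at most 100 thought tokens
+const answer = await session.prompt("Where do llamas come from?", {
+    budgets: {
+        thoughtTokens: 100
+    }
+});
+
+console.log(answer);
+```
+
+The full example below streams the response and counts how many tokens were spent on thoughts versus the final response: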
+```typescript +import { + getLlama, LlamaChatSession, resolveModelFile, Token +} from "node-llama-cpp"; + +const modelPath = await resolveModelFile("hf:Qwen/Qwen3-14B-GGUF:Q4_K_M"); + +const llama = await getLlama(); +const model = await llama.loadModel({modelPath}); +const context = await model.createContext(); +const session = new LlamaChatSession({ + contextSequence: context.getSequence() +}); + + +const q1 = "Where do llamas come from?"; +console.log("User: " + q1); + +const maxThoughtTokens = 100; + +let responseTokens = 0; +let thoughtTokens = 0; + +process.stdout.write("AI: "); +const response = await session.prompt(q1, { + budgets: { + thoughtTokens: maxThoughtTokens + }, + onResponseChunk(chunk) { + const isThoughtSegment = chunk.type === "segment" && + chunk.segmentType === "thought"; + + if (chunk.type === "segment" && chunk.segmentStartTime != null) + process.stdout.write(` [segment start: ${chunk.segmentType}] `); + + process.stdout.write(chunk.text); + + if (chunk.type === "segment" && chunk.segmentEndTime != null) + process.stdout.write(` [segment end: ${chunk.segmentType}] `); + + if (isThoughtSegment) + thoughtTokens += chunk.tokens.length; + else + responseTokens += chunk.tokens.length; + } +}); + +console.log("Response: " + response); + +console.log("Response tokens: " + responseTokens); +console.log("Thought tokens: " + thoughtTokens); +``` diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 1f8a8726..574dd79f 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -393,6 +393,7 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap 1 && info[1].IsObject()) { Napi::Object options = info[1].As(); @@ -433,6 +434,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value()); } + + if (options.Has("swaFullCache")) { + context_params.swa_full = options.Get("swaFullCache").As().Value(); + } } } AddonContext::~AddonContext() { @@ -620,6 +625,32 @@ Napi::Value AddonContext::ShiftSequenceTokenCells(const Napi::CallbackInfo& info return info.Env().Undefined(); } +Napi::Value AddonContext::GetSequenceKvCacheMinPosition(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + int32_t sequenceId = info[0].As().Int32Value(); + + + const auto minPosition = llama_kv_self_seq_pos_min(ctx, sequenceId); + + return Napi::Number::New(info.Env(), minPosition); +} +Napi::Value AddonContext::GetSequenceKvCacheMaxPosition(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + int32_t sequenceId = info[0].As().Int32Value(); + + + const auto maxPosition = llama_kv_self_seq_pos_max(ctx, sequenceId); + + return Napi::Number::New(info.Env(), maxPosition); +} Napi::Value AddonContext::DecodeBatch(const Napi::CallbackInfo& info) { AddonContextDecodeBatchWorker* worker = new AddonContextDecodeBatchWorker(info.Env(), this); worker->Queue(); @@ -926,6 +957,8 @@ void AddonContext::init(Napi::Object exports) { InstanceMethod("disposeSequence", &AddonContext::DisposeSequence), InstanceMethod("removeTokenCellsFromSequence", &AddonContext::RemoveTokenCellsFromSequence), InstanceMethod("shiftSequenceTokenCells", &AddonContext::ShiftSequenceTokenCells), + InstanceMethod("getSequenceKvCacheMinPosition", &AddonContext::GetSequenceKvCacheMinPosition), + 
InstanceMethod("getSequenceKvCacheMaxPosition", &AddonContext::GetSequenceKvCacheMaxPosition), InstanceMethod("decodeBatch", &AddonContext::DecodeBatch), InstanceMethod("sampleToken", &AddonContext::SampleToken), InstanceMethod("getEmbedding", &AddonContext::GetEmbedding), diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h index 933ba8f0..7e661f12 100644 --- a/llama/addon/AddonContext.h +++ b/llama/addon/AddonContext.h @@ -36,6 +36,8 @@ class AddonContext : public Napi::ObjectWrap { Napi::Value DisposeSequence(const Napi::CallbackInfo& info); Napi::Value RemoveTokenCellsFromSequence(const Napi::CallbackInfo& info); Napi::Value ShiftSequenceTokenCells(const Napi::CallbackInfo& info); + Napi::Value GetSequenceKvCacheMinPosition(const Napi::CallbackInfo& info); + Napi::Value GetSequenceKvCacheMaxPosition(const Napi::CallbackInfo& info); Napi::Value DecodeBatch(const Napi::CallbackInfo& info); Napi::Value SampleToken(const Napi::CallbackInfo& info); diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index 943866c0..eef81c25 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -73,6 +73,19 @@ Napi::Value addonGetTypeSizeForGgmlType(const Napi::CallbackInfo& info) { return Napi::Number::New(info.Env(), typeSize); } +Napi::Value addonGetGgmlGraphOverheadCustom(const Napi::CallbackInfo& info) { + if (info.Length() < 2 || !info[0].IsNumber() || !info[1].IsBoolean()) { + return Napi::Number::New(info.Env(), 0); + } + + const size_t size = info[0].As().Uint32Value(); + const bool grads = info[1].As().Value(); + + const auto graphOverhead = ggml_graph_overhead_custom(size, grads); + + return Napi::Number::New(info.Env(), graphOverhead); +} + Napi::Value addonGetConsts(const Napi::CallbackInfo& info) { Napi::Object consts = Napi::Object::New(info.Env()); consts.Set("ggmlMaxDims", Napi::Number::New(info.Env(), GGML_MAX_DIMS)); @@ -231,6 +244,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { Napi::PropertyDescriptor::Function("getMathCores", addonGetMathCores), Napi::PropertyDescriptor::Function("getBlockSizeForGgmlType", addonGetBlockSizeForGgmlType), Napi::PropertyDescriptor::Function("getTypeSizeForGgmlType", addonGetTypeSizeForGgmlType), + Napi::PropertyDescriptor::Function("getGgmlGraphOverheadCustom", addonGetGgmlGraphOverheadCustom), Napi::PropertyDescriptor::Function("getConsts", addonGetConsts), Napi::PropertyDescriptor::Function("setLogger", setLogger), Napi::PropertyDescriptor::Function("setLoggerLogLevel", setLoggerLogLevel), diff --git a/package-lock.json b/package-lock.json index 72578ac6..2a79518a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -62,7 +62,7 @@ "@types/bytes": "^3.1.5", "@types/cross-spawn": "^6.0.6", "@types/fs-extra": "^11.0.4", - "@types/node": "^22.15.17", + "@types/node": "^20.17.50", "@types/proper-lockfile": "^4.1.4", "@types/semver": "^7.7.0", "@types/validate-npm-package-name": "^4.0.2", @@ -91,6 +91,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", + "vitepress-plugin-llms": "https://pkg.pr.new/vitepress-plugin-llms@51", "vitest": "^3.1.3", "zx": "^8.5.4" }, @@ -4420,13 +4421,13 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.15.17", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.17.tgz", - "integrity": "sha512-wIX2aSZL5FE+MR0JlvF87BNVrtFWf6AE6rxSE9X7OwnVvoyCQjpzSRJ+M87se/4QCkCiebQAqrJ0y6fwIyi7nw==", + "version": "20.17.50", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.17.50.tgz", + "integrity": 
"sha512-Mxiq0ULv/zo1OzOhwPqOA13I81CV/W3nvd3ChtQZRT5Cwz3cr0FKo/wMSsbTqL3EXpaBAEQhva2B8ByRkOIh9A==", "dev": true, "license": "MIT", "dependencies": { - "undici-types": "~6.21.0" + "undici-types": "~6.19.2" } }, "node_modules/@types/normalize-package-data": { @@ -5996,6 +5997,24 @@ "node": "*" } }, + "node_modules/byte-size": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/byte-size/-/byte-size-9.0.1.tgz", + "integrity": "sha512-YLe9x3rabBrcI0cueCdLS2l5ONUKywcRpTs02B8KP9/Cimhj7o3ZccGrPnRvcbyHMbb7W79/3MUJl7iGgTXKEw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.17" + }, + "peerDependencies": { + "@75lb/nature": "latest" + }, + "peerDependenciesMeta": { + "@75lb/nature": { + "optional": true + } + } + }, "node_modules/bytes": { "version": "3.1.2", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", @@ -7500,6 +7519,23 @@ "node": ">= 12.20.55" } }, + "node_modules/electron/node_modules/@types/node": { + "version": "22.15.21", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.21.tgz", + "integrity": "sha512-EV/37Td6c+MgKAbkcLG6vqZ2zEYHD7bvSrzqqs2RIhbA6w3x+Dqz8MZM3sP6kGTeLrdoOgKZe+Xja7tUB2DNkQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/electron/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, "node_modules/emoji-regex": { "version": "10.4.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.4.0.tgz", @@ -8945,6 +8981,20 @@ "reusify": "^1.0.4" } }, + "node_modules/fault": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/fault/-/fault-2.0.1.tgz", + "integrity": "sha512-WtySTkS4OKev5JtpHXnib4Gxiurzh5NCGvWrFaZ34m6JehfTUhKZvn9njTfw48t6JumVQOmrKqpmGcdwxnhqBQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "format": "^0.2.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/fd-slicer": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz", @@ -9238,6 +9288,15 @@ "node": ">= 6" } }, + "node_modules/format": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/format/-/format-0.2.2.tgz", + "integrity": "sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww==", + "dev": true, + "engines": { + "node": ">=0.4.x" + } + }, "node_modules/forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -11808,6 +11867,16 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/markdown-title": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/markdown-title/-/markdown-title-1.0.2.tgz", + "integrity": "sha512-MqIQVVkz+uGEHi3TsHx/czcxxCbRIL7sv5K5DnYw/tI+apY54IbPefV/cmgxp6LoJSEx/TqcHdLs/298afG5QQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/marked": { "version": "12.0.2", "resolved": "https://registry.npmjs.org/marked/-/marked-12.0.2.tgz", @@ -11922,6 +11991,38 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/mdast-util-frontmatter": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-frontmatter/-/mdast-util-frontmatter-2.0.1.tgz", + "integrity": 
"sha512-LRqI9+wdgC25P0URIJY9vwocIzCcksduHQ9OF2joxQoyTNVduwLAFUzjoopuRJbJAReaKrNQKAZKL3uCMugWJA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "escape-string-regexp": "^5.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "micromark-extension-frontmatter": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-frontmatter/node_modules/escape-string-regexp": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-5.0.0.tgz", + "integrity": "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/mdast-util-gfm": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", @@ -12242,6 +12343,23 @@ "micromark-util-types": "^2.0.0" } }, + "node_modules/micromark-extension-frontmatter": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-frontmatter/-/micromark-extension-frontmatter-2.0.0.tgz", + "integrity": "sha512-C4AkuM3dA58cgZha7zVnuVxBhDsbttIMiytjgsM2XbHAB2faRVaHRle40558FBN+DJcrLNCoqG5mlrpdU4cRtg==", + "dev": true, + "license": "MIT", + "dependencies": { + "fault": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/micromark-factory-destination": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", @@ -12661,6 +12779,19 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, + "node_modules/millify": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/millify/-/millify-6.1.0.tgz", + "integrity": "sha512-H/E3J6t+DQs/F2YgfDhxUVZz/dF8JXPPKTLHL/yHCcLZLtCXJDUaqvhJXQwqOVBvbyNn4T0WjLpIHd7PAw7fBA==", + "dev": true, + "license": "MIT", + "dependencies": { + "yargs": "^17.0.1" + }, + "bin": { + "millify": "bin/millify" + } + }, "node_modules/mime": { "version": "4.0.6", "resolved": "https://registry.npmjs.org/mime/-/mime-4.0.6.tgz", @@ -17122,6 +17253,73 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/remark": { + "version": "15.0.1", + "resolved": "https://registry.npmjs.org/remark/-/remark-15.0.1.tgz", + "integrity": "sha512-Eht5w30ruCXgFmxVUSlNWQ9iiimq07URKeFS3hNc8cUWy1llX4KDWfyEDZRycMc+znsN9Ux5/tJ/BFdgdOwA3A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-frontmatter": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/remark-frontmatter/-/remark-frontmatter-5.0.0.tgz", + "integrity": "sha512-XTFYvNASMe5iPN0719nPrdItC9aU0ssC4v14mH1BCi1u0n1gAocqcujWUrByftZTbLhRtiKRyjYTSIOcr69UVQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-frontmatter": "^2.0.0", + "micromark-extension-frontmatter": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": 
"opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-parse": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", + "integrity": "sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-from-markdown": "^2.0.0", + "micromark-util-types": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/remark-stringify": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/remark-stringify/-/remark-stringify-11.0.0.tgz", + "integrity": "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-to-markdown": "^2.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -19266,6 +19464,13 @@ "node": ">=0.6" } }, + "node_modules/tokenx": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/tokenx/-/tokenx-1.0.1.tgz", + "integrity": "sha512-MhOngUHRuVE0CHP4cNEZ/XpdXETFL65nJpEvoTW+VYPuXsT/MTeNj+UNnekNsnxecmj2DEvUYPebqz+CsPTUSg==", + "dev": true, + "license": "MIT" + }, "node_modules/totalist": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/totalist/-/totalist-3.0.1.tgz", @@ -19688,9 +19893,9 @@ "license": "MIT" }, "node_modules/undici-types": { - "version": "6.21.0", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", - "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "version": "6.19.8", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.19.8.tgz", + "integrity": "sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==", "dev": true, "license": "MIT" }, @@ -19781,6 +19986,22 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/unist-util-remove": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/unist-util-remove/-/unist-util-remove-4.0.0.tgz", + "integrity": "sha512-b4gokeGId57UVRX/eVKej5gXqGlc9+trkORhFJpu9raqZkZhU0zm8Doi05+HaiBsMEIJowL+2WtQ5ItjsngPXg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-is": "^6.0.0", + "unist-util-visit-parents": "^6.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/unist-util-stringify-position": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", @@ -20127,6 +20348,45 @@ } } }, + "node_modules/vitepress-plugin-llms": { + "version": "1.3.4", + "resolved": "https://pkg.pr.new/vitepress-plugin-llms@51", + "integrity": "sha512-FTyNYyx1jVbKae/raJLgDTgMaHSmY51B1nbokeC4KAhXMe413eGSexNIdvnCHXf9U1t92VlLajJ5S9E7adDoOQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "byte-size": "^9.0.1", + "gray-matter": "^4.0.3", + "markdown-title": "^1.0.2", + "millify": "^6.1.0", + "minimatch": "^10.0.1", + "picocolors": "^1.1.1", + "remark": "^15.0.1", + 
"remark-frontmatter": "^5.0.0", + "tokenx": "^1.0.0", + "unist-util-remove": "^4.0.0", + "unist-util-visit": "^5.0.0" + }, + "funding": { + "url": "https://github.com/okineadev/vitepress-plugin-llms?sponsor=1" + } + }, + "node_modules/vitepress-plugin-llms/node_modules/minimatch": { + "version": "10.0.1", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.0.1.tgz", + "integrity": "sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": "20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/vitepress/node_modules/@shikijs/core": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/@shikijs/core/-/core-2.2.0.tgz", diff --git a/package.json b/package.json index 79f9834f..cc455955 100644 --- a/package.json +++ b/package.json @@ -149,7 +149,7 @@ "@types/bytes": "^3.1.5", "@types/cross-spawn": "^6.0.6", "@types/fs-extra": "^11.0.4", - "@types/node": "^22.15.17", + "@types/node": "^20.17.50", "@types/proper-lockfile": "^4.1.4", "@types/semver": "^7.7.0", "@types/validate-npm-package-name": "^4.0.2", @@ -178,6 +178,7 @@ "typescript-eslint": "^8.32.0", "vite-node": "^3.1.3", "vitepress": "^1.6.3", + "vitepress-plugin-llms": "https://pkg.pr.new/vitepress-plugin-llms@51", "vitest": "^3.1.3", "zx": "^8.5.4" }, diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index b1f3ca0b..a2f06ae9 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -28,7 +28,8 @@ export type BindingModule = { embeddings?: boolean, ranking?: boolean, threads?: number, - performanceTracking?: boolean + performanceTracking?: boolean, + swaFullCache?: boolean }): AddonContext }, AddonGrammar: { @@ -54,6 +55,7 @@ export type BindingModule = { getMathCores(): number, getBlockSizeForGgmlType(ggmlType: number): number | undefined, getTypeSizeForGgmlType(ggmlType: number): number | undefined, + getGgmlGraphOverheadCustom(size: number, grads: boolean): number, getConsts(): { ggmlMaxDims: number, ggmlTypeF16Size: number, @@ -143,6 +145,8 @@ export type AddonContext = { // startPos in inclusive, endPos is exclusive shiftSequenceTokenCells(sequenceId: number, startPos: number, endPos: number, shiftDelta: number): void, + getSequenceKvCacheMinPosition(sequenceId: number): number, + getSequenceKvCacheMaxPosition(sequenceId: number): number, getEmbedding(inputTokensLength: number, maxVectorSize?: number): Float64Array, getStateSize(): number, getThreads(): number, diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index faf626b4..8ba71a22 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -365,6 +365,7 @@ export async function getLlama(options?: LlamaOptions | "lastBuild", lastBuildOp return getLlamaForOptions(options ?? 
{}); } +// internal export async function getLlamaForOptions({ gpu = defaultLlamaCppGpuSupport, logLevel = defaultLlamaCppLogLevel, diff --git a/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts b/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts index 57fc4ceb..30f434a0 100644 --- a/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts +++ b/src/chatWrappers/generic/utils/extractSegmentSettingsFromTokenizerAndChatTemplate.ts @@ -41,7 +41,8 @@ export function extractSegmentSettingsFromTokenizerAndChatTemplate( return removeUndefinedFields({ thought: tryMatchPrefixSuffixPair([ ["", ""], // DeepSeek, QwQ - ["", ""] // EXAONE Deep + ["", ""], // EXAONE Deep + ["<|START_THINKING|>", "<|END_THINKING|>"] // Command R7B ]) }); } diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 79a71c65..a23e58e5 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -45,6 +45,7 @@ type ChatCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], jsonSchemaGrammarFile?: string, @@ -61,6 +62,7 @@ type ChatCommand = { repeatFrequencyPenalty?: number, repeatPresencePenalty?: number, maxTokens: number, + reasoningBudget?: number, noHistory: boolean, environmentFunctions: boolean, tokenPredictionDraftModel?: string, @@ -162,6 +164,12 @@ export const ChatCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("noTrimWhitespace", { type: "boolean", alias: ["noTrim"], @@ -255,6 +263,13 @@ export const ChatCommand: CommandModule = { default: 0, description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size" }) + .option("reasoningBudget", { + alias: ["tb", "thinkingBudget", "thoughtsBudget"], + type: "number", + default: -1, + defaultDescription: "Unlimited", + description: "Maximum number of tokens the model can use for thoughts. 
Set to `0` to disable reasoning" + }) .option("noHistory", { alias: "nh", type: "boolean", @@ -308,19 +323,20 @@ export const ChatCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, - promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, + promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }) { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, + batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, + temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, - maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, - timing, noMmap, printTimings + maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, + debug, meter, timing, noMmap, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -333,13 +349,15 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, flashAttention, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, + contextSize, batchSize, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, + jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, - repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, tokenPredictionDraftModel, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; + if (reasoningBudget === -1) reasoningBudget = undefined; const headers = resolveHeaderFlag(headerArg); const trimWhitespace = !noTrimWhitespace; @@ -363,11 +381,13 @@ async function RunChat({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -413,6 +433,7 @@ async function RunChat({ ? 
{fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -446,6 +467,7 @@ async function RunChat({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); @@ -673,6 +695,9 @@ async function RunChat({ seed: seed ?? undefined, signal: abortController.signal, stopOnAbortSignal: true, + budgets: { + thoughtTokens: reasoningBudget + }, repeatPenalty: { penalty: repeatPenalty, frequencyPenalty: repeatFrequencyPenalty != null ? repeatFrequencyPenalty : undefined, diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index f8c7790e..1aae93fd 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -32,6 +32,7 @@ type CompleteCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, threads?: number, temperature: number, minP: number, @@ -119,6 +120,12 @@ export const CompleteCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -235,14 +242,14 @@ export const CompleteCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - flashAttention, threads, temperature, minP, topK, + flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }) { try { await RunCompletion({ - modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -257,7 +264,7 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, + modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings @@ -286,11 +293,13 @@ async function RunCompletion({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null 
&& tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -329,6 +338,7 @@ async function RunCompletion({ ? {fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -362,6 +372,7 @@ async function RunCompletion({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/DebugCommand.ts b/src/cli/commands/DebugCommand.ts index 149de90d..d2ee7117 100644 --- a/src/cli/commands/DebugCommand.ts +++ b/src/cli/commands/DebugCommand.ts @@ -65,5 +65,6 @@ async function DebugCmakeOptionsFunction() { console.info(); console.info(`${chalk.yellow("CMake options:")} ${prettyPrintObject(llama.cmakeOptions)}`); + console.info(`${chalk.yellow("Release:")} ${prettyPrintObject(llama.llamaCppRelease)}`); } diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 7a4a536b..a47df068 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -34,6 +34,7 @@ type InfillCommand = { contextSize?: number, batchSize?: number, flashAttention?: boolean, + swaFullCache?: boolean, threads?: number, temperature: number, minP: number, @@ -129,6 +130,12 @@ export const InfillCommand: CommandModule = { default: false, description: "Enable flash attention" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -245,7 +252,7 @@ export const InfillCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - flashAttention, threads, temperature, minP, topK, + flashAttention, swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings @@ -253,7 +260,7 @@ export const InfillCommand: CommandModule = { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, + swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings }); @@ -268,7 +275,7 @@ export const InfillCommand: CommandModule = { async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - threads, temperature, minP, topK, topP, seed, gpuLayers, + swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, 
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings }: InfillCommand) { @@ -296,11 +303,13 @@ async function RunInfill({ const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, + swaFullCache, useMmap }); const resolvedDraftModelPath = (tokenPredictionDraftModel != null && tokenPredictionDraftModel !== "") ? await resolveCommandGgufPath(tokenPredictionDraftModel, llama, headers, { flashAttention, + swaFullCache, useMmap, consoleTitle: "Draft model file" }) @@ -353,6 +362,7 @@ async function RunInfill({ ? {fitContext: {contextSize}} : undefined, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { @@ -386,6 +396,7 @@ async function RunInfill({ return await llama.loadModel({ modelPath: resolvedDraftModelPath, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, useMmap, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index db34de6d..ffd5f65e 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -32,7 +32,8 @@ type InspectEstimateCommand = { gpuLayers?: number | "max", contextSize?: number | "train", embedding?: boolean, - noMmap?: boolean + noMmap?: boolean, + swaFullCache?: boolean }; export const InspectEstimateCommand: CommandModule = { @@ -115,10 +116,16 @@ export const InspectEstimateCommand: CommandModule default: false, description: "Enable flash attention for the context" }) + .option("swaFullCache", { + alias: "noSwa", + type: "boolean", + default: false, + description: "Disable SWA (Sliding Window Attention) on supported models" + }) .option("measures", { alias: "n", type: "number", @@ -140,8 +147,8 @@ export const InspectMeasureCommand: CommandModule }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, - memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText + modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, swaFullCache, + measures = 10, memory: measureMemoryType, noMmap, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; if (maxContextSize === -1) maxContextSize = undefined; @@ -162,7 +169,7 @@ export const InspectMeasureCommand: CommandModule const useMmap = !noMmap && llama.supportsMmap; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, { - flashAttention, useMmap + flashAttention, swaFullCache, useMmap }); console.info(`${chalk.yellow("File:")} ${getReadablePath(resolvedGgufPath)}`); @@ -216,6 +223,7 @@ export const InspectMeasureCommand: CommandModule maxContextSize, minContextSize, flashAttention, + swaFullCache, tests: measures, evaluateText: evaluateText == null ? 
undefined @@ -286,7 +294,8 @@ export const InspectMeasureCommand: CommandModule : ggufInsights.estimateContextResourceRequirements({ contextSize: previousContextSizeCheck, modelGpuLayers: lastGpuLayers, - flashAttention + flashAttention, + swaFullCache }); const contextVramEstimation = contextResourceEstimation?.gpuVram; @@ -496,7 +505,7 @@ const expectedFileName = "InspectMeasureCommand"; async function measureModel({ modelPath, useMmap, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, - evaluateText, exitAfterMeasurement = false, onInfo + swaFullCache, evaluateText, exitAfterMeasurement = false, onInfo }: { modelPath: string, useMmap?: boolean, @@ -508,6 +517,7 @@ async function measureModel({ maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + swaFullCache?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean, onInfo(data: { @@ -615,6 +625,7 @@ async function measureModel({ maxGpuLayers, minGpuLayers, flashAttention, + swaFullCache, evaluateText, exitAfterMeasurement } satisfies ParentToChildMessage); @@ -716,11 +727,12 @@ async function runTestWorkerLogic() { } async function testContextSizes({ - model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText, - exitAfterMeasurement = false + model, modelVramUsage, modelRamUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, swaFullCache, + evaluateText, exitAfterMeasurement = false }: { model: LlamaModel, modelVramUsage: number, modelRamUsage: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, tests: number, flashAttention?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean + minContextSize?: number, tests: number, flashAttention?: boolean, swaFullCache?: boolean, evaluateText?: string, + exitAfterMeasurement?: boolean }) { let measurementsDone: number = 0; const contextSizeCheckPlan = getContextSizesCheckPlan( @@ -750,6 +762,7 @@ async function runTestWorkerLogic() { ), ignoreMemorySafetyChecks: currentContextSizeCheck != null, flashAttention, + swaFullCache, failedCreationRemedy: false }); @@ -803,11 +816,11 @@ async function runTestWorkerLogic() { } async function testWithGpuLayers({ - modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText, + modelPath, useMmap, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, swaFullCache, evaluateText, exitAfterMeasurement = false }: { modelPath: string, useMmap?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, - minContextSize?: number, flashAttention?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean + minContextSize?: number, flashAttention?: boolean, swaFullCache?: boolean, evaluateText?: string, exitAfterMeasurement?: boolean }) { try { const preModelVramUsage = (await llama.getVramState()).used; @@ -817,6 +830,7 @@ async function runTestWorkerLogic() { useMmap, gpuLayers, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: true }); const postModelVramUsage = (await llama.getVramState()).used; @@ -839,6 +853,7 @@ async function runTestWorkerLogic() { maxContextSize, minContextSize, flashAttention, + swaFullCache, tests, evaluateText, exitAfterMeasurement @@ -887,6 +902,7 @@ async function runTestWorkerLogic() { maxContextSize: 
message.maxContextSize, minContextSize: message.minContextSize, flashAttention: message.flashAttention, + swaFullCache: message.swaFullCache, evaluateText: message.evaluateText, exitAfterMeasurement: message.exitAfterMeasurement }); @@ -976,6 +992,7 @@ type ParentToChildMessage = { maxGpuLayers: number, minGpuLayers?: number, flashAttention?: boolean, + swaFullCache?: boolean, initialMaxContextSize?: number, maxContextSize?: number, minContextSize?: number, diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index 7ceb9773..8238daec 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -60,6 +60,7 @@ export async function interactivelyAskForModel({ allowLocalModels = true, downloadIntent = true, flashAttention = false, + swaFullCache = false, useMmap }: { llama: Llama, @@ -67,6 +68,7 @@ export async function interactivelyAskForModel({ allowLocalModels?: boolean, downloadIntent?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, useMmap?: boolean }): Promise { let localModelFileOptions: (ModelOption & {type: "localModel"})[] = []; @@ -120,6 +122,7 @@ export async function interactivelyAskForModel({ const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, + swaFullCache, useMmap }); @@ -292,7 +295,9 @@ export async function interactivelyAskForModel({ }, items: options, renderItem(item, focused, rerender) { - return renderSelectionItem(item, focused, rerender, activeInteractionController.signal, llama, flashAttention, useMmap); + return renderSelectionItem( + item, focused, rerender, activeInteractionController.signal, llama, flashAttention, swaFullCache, useMmap + ); }, canFocusItem(item) { return item.type === "recommendedModel" || item.type === "localModel" || item.type === "action"; @@ -408,7 +413,7 @@ async function askForModelUriOrPath(allowLocalModels: boolean): Promise void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean, - useMmap?: boolean + swaFullCache: boolean, useMmap?: boolean ) { if (item.type === "localModel") { let modelText = item.title instanceof Function @@ -435,6 +440,7 @@ function renderSelectionItem( rerenderOption: rerender, llama, flashAttention, + swaFullCache, useMmap }); } @@ -557,13 +563,14 @@ function renderRecommendedModelTechnicalInfo( } async function selectFileForModelRecommendation({ - recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, useMmap + recommendedModelOption, llama, abortSignal, rerenderOption, flashAttention, swaFullCache, useMmap }: { recommendedModelOption: ModelOption & {type: "recommendedModel"}, llama: Llama, abortSignal: AbortSignal, rerenderOption(): void, flashAttention: boolean, + swaFullCache: boolean, useMmap?: boolean }) { try { @@ -586,6 +593,7 @@ async function selectFileForModelRecommendation({ const compatibilityScore = await ggufInsights.configurationResolver.scoreModelConfigurationCompatibility({ flashAttention, + swaFullCache, useMmap }); diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts index 7b04b0ce..219d1808 100644 --- a/src/cli/utils/resolveCommandGgufPath.ts +++ b/src/cli/utils/resolveCommandGgufPath.ts @@ -13,9 +13,9 @@ import {getReadablePath} from "./getReadablePath.js"; import {interactivelyAskForModel} from "./interactivelyAskForModel.js"; export async function resolveCommandGgufPath(ggufPath: string | 
undefined, llama: Llama, fetchHeaders?: Record, { - targetDirectory = cliModelsDirectory, flashAttention = false, useMmap, consoleTitle = "File" + targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File" }: { - targetDirectory?: string, flashAttention?: boolean, useMmap?: boolean, consoleTitle?: string + targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string } = {}) { if (ggufPath == null) ggufPath = await interactivelyAskForModel({ @@ -24,6 +24,7 @@ export async function resolveCommandGgufPath(ggufPath: string | undefined, llama allowLocalModels: true, downloadIntent: true, flashAttention, + swaFullCache, useMmap }); diff --git a/src/evaluator/LlamaChat/LlamaChat.ts b/src/evaluator/LlamaChat/LlamaChat.ts index da15b1c0..77a171d9 100644 --- a/src/evaluator/LlamaChat/LlamaChat.ts +++ b/src/evaluator/LlamaChat/LlamaChat.ts @@ -294,7 +294,26 @@ export type LLamaChatGenerateResponseOptions void + onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void, + + /** + * Set the maximum number of tokens the model is allowed to spend on various segmented responses. + */ + budgets?: { + /** + * Whether to include the tokens already consumed by the current model response being completed in the budget. + * + * Defaults to `true`. + */ + includeCurrentResponse?: boolean, + + /** + * Budget for thought tokens. + * + * Defaults to `Infinity`. + */ + thoughtTokens?: number + } } & ({ grammar?: LlamaGrammar, functions?: never, @@ -515,6 +534,7 @@ export class LlamaChat { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -552,6 +572,7 @@ export class LlamaChat { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal, maxTokens, @@ -595,6 +616,7 @@ export class LlamaChat { ); }; const loadContextWindowForFunctionCallingLoop = async () => loadContextWindow(true); + const loadContextWindowForBudgetTriggers = async () => loadContextWindow(false); while (true) { generateResponseState.startTokenLoop(); @@ -657,6 +679,15 @@ export class LlamaChat { if (maxTokensTriggerRes != null) return maxTokensTriggerRes; + if (generateResponseState.updateShouldContextShift()) + break; + + if (await generateResponseState.handleBudgetTriggers()) { + await loadContextWindowForBudgetTriggers(); + await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); + await generateResponseState.createNewEvaluationIterator(); + } + if (generateResponseState.updateShouldContextShift()) break; @@ -797,6 +828,17 @@ export class LlamaChat { StopGenerationDetector.resolveLlamaTextTrigger(userTextSuffix, this.model.tokenizer) ); + allSegmentTypes + .map((segmentType) => getChatWrapperSegmentDefinition(this._chatWrapper.settings, segmentType)) + .filter((segmentDefinition) => segmentDefinition != null) + .flatMap((segmentDefinition) => [segmentDefinition?.prefix, segmentDefinition?.suffix]) + .filter((trigger) => trigger != null) + .forEach((trigger) => ( + generateResponseState.stopGenerationDetector.addStopTrigger( + StopGenerationDetector.resolveLlamaTextTrigger(LlamaText(trigger), this.model.tokenizer) + ) + )); + await generateResponseState.alignCurrentSequenceStateWithCurrentTokens(); if (generateResponseState.maxTokens === 0) { @@ -827,7 +869,15 @@ export class LlamaChat { generateResponseState.popStreamRegulatorFreeTokens(); - const stopGenerationTriggerRes = 
generateResponseState.handleStopGenerationTrigger("user"); + const someOfCurrentTokensAreSpecial = generateResponseState.currentTokens.some((token) => ( + this.model.isSpecialToken(token) + )); + const stopGenerationTriggerRes = generateResponseState.handleStopGenerationTrigger( + "user", + someOfCurrentTokensAreSpecial + ? "eogToken" + : undefined + ); if (stopGenerationTriggerRes != null) return { completion: stopGenerationTriggerRes.response, @@ -1251,10 +1301,9 @@ function generateContextTextThatEndsWithUserText( ...options, chatHistory: setLastUserTextInChatHistory(options.chatHistory, lastUserText + randomId) }); - let newContextText = contextText; - for (let i = 0; i < newContextText.values.length; i++) { - const item = newContextText.values[i]; + for (let i = 0; i < contextText.values.length; i++) { + const item = contextText.values[i]; if (typeof item !== "string") continue; @@ -1263,15 +1312,14 @@ function generateContextTextThatEndsWithUserText( continue; const newValue = item.slice(0, randomTextIndex); - newContextText = LlamaText([ - ...newContextText.values.slice(0, i), - newValue - ]); return { - contextText: newContextText, + contextText: LlamaText([ + ...contextText.values.slice(0, i), + newValue + ]), userTextSuffix: LlamaText([ item.slice(randomTextIndex + randomId.length), - ...newContextText.values.slice(i + 1) + ...contextText.values.slice(i + 1) ]), ...rest }; @@ -1485,6 +1533,7 @@ class GenerateResponseState["onToken"]; private readonly onResponseChunk: LLamaChatGenerateResponseOptions["onResponseChunk"]; private readonly onFunctionCallParamsChunk: LLamaChatGenerateResponseOptions["onFunctionCallParamsChunk"]; + private readonly budgets: LLamaChatGenerateResponseOptions["budgets"]; private readonly signal: LLamaChatGenerateResponseOptions["signal"]; private readonly stopOnAbortSignal: LLamaChatGenerateResponseOptions["stopOnAbortSignal"]; public readonly maxTokens: LLamaChatGenerateResponseOptions["maxTokens"]; @@ -1584,6 +1633,7 @@ class GenerateResponseState budget != null && budget !== Infinity; + + const hasBudgetTriggers = this.budgets != null && hasBudget(this.budgets.thoughtTokens); + if (!hasBudgetTriggers) + return shouldReloadEvaluationState; + + if (hasBudget(this.budgets.thoughtTokens) && this.segmentHandler.isSegmentTypeOpen("thought")) { + const usedThoughtTokens = this.segmentHandler.getSegmentTokensCount("thought"); + if (usedThoughtTokens >= this.budgets.thoughtTokens) { + this.segmentHandler.closeSegment("thought"); + shouldReloadEvaluationState = true; + } + } + + return shouldReloadEvaluationState; + } + public updateShouldContextShift() { this.shouldContextShift = this.llamaChat.sequence.nextTokenIndex >= this.llamaChat.context.contextSize - 1; return this.shouldContextShift; @@ -2946,6 +3019,7 @@ class SegmentHandler[] = []; private readonly _segmentsStartTokenTrail: Token[] = []; + private readonly _segmentTokenCounts: Map; private readonly _contextWindowSegments: RawSegment[] = []; private readonly _contextWindowStartTokenTrail: Token[] = []; private readonly _initialTokensTrail: Token[]; @@ -2958,7 +3032,7 @@ class SegmentHandler, closeAllSegments?: string | LlamaText, initialSegmentStack: S[], + initialTokenCounts: Map, previousTokens: Token[] }) { this.model = model; @@ -2990,6 +3065,7 @@ class SegmentHandler(); + + for (const item of modelResponse) { + if (typeof item === "string") { + segmentTokenCounts.set( + undefined, + (segmentTokenCounts.get(undefined) ?? 
0) + tokenizer(item, false, "trimLeadingSpace").length + ); + continue; + } else if (isChatModelResponseFunctionCall(item)) + continue; + + void (item.type satisfies "segment"); + + segmentTokenCounts.set( + item.segmentType, + (segmentTokenCounts.get(item.segmentType) ?? 0) + tokenizer(item.text, false, "trimLeadingSpace").length + ); + } + + return segmentTokenCounts; + } } diff --git a/src/evaluator/LlamaChatSession/LlamaChatSession.ts b/src/evaluator/LlamaChatSession/LlamaChatSession.ts index cb64518d..f0a0ba77 100644 --- a/src/evaluator/LlamaChatSession/LlamaChatSession.ts +++ b/src/evaluator/LlamaChatSession/LlamaChatSession.ts @@ -209,7 +209,19 @@ export type LLamaChatPromptOptions void + onFunctionCallParamsChunk?: (chunk: LlamaChatResponseFunctionCallParamsChunk) => void, + + /** + * Set the maximum number of tokens that the model is allowed to spend on various segmented responses. + */ + budgets?: { + /** + * Budget for thought tokens. + * + * Defaults to `Infinity`. + */ + thoughtTokens?: number + } } & ({ grammar?: LlamaGrammar, functions?: never, @@ -445,6 +457,7 @@ export class LlamaChatSession { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -469,7 +482,7 @@ export class LlamaChatSession { maxParallelFunctionCalls: maxParallelFunctionCalls as undefined, onFunctionCallParamsChunk: onFunctionCallParamsChunk as undefined, - onTextChunk, onToken, onResponseChunk, signal, stopOnAbortSignal, maxTokens, + onTextChunk, onToken, onResponseChunk, budgets, signal, stopOnAbortSignal, maxTokens, temperature, minP, topK, topP, seed, trimWhitespaceSuffix, responsePrefix, repeatPenalty, tokenBias, customStopTriggers }); @@ -489,6 +502,7 @@ export class LlamaChatSession { onToken, onResponseChunk, onFunctionCallParamsChunk, + budgets, signal, stopOnAbortSignal = false, maxTokens, @@ -589,6 +603,10 @@ export class LlamaChatSession { paramsChunk: chunk.paramsChunk, done: chunk.done })), + budgets: { + includeCurrentResponse: true, + thoughtTokens: budgets?.thoughtTokens + }, signal: abortController.signal, stopOnAbortSignal, repeatPenalty, diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index e5797a4f..8a5cff98 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -53,6 +53,7 @@ export class LlamaContext { /** @internal */ private readonly _totalSequences: number; /** @internal */ private readonly _unusedSequenceIds: number[] = []; /** @internal */ private readonly _batchingOptions: Required; + /** @internal */ private readonly _swaFullCache: boolean = false; /** @internal */ private readonly _queuedDecodeSequenceIds = new Set(); /** @internal */ private readonly _queuedDecodes: InternalQueuedDecode[] = []; /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); @@ -84,6 +85,7 @@ export class LlamaContext { dispatchSchedule: batchingDispatchSchedule = "nextCycle", itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" } = {}, + swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false, _embeddings, _ranking @@ -120,15 +122,21 @@ export class LlamaContext { : this._llama._threadsSplitter.normalizeThreadsValue(threads?.min ?? 
1) ); this._performanceTracking = !!performanceTracking; + this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells - batchSize: this._batchSize, + batchSize: this._batchSize + ( + (!this._swaFullCache && this.model.fileInsights.swaSize != null && this.model.fileInsights.swaSize > 0) + ? 1 // +1 to handle edge cases with SWA KV cache + : 0 + ), sequences: this._totalSequences, flashAttention: this._flashAttention, threads: this._idealThreads, embeddings: _embeddings, ranking: _ranking, - performanceTracking: this._performanceTracking + performanceTracking: this._performanceTracking, + swaFullCache: this._swaFullCache })); this._batchingOptions = { dispatchSchedule: batchingDispatchSchedule, @@ -783,6 +791,7 @@ export class LlamaContext { const flashAttention = _model.flashAttentionSupported ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) : false; + const swaFullCache = options.swaFullCache ?? _model.defaultContextSwaFullCache; const loraOptions = typeof options.lora === "string" ? {adapters: [{filePath: options.lora}]} satisfies LlamaContextOptions["lora"] : options.lora satisfies LlamaContextOptions["lora"]; @@ -799,6 +808,7 @@ export class LlamaContext { modelGpuLayers: _model.gpuLayers, modelTrainContextSize: _model.trainContextSize, flashAttention, + swaFullCache, getVramState: () => _model._llama._vramOrchestrator.getMemoryState(), llamaGpu: _model._llama.gpu, ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks, @@ -821,10 +831,11 @@ export class LlamaContext { isEmbeddingContext: options._embeddings, modelGpuLayers: _model.gpuLayers, batchSize, - flashAttention + flashAttention, + swaFullCache }); - const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention}); + const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention, swaFullCache}); const contextCreationVramReservation = options.ignoreMemorySafetyChecks ? null : _model._llama._vramOrchestrator.reserveMemory(resourceRequirementsEstimation.gpuVram); @@ -1035,6 +1046,31 @@ export class LlamaContextSequence { return this._tokenPredictor; } + /** + * Get the index of the first token in the KV cache. + * + * If you remove any tokens from the state that come before this index, + * no cached prefix tokens evaluation state will be used for the next evaluation. + * + * For example, if `stateCellsStartIndex` is `10` and you remove the range `{start: 11, end: 16}` + * then the cached state for range `0-10` will be used in the next evaluation, + * but if you remove the range `{start: 10, end: 16}` (or `{start: 9, end: 16}`) then the cached state will not be used at all + * and will be re-evaluated in the next evaluation. + * + * This index can be greater than `0` only when SWA (Sliding Window Attention) is used (only on supported models). + * + * When SWA is used, this index will usually be `Math.max(-1, .nextTokenIndex - .model.fileInsights.swaSize)` or larger. + * + * When the KV cache is empty, this index will be `-1`. + * + * You can disable SWA by setting the `swaFullCache` option to `true` when creating a context. + */ + public get stateCellsStartIndex() { + this._ensureNotDisposed(); + + return this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId); + } + /** * Statistics of token predictions using the sequence's `tokenPredictor`. 
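For reference, a minimal usage sketch (illustrative only, not part of this diff) of the new `swaFullCache` context option and the `stateCellsStartIndex` getter introduced above; the model path is a placeholder:

import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "model.gguf"}); // placeholder path

// extend the sliding window to the full context size on SWA models,
// so any prefix of the sequence state can be reused between evaluations
const context = await model.createContext({swaFullCache: true});
const sequence = context.getSequence();

// -1 while the KV cache is empty; with `swaFullCache` enabled this is expected
// to stay at the start of the state instead of trailing the sliding window
console.log("first cached cell:", sequence.stateCellsStartIndex);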
* @@ -1177,6 +1213,8 @@ export class LlamaContextSequence { ) { this._ensureNotDisposed(); + let awaitPromise: Promise | undefined; + await withLock(this._context, "context", async () => { this._ensureNotDisposed(); @@ -1218,6 +1256,13 @@ export class LlamaContextSequence { return ranges; }, [] as ContextTokensDeleteRange[]); + const minKvCachePosition = (this._contextTokens.length === 0 && this._loadedTokenPredictions.length === 0) + ? 0 + : Math.max(0, this._context._ctx.getSequenceKvCacheMinPosition(this._sequenceId)); + if (resolvedRanges[0] != null && resolvedRanges[0].start <= minKvCachePosition) + // we have to drop the cache and reevaluate the sequence due to missing KV cache + deletionSuccessful = false; + const tokenPredictionsToRemove = (resolvedRanges.length > 0 && canRemovePredictionTokens) ? this._loadedTokenPredictions.length : 0; @@ -1273,8 +1318,12 @@ export class LlamaContextSequence { this._nextTokenIndex = 0; this._context._ctx.disposeSequence(this._sequenceId); - await this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock}); + // wait for the evaluation outside the "context" lock to avoid deadlocks + awaitPromise = this.evaluateWithoutGeneratingNewTokens(newSequenceTokens, {_skipLock: skipLock}); }); + + if (awaitPromise != null) + await awaitPromise; } /** @@ -1578,12 +1627,13 @@ export class LlamaContextSequence { } } + /* eslint-disable @stylistic/max-len */ /** * Save the current context sequence evaluation state to a file. - * @see [Saving and restoring a context sequence evaluation state - * ](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) + * @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) */ public async saveStateToFile(filePath: string) { + /* eslint-enable @stylistic/max-len */ this._ensureNotDisposed(); const resolvedPath = path.resolve(process.cwd(), filePath); @@ -1606,14 +1656,14 @@ export class LlamaContextSequence { } } + /* eslint-disable @stylistic/max-len */ /** * Load a context sequence evaluation state from a file. * * Trying to load a state file with a longer context size than the current sequence's context size will fail and throw an error. * * You must ensure that the file was created from the exact same model, otherwise, using this function may crash the process. - * @see [Saving and restoring a context sequence evaluation state - * ](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) + * @see [Saving and restoring a context sequence evaluation state](https://node-llama-cpp.withcat.ai/guide/chat-session#save-and-restore-with-context-sequence-state) */ public async loadStateFromFile(filePath: string, acceptRisk: { /** @@ -1623,6 +1673,7 @@ export class LlamaContextSequence { */ acceptRisk: true }) { + /* eslint-enable @stylistic/max-len */ if (!acceptRisk.acceptRisk) throw new Error("The `acceptRisk` option must be set to `true` to use this feature"); diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 16d17bce..52a18bf9 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -99,6 +99,22 @@ export type LlamaContextOptions = { */ batching?: BatchingOptions, + /** + * When using SWA (Sliding Window Attention) on a supported model, + * extend the sliding window size to the current context size (meaning practically disabling SWA). 
+ * + * Enabling this option will consume more memory on models that support SWA (Sliding Window Attention), + * but will allow reusing the evaluation cache of any prefix length of the context sequence state + * (instead of just the size of the sliding window when SWA is used). + * + * This option has no effect on models that do not support SWA (Sliding Window Attention). + * + * > **Note:** you can check the SWA size using `model.fileInsights.swaSize`. + * + * Defaults to `false` (inherited from the model option `defaultContextSwaFullCache`); + */ + swaFullCache?: boolean, + /** * Load the provided LoRA adapters onto the context. * LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 0be0bddc..f53ab21a 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -111,6 +111,17 @@ export type LlamaModelOptions = { */ defaultContextFlashAttention?: boolean, + /** + * When using SWA (Sliding Window Attention) on a supported model, + * extend the sliding window size to the current context size (meaning practically disabling SWA) + * by default for contexts created with this model. + * + * See the `swaFullCache` option of the `.createContext()` method for more information. + * + * Defaults to `false`. + */ + defaultContextSwaFullCache?: boolean, + /** * Called with the load percentage when the model is being loaded. * @param loadProgress - a number between 0 (exclusive) and 1 (inclusive). @@ -140,6 +151,7 @@ export type LlamaModelOptions = { const defaultUseMmap = true; const defaultContextFlashAttentionEnabled = false; +const defaultContextSwaFullCache = false; export class LlamaModel { /** @internal */ public readonly _llama: Llama; @@ -157,6 +169,7 @@ export class LlamaModel { /** @internal */ private readonly _llamaPreventDisposalHandle: DisposalPreventionHandle; /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; /** @internal */ private readonly _defaultContextFlashAttention: boolean; + /** @internal */ private readonly _defaultContextSwaFullCache: boolean; /** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private readonly _loraAdapters = new Map(); /** @internal */ private _typeDescription?: ModelTypeDescription; @@ -177,6 +190,7 @@ export class LlamaModel { _fileInsights, _defaultContextFlashAttentionOptionEnabled, _defaultContextFlashAttention, + _defaultContextSwaFullCache, _flashAttentionSupported }: { _llama: Llama, @@ -184,6 +198,7 @@ export class LlamaModel { _fileInsights: GgufInsights, _defaultContextFlashAttentionOptionEnabled: boolean, _defaultContextFlashAttention: boolean, + _defaultContextSwaFullCache: boolean, _flashAttentionSupported: boolean }) { this._llama = _llama; @@ -196,6 +211,7 @@ export class LlamaModel { this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle(); this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled; this._defaultContextFlashAttention = _defaultContextFlashAttention; + this._defaultContextSwaFullCache = _defaultContextSwaFullCache; this._flashAttentionSupported = _flashAttentionSupported; const overridesList = ggufMetadataOverridesToList(metadataOverrides); this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({ @@ -321,6 +337,10 @@ export class LlamaModel { return 
this._defaultContextFlashAttention; } + public get defaultContextSwaFullCache() { + return this._defaultContextSwaFullCache; + } + /** * Transform text into tokens that can be fed to the model * @param text - the text to tokenize @@ -700,9 +720,11 @@ export class LlamaModel { const resolvedDefaultContextFlashAttention = flashAttentionSupported ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) : false; + const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, defaultContextFlashAttention: resolvedDefaultContextFlashAttention, + defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, useMmap }); const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({ @@ -716,7 +738,8 @@ export class LlamaModel { _llama, _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false, _flashAttentionSupported: flashAttentionSupported, - _defaultContextFlashAttention: resolvedDefaultContextFlashAttention + _defaultContextFlashAttention: resolvedDefaultContextFlashAttention, + _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache }); const modelCreationVramReservation = modelOptions.ignoreMemorySafetyChecks ? null diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 8b0f85e9..7758a7de 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -15,7 +15,7 @@ export type GgufInsightsResourceRequirements = { export class GgufInsights { /** @internal */ public readonly _llama: Llama; /** @internal */ private readonly _modelSize: number; - /** @internal */ private _totalLayers: number | null = null; + /** @internal */ private _totalFileLayers: number | null = null; /** @internal */ private readonly _ggufFileInfo: GgufFileInfo; /** @internal */ private readonly _configurationResolver: GgufInsightsConfigurationResolver; @@ -71,13 +71,8 @@ export class GgufInsights { } public get totalLayers() { - if (this._totalLayers != null) - return this._totalLayers; - const outputLayers = 1; - this._totalLayers = this._getFileLayers() + outputLayers; - - return this._totalLayers; + return this._getTotalFileLayers() + outputLayers; } public get modelSize() { @@ -133,6 +128,23 @@ export class GgufInsights { return false; } + /** + * The size of the SWA (Sliding Window Attention). + * + * When `undefined`, the model does not use sliding window attention. + */ + public get swaSize() { + const slidingWindow = this._ggufFileInfo?.architectureMetadata?.attention?.sliding_window; + if (slidingWindow == null || slidingWindow <= 0) + return undefined; + + const trainContextSize = this.trainContextSize; + if (trainContextSize != null && slidingWindow >= trainContextSize) + return undefined; + + return slidingWindow; + } + public estimateModelResourceRequirements({ gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap }: { @@ -152,72 +164,70 @@ export class GgufInsights { * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now. 
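A short sketch (illustrative, not part of this diff) of the model-level `defaultContextSwaFullCache` option and the new `swaSize` insight added above; the model path is a placeholder:

import {getLlama} from "node-llama-cpp";

const llama = await getLlama();

// make every context created from this model use the full SWA cache by default
const model = await llama.loadModel({
    modelPath: "model.gguf", // placeholder path
    defaultContextSwaFullCache: true
});

// `swaSize` is undefined for models that don't use sliding window attention
console.log("SWA window size:", model.fileInsights.swaSize);
console.log("SWA full cache by default:", model.defaultContextSwaFullCache);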
*/ public estimateContextResourceRequirements({ - contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, + swaFullCache = false }: { contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, - flashAttention?: boolean, includeGraphOverhead?: boolean + flashAttention?: boolean, includeGraphOverhead?: boolean, swaFullCache?: boolean }): GgufInsightsResourceRequirements { if (sequences == null) sequences = getDefaultContextSequences(); if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); - const actualContextSize = contextSize * sequences; - - const totalLayers = this.totalLayers; - const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalLayers, totalLayers)); - const finalCpuLayers = totalLayers - finalGpuLayers; const llmData = this._ggufFileInfo.architectureMetadata; + const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; + const slidingWindow = this.swaSize ?? 0; + const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize && + (this.trainContextSize == null || slidingWindow < this.trainContextSize); + const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture); + const nonSwaPercent = swaPattern <= 1 + ? 1 + : (1 / (swaPattern + (flashAttention ? -0.5 : -1))); + + // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp` + const kvCachePadding = flashAttention + ? 256 + : 32; + const actualContextSize = sequences * contextSize; + const kvSize = usingSWA + ? ( + (1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) + + nonSwaPercent * actualContextSize + ) + : actualContextSize; + + const totalFileLayers = this._getTotalFileLayers(); + const finalGpuLayers = Math.max(0, Math.min(modelGpuLayers ?? totalFileLayers, totalFileLayers)); + const finalCpuLayers = totalFileLayers - finalGpuLayers; + const usingGpu = finalGpuLayers !== 0; const vocabularySize = llmData.vocab_size ?? this._ggufFileInfo.metadata.tokenizer?.ggml?.tokens?.length ?? 0; - const logitsSize = vocabularySize * batchSize; - const embedSize = isEmbeddingContext - ? (llmData.embedding_length ?? 0) * batchSize - : 0; + const embeddingSize = llmData.embedding_length ?? 0; - const sizeTBytes = 8; // sizeof(size_t) const floatBytes = 4; // sizeof(float) - const uint32TBytes = 4; // sizeof(uint32_t) const int32TBytes = 4; // sizeof(int32_t) - // source: `llama_state_get_size` in `llama.cpp` - const sRngSize = sizeTBytes; - const sRng = 64 * 1024; // LLAMA_MAX_RNG_STATE - const sNOutputs = sizeTBytes; - const sNOutputPos = batchSize * int32TBytes; - const sLogitsSize = sizeTBytes; - const sLogits = logitsSize * floatBytes; - const sEmbeddingSize = sizeTBytes; - const sEmbedding = embedSize * floatBytes; - const sKvBufSize = sizeTBytes; - const sKvHead = uint32TBytes; - const sKvSize = uint32TBytes; - const sKvUsed = uint32TBytes; - const sKv = 2 * int32TBytes * modelGpuLayers * this._llama._consts.ggmlTensorOverhead; - const sKvCell = this._llama._consts.llamaPosSize + sizeTBytes + this._llama._consts.llamaSeqIdSize; - const kvSelfLength = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba - ? 
Math.max(1, sequences) - : actualContextSize; - const sKvCells = kvSelfLength * sKvCell; - - const overheadMemory = ( - sRngSize + - sRng + - sNOutputs + - sNOutputPos + - sLogitsSize + - sLogits + - sEmbeddingSize + - sEmbedding + - sKvBufSize + - sKvHead + - sKvSize + - sKvUsed + - sKv + - sKvCells - ); + const estimateOutput = (nOutputs: number) => { + // source: `llama_context::output_reserve` in `llama-context.cpp` + const nOutputsMax = Math.max(batchSize, nOutputs); + + const isT5 = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.t5; + const hasLogits = isT5 || !isEmbeddingContext; + const hasEmbd = isT5 || isEmbeddingContext; + + const logitsSize = hasLogits + ? (vocabularySize * nOutputsMax) + : 0; + const embdSize = hasEmbd + ? (embeddingSize * nOutputsMax) + : 0; + const outputBufferSize = (logitsSize + embdSize) * floatBytes; + + const outputIdsArr = int32TBytes * batchSize; + + return outputBufferSize + outputIdsArr; + }; - // Estimates the memory allocated by `ggml_backend_sched_reserve` in `llama_new_context_with_model` in `llama.cpp`. - // If you read this line and have better insights on how to estimate this memory, please open a PR to improve it :) - const estimateGraphOverheadMemory = () => { + const estimateGraphOverheadMemory = (): number => { const s1MB = Math.pow(1024, 2); const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; @@ -234,23 +244,23 @@ export class GgufInsights { if (expertCount > 0) { const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2; - return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (actualContextSize * headCount)); + return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount)); } - return int32TBytes * batchSize * (embeddingLength + (actualContextSize * headCount)); + return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount)); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) { if (modelGpuLayers === this.totalLayers) { defaultCalculationAdjustment -= (s1MB * 340) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ); } else { defaultCalculationAdjustment -= (s1MB * 250) + ( (s1MB * 50) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ) ); } @@ -263,7 +273,7 @@ export class GgufInsights { (s1MB * 270) * ( this.trainContextSize == null ? 1 - : actualContextSize / this.trainContextSize + : kvSize / this.trainContextSize ) ); } else { @@ -271,21 +281,21 @@ export class GgufInsights { (s1MB * 150) * ( this.trainContextSize == null ? 1 - : Math.max(0, (1 - (actualContextSize / this.trainContextSize))) + : Math.max(0, (1 - (kvSize / this.trainContextSize))) ) ); } } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) { const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0; - return (int32TBytes * batchSize * actualContextSize * headCount) - (50 * s1MB); + return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB); // if (modelGpuLayers === this.totalLayers) { // defaultCalculationAdjustment += -(s1MB * 20) + ( // (s1MB * 250) * ( // this.trainContextSize == null // ? 
1 - // : actualContextSize / this.trainContextSize + // : kvSize / this.trainContextSize // ) // ); // } else { @@ -293,7 +303,7 @@ export class GgufInsights { // (s1MB * 300) * ( // this.trainContextSize == null // ? 1 - // : actualContextSize / this.trainContextSize + // : kvSize / this.trainContextSize // ) // ); // } @@ -312,41 +322,51 @@ export class GgufInsights { if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) { // magic numbers for estimation. will be improved in the future - return (totalElements * 123 * (actualContextSize / 4096)) + defaultCalculationAdjustment; + return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment; } // magic numbers for estimation. will be improved in the future - return (totalElements * 77.655 * (actualContextSize / 4096)) + defaultCalculationAdjustment; + return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment; }; + const gpuKVCacheSize = usingGpu + ? this._estimateKvMemorySizeInBytes( + kvSize, + finalGpuLayers < totalFileLayers + ? (finalGpuLayers + 1) + : finalGpuLayers + ) + : 0; + const cpuKVCacheSize = this._estimateKvMemorySizeInBytes(kvSize, finalCpuLayers); + + // source: `llama_context::graph_max_nodes` in `llama-context.cpp` + const maxNodes = Math.max(65536, 5 * tensorInfo.length); + const cpuNodes = 5 * (tensorInfo.length * (finalCpuLayers / totalFileLayers)); + const gpuNodes = maxNodes - cpuNodes; + + const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) + + this._llama._bindings.getGgmlGraphOverheadCustom(gpuNodes, false); + const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) + + this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false); + const graphOverheadMemory = (flashAttention || !includeGraphOverhead) ? 0 : estimateGraphOverheadMemory(); + const graphOverheadGpuSize = usingGpu + ? Math.round(graphOverheadMemory * (finalGpuLayers / totalFileLayers)) + : 0; + const graphOverheadCpuSize = graphOverheadMemory - graphOverheadGpuSize; - const usingGpu = finalGpuLayers !== 0; + const outputBufferSize = estimateOutput(sequences); - const cpuRam = ( - !usingGpu - ? (overheadMemory + graphOverheadMemory) - : 0 - ) + - this._estimateKvMemorySizeInBytes(actualContextSize, finalCpuLayers); - const gpuVram = usingGpu - ? ( - overheadMemory + - graphOverheadMemory + - this._estimateKvMemorySizeInBytes( - actualContextSize, - finalGpuLayers < totalLayers - ? (finalGpuLayers + 1) - : finalGpuLayers - ) - ) - : 0; + const gpuVram = gpuKVCacheSize + gpuComputeBufferSize + graphOverheadGpuSize + outputBufferSize; + const cpuRam = cpuKVCacheSize + cpuComputeBufferSize + graphOverheadCpuSize + outputBufferSize; return { cpuRam, - gpuVram + gpuVram: usingGpu + ? gpuVram + : 0 }; } @@ -449,7 +469,7 @@ export class GgufInsights { } /** @internal */ - public _estimateKvMemorySizeInBytes(contextSize: number, layers: number) { + public _estimateKvMemorySizeInBytes(kvSize: number, layers: number) { // source: `llama_kv_cache_init` in `llama.cpp` const nHead = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 0; const nEmbd = this._ggufFileInfo.architectureMetadata.embedding_length ?? 
0; @@ -483,8 +503,8 @@ export class GgufInsights { const totalNEmbdKGqa = nEmbdKGqa + modelNEmbdKS; const totalNEmbdVGqa = nEmbdVGqa + modelNEmbdVS; - totalElementsK += totalNEmbdKGqa * contextSize; - totalElementsV += totalNEmbdVGqa * contextSize; + totalElementsK += totalNEmbdKGqa * kvSize; + totalElementsV += totalNEmbdVGqa * kvSize; } const keyTypeSize = this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.mamba @@ -504,6 +524,16 @@ export class GgufInsights { ); } + /** @internal */ + private _getTotalFileLayers() { + if (this._totalFileLayers != null) + return this._totalFileLayers; + + this._totalFileLayers = this._getFileLayers(); + + return this._totalFileLayers; + } + /** * @param ggufFileInfo * @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance. @@ -718,3 +748,25 @@ function isTokenEmbedLayer(layerName: string) { return firstPart === "token_embd"; } + +function ggmlPad(value: number, padding: number): number { + return ((value + padding - 1) & ~(padding - 1)); +} + +function getSwaPatternForArchitecture(architecture?: GgufArchitectureType): number { + // source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern` + switch (architecture) { + case GgufArchitectureType.llama4: + return 4; + case GgufArchitectureType.phi3: + return 1; + case GgufArchitectureType.gemma2: + return 2; + case GgufArchitectureType.gemma3: + return 6; + case GgufArchitectureType.cohere2: + return 4; + } + + return 1; +} diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index 05595c98..cbae45d5 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -39,12 +39,14 @@ export class GgufInsightsConfigurationResolver { targetContextSize, embeddingContext = false, flashAttention = false, + swaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { targetGpuLayers?: number | "max", targetContextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, useMmap?: boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), @@ -63,6 +65,7 @@ export class GgufInsightsConfigurationResolver { } = {}) { const compatibilityScore = await this.scoreModelConfigurationCompatibility({ flashAttention, + swaFullCache, contextSize: targetContextSize, embeddingContext, forceGpuLayers: targetGpuLayers, @@ -105,6 +108,7 @@ export class GgufInsightsConfigurationResolver { contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 
4096), embeddingContext = false, flashAttention = false, + swaFullCache = false, maximumFittedContextSizeMultiplier = 100, maximumUnfitConfigurationResourceMultiplier = 100, forceStrictContextSize = false, @@ -114,6 +118,7 @@ export class GgufInsightsConfigurationResolver { contextSize?: number, embeddingContext?: boolean, flashAttention?: boolean, + swaFullCache?: boolean, maximumFittedContextSizeMultiplier?: number, maximumUnfitConfigurationResourceMultiplier?: number, @@ -209,6 +214,7 @@ export class GgufInsightsConfigurationResolver { llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention: flashAttention, + defaultContextSwaFullCache: swaFullCache, ignoreMemorySafetyChecks: forceGpuLayers != null, useMmap } @@ -263,7 +269,8 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers: resolvedGpuLayers, modelTrainContextSize: this._ggufInsights.trainContextSize ?? defaultTrainContextSizeForEstimationPurposes, ignoreMemorySafetyChecks: forceStrictContextSize, - flashAttention + flashAttention, + swaFullCache }); contextFitsMemory = true; } catch (err) { @@ -275,7 +282,8 @@ export class GgufInsightsConfigurationResolver { contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, modelGpuLayers: resolvedGpuLayers, - flashAttention + flashAttention, + swaFullCache }); const rankPoints = { @@ -371,11 +379,12 @@ export class GgufInsightsConfigurationResolver { llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, defaultContextFlashAttention = false, + defaultContextSwaFullCache = false, useMmap = this._ggufInsights._llama.supportsMmap }: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean, - useMmap?: boolean + defaultContextSwaFullCache?: boolean, useMmap?: boolean } = {}) { return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, @@ -385,6 +394,7 @@ export class GgufInsightsConfigurationResolver { llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); } @@ -399,6 +409,7 @@ export class GgufInsightsConfigurationResolver { batchSize, modelTrainContextSize, flashAttention = false, + swaFullCache = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), @@ -410,6 +421,7 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers: number, modelTrainContextSize: number, flashAttention?: boolean, + swaFullCache?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, @@ -427,6 +439,7 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers, modelTrainContextSize, flashAttention, + swaFullCache, getVramState, getRamState, getSwapState, diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index f800f712..49ace603 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -9,7 +9,7 @@ import 
{getRamUsageFromUnifiedVram} from "./getRamUsageFromUnifiedVram.js"; const defaultMaxContextSizeSwapUse = 2048; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, swaFullCache, getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, maxContextSizeSwapUse = defaultMaxContextSizeSwapUse }: { @@ -20,6 +20,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: number, modelTrainContextSize: number, flashAttention: boolean, + swaFullCache: boolean, getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState(): Promise<{total: number, free: number}>, getSwapState(): Promise<{total: number, free: number}>, @@ -52,6 +53,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); @@ -97,6 +99,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); @@ -145,6 +148,7 @@ export async function resolveContextContextSizeOption({ modelGpuLayers: modelGpuLayers, sequences, flashAttention, + swaFullCache, isEmbeddingContext }); diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 1edae352..62d58141 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -11,11 +11,11 @@ const fitContextExtraMemoryPaddingPercentage = 0.5; export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, useMmap + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, defaultContextSwaFullCache, useMmap }: { ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, useMmap?: boolean + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean }): Promise { if (gpuLayers == null) gpuLayers = "auto"; @@ -37,6 +37,7 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ggufInsights, currentVram: vramState.free, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -73,6 +74,7 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" ? 
gpuLayers.max : undefined, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -95,6 +97,7 @@ function getBestGpuLayersForFreeVram({ minGpuLayers, maxGpuLayers, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }: { ggufInsights: GgufInsights, @@ -103,6 +106,7 @@ function getBestGpuLayersForFreeVram({ minGpuLayers?: number, maxGpuLayers?: number, defaultContextFlashAttention: boolean, + defaultContextSwaFullCache: boolean, useMmap?: boolean }) { return findBestOption({ @@ -123,6 +127,7 @@ function getBestGpuLayersForFreeVram({ currentVram: freeVram, fitContext, defaultContextFlashAttention, + defaultContextSwaFullCache, useMmap }); @@ -182,10 +187,10 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer } function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, useMmap + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, defaultContextSwaFullCache = false, useMmap }: { gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, - defaultContextFlashAttention: boolean, useMmap?: boolean + defaultContextFlashAttention: boolean, defaultContextSwaFullCache: boolean, useMmap?: boolean }) { const modelVram = ggufInsights.estimateModelResourceRequirements({ gpuLayers, @@ -202,7 +207,8 @@ function getVramRequiredForGpuLayers({ modelGpuLayers: gpuLayers, sequences: 1, isEmbeddingContext: fitContext.embeddingContext ?? false, - flashAttention: defaultContextFlashAttention + flashAttention: defaultContextFlashAttention, + swaFullCache: defaultContextSwaFullCache }).gpuVram; const totalVram = modelVram + contextVram; @@ -221,7 +227,8 @@ function getVramRequiredForGpuLayers({ ggufInsights, vram: currentVram - modelVram, isEmbeddingContext: fitContext?.embeddingContext ?? 
false, - flashAttention: defaultContextFlashAttention + flashAttention: defaultContextFlashAttention, + swaFullCache: defaultContextSwaFullCache }); if (maxContext == null || modelVram + maxContext.vram > currentVram) @@ -234,8 +241,8 @@ function getVramRequiredForGpuLayers({ }; } -function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention}: { - gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean +function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, swaFullCache}: { + gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, swaFullCache: boolean }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -250,7 +257,8 @@ function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmb modelGpuLayers: gpuLayers, sequences: 1, isEmbeddingContext, - flashAttention + flashAttention, + swaFullCache }).gpuVram; if (contextVram <= vram) diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 827493fc..5f8a48e1 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -135,8 +135,8 @@ export enum GgufFileType { MOSTLY_Q4_0_4_4 = 33, // deprecated MOSTLY_Q4_0_4_8 = 34, // deprecated MOSTLY_Q4_0_8_8 = 35, // deprecated - MOSTLY_TQ1_0 = 36, // deprecated - MOSTLY_TQ2_0 = 37 // deprecated + MOSTLY_TQ1_0 = 36, + MOSTLY_TQ2_0 = 37 } @@ -316,6 +316,7 @@ export type GgufMetadataDefaultArchitectureType = { readonly layer_norm_rms_epsilon?: number, readonly key_length?: number, readonly value_length?: number, + readonly sliding_window?: number, readonly causal?: boolean }, diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index 28ae45c3..8b7f615a 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -6,7 +6,7 @@ export type GgufTensorInfo = { /** * Adjusted offset relative to the file. - * + * * Added by the GGUF parser - not part of the file's metadata. 
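The `swaFullCache` flag threaded through `GgufInsightsConfigurationResolver` above can also be exercised directly when scoring a configuration; a rough sketch (not part of this diff), reusing a loaded model's `fileInsights` for brevity and with a placeholder model path:

import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "model.gguf"}); // placeholder path

// estimate how well this machine could run the model with the full SWA cache enabled
const compatibility = await model.fileInsights.configurationResolver
    .scoreModelConfigurationCompatibility({
        flashAttention: false,
        swaFullCache: true
    });
console.log(compatibility);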
*/ readonly fileOffset: number | bigint, @@ -49,5 +49,15 @@ export const enum GgmlType { I16 = 25, I32 = 26, I64 = 27, - F64 = 28 + F64 = 28, + IQ1_M = 29, + BF16 = 30, + Q4_0_4_4 = 31, + Q4_0_4_8 = 32, + Q4_0_8_8 = 33, + TQ1_0 = 34, + TQ2_0 = 35, + IQ4_NL_4_4 = 36, + IQ4_NL_4_8 = 37, + IQ4_NL_8_8 = 38 } diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index 03d5942a..d8247dd9 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -114,7 +114,7 @@ describe("functionary", () => { freeRam: s1GB * 6 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } { const res = await resolveGpuLayers(0, { @@ -151,7 +151,7 @@ describe("functionary", () => { freeSwap: s1GB * 1 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(0, { @@ -255,7 +255,7 @@ describe("functionary", () => { freeRam: s1GB * 4.5 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } try { await resolveGpuLayers(16, { @@ -318,7 +318,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } }); @@ -336,14 +336,14 @@ describe("functionary", () => { } { const res = await resolveGpuLayers(16, { - totalVram: s1GB * 7, - freeVram: s1GB * 7, - totalRam: s1GB * 7, + totalVram: s1GB * 7.5, + freeVram: s1GB * 7.5, + totalRam: s1GB * 7.5, freeRam: s1GB * 5.5, - unifiedMemorySize: s1GB * 7 + unifiedMemorySize: s1GB * 7.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("2086"); + expect(res.contextSize).to.toMatchInlineSnapshot("1760"); } { const res = await resolveGpuLayers(16, { @@ -354,7 +354,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 5.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("6804"); + expect(res.contextSize).to.toMatchInlineSnapshot("5505"); } try { await resolveGpuLayers(16, { @@ -409,7 +409,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.contextSize).to.toMatchInlineSnapshot("4441"); } { const res = await resolveGpuLayers(16, { @@ -422,7 +422,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } }); @@ -608,7 +608,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(32, { @@ -619,7 +619,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + expect(res.contextSize).to.toMatchInlineSnapshot("1164"); } { const res = await 
resolveGpuLayers(32, { @@ -761,7 +761,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1143"); + expect(res.contextSize).to.toMatchInlineSnapshot("1164"); } { const res = await resolveGpuLayers(33, { @@ -772,7 +772,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers(33, { @@ -783,7 +783,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } { const res = await resolveGpuLayers(33, { @@ -795,7 +795,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("7718"); } }); @@ -809,7 +809,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7562"); + expect(res.contextSize).to.toMatchInlineSnapshot("6251"); } { const res = await resolveGpuLayers(33, { @@ -820,18 +820,18 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.contextSize).to.toMatchInlineSnapshot("2974"); } { const res = await resolveGpuLayers(33, { totalVram: s1GB * 6, freeVram: s1GB * 6, totalRam: s1GB * 6, - freeRam: s1GB * 4.8, + freeRam: s1GB * 5.1, unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1142"); + expect(res.contextSize).to.toMatchInlineSnapshot("1336"); } try { await resolveGpuLayers(33, { @@ -908,7 +908,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("502"); + expect(res.contextSize).to.toMatchInlineSnapshot("472"); } { const res = await resolveGpuLayers("max", { @@ -918,7 +918,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1010"); + expect(res.contextSize).to.toMatchInlineSnapshot("898"); } }); @@ -952,7 +952,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("3606"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -961,8 +961,8 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); } { const res = await resolveGpuLayers("auto", { @@ -972,7 +972,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("10"); - expect(res.contextSize).to.toMatchInlineSnapshot("5856"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -981,7 +981,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - 
expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -991,7 +991,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("13"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1001,7 +1001,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1011,7 +1011,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1021,7 +1021,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1031,7 +1031,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1042,7 +1042,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("7977"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1052,7 +1052,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8043"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1062,7 +1062,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4754"); + expect(res.contextSize).to.toMatchInlineSnapshot("4721"); } { const res = await resolveGpuLayers("auto", { @@ -1072,7 +1072,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7964"); + expect(res.contextSize).to.toMatchInlineSnapshot("7998"); } { const res = await resolveGpuLayers("auto", { @@ -1095,7 +1095,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers("auto", { @@ -1105,7 +1105,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers("auto", { @@ -1115,7 +1115,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("3606"); + expect(res.contextSize).to.toMatchInlineSnapshot("5438"); } { const res 
= await resolveGpuLayers("auto", { @@ -1124,8 +1124,8 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); } { const res = await resolveGpuLayers("auto", { @@ -1135,7 +1135,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("10"); - expect(res.contextSize).to.toMatchInlineSnapshot("5856"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1144,7 +1144,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("12"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1154,7 +1154,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("13"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1164,7 +1164,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1174,7 +1174,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1184,7 +1184,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1194,7 +1194,7 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1205,7 +1205,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("7977"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1215,7 +1215,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8043"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -1225,7 +1225,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4754"); + expect(res.contextSize).to.toMatchInlineSnapshot("4721"); } { const res = await resolveGpuLayers("auto", { @@ -1235,7 +1235,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7964"); + expect(res.contextSize).to.toMatchInlineSnapshot("7998"); } { const res = await resolveGpuLayers("auto", { @@ -1324,7 +1324,7 @@ 
describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1336,7 +1336,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1349,7 +1349,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } }); @@ -1362,7 +1362,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1372,7 +1372,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2213"); + expect(res.contextSize).to.toMatchInlineSnapshot("2256"); } try { await resolveGpuLayers({min: 2}, { @@ -1426,7 +1426,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1438,7 +1438,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { @@ -1451,7 +1451,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("3202"); + expect(res.contextSize).to.toMatchInlineSnapshot("4016"); } }); }); @@ -1480,7 +1480,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("5737"); + expect(res.contextSize).to.toMatchInlineSnapshot("6535"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1491,8 +1491,8 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("5246"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1503,7 +1503,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1515,7 +1515,7 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); 
expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1569,7 +1569,7 @@ describe("functionary", () => { freeRam: s1GB * 7 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("5737"); + expect(res.contextSize).to.toMatchInlineSnapshot("6535"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1580,8 +1580,8 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("5246"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); + expect(res.contextSize).to.toMatchInlineSnapshot("7483"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1592,7 +1592,7 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } @@ -1604,7 +1604,7 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); expect(res.contextSize).to.be.gte(contextSize); } diff --git a/test/modelDependent/functionary/gguf/ggufInsights.test.ts b/test/modelDependent/functionary/gguf/ggufInsights.test.ts index 33e638d0..ee193e2c 100644 --- a/test/modelDependent/functionary/gguf/ggufInsights.test.ts +++ b/test/modelDependent/functionary/gguf/ggufInsights.test.ts @@ -124,7 +124,7 @@ describe("gguf", async () => { sequences: context.totalSequences, modelGpuLayers: ggufInsights.totalLayers }).gpuVram; - expect(toBytes(estimatedContextVramUsage)).toMatchInlineSnapshot('"1.02GB"'); + expect(toBytes(estimatedContextVramUsage)).toMatchInlineSnapshot("\"1.03GB\""); expect(Math.abs(contextVramUsageDiff - estimatedContextVramUsage)).to.be.lte(s300MB); await model.dispose(); @@ -168,7 +168,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.78GB", + "cpuRam": "1.75GB", "gpuVram": "0B", } `); @@ -179,7 +179,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.02GB", + "cpuRam": "1GB", "gpuVram": "0B", } `); @@ -190,7 +190,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "650.6MB", + "cpuRam": "643.07MB", "gpuVram": "0B", } `); @@ -201,7 +201,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "454.58MB", + "cpuRam": "451.07MB", "gpuVram": "0B", } `); @@ -213,8 +213,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1GB", - "gpuVram": "834.69MB", + "cpuRam": "1.71GB", + "gpuVram": "355.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -224,8 +224,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "512MB", - "gpuVram": "546.63MB", + "cpuRam": "1002.8MB", + "gpuVram": "315.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -235,8 +235,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "256MB", - "gpuVram": "402.6MB", + "cpuRam": "630.8MB", + "gpuVram": "295.25MB", } `); 
expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -246,8 +246,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "128MB", - "gpuVram": "330.58MB", + "cpuRam": "444.8MB", + "gpuVram": "285.25MB", } `); @@ -258,8 +258,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "544MB", - "gpuVram": "1.28GB", + "cpuRam": "1022.78MB", + "gpuVram": "1.05GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -269,8 +269,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "272MB", - "gpuVram": "786.67MB", + "cpuRam": "638.78MB", + "gpuVram": "679.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -280,8 +280,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "136MB", - "gpuVram": "522.64MB", + "cpuRam": "446.78MB", + "gpuVram": "479.25MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -291,8 +291,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "68MB", - "gpuVram": "390.63MB", + "cpuRam": "350.78MB", + "gpuVram": "379.25MB", } `); @@ -303,7 +303,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "32MB", + "cpuRam": "250.5MB", "gpuVram": "1.78GB", } `); @@ -314,8 +314,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "16MB", - "gpuVram": "1.02GB", + "cpuRam": "250.5MB", + "gpuVram": "1.03GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -325,8 +325,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "8MB", - "gpuVram": "650.69MB", + "cpuRam": "250.5MB", + "gpuVram": "667.52MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -336,8 +336,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "4MB", - "gpuVram": "454.67MB", + "cpuRam": "250.5MB", + "gpuVram": "475.52MB", } `); @@ -348,7 +348,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", + "cpuRam": "250.5MB", "gpuVram": "1.78GB", } `); @@ -359,8 +359,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "1.02GB", + "cpuRam": "250.5MB", + "gpuVram": "1.03GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -370,8 +370,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "650.69MB", + "cpuRam": "250.5MB", + "gpuVram": "667.52MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -381,8 +381,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "0B", - "gpuVram": "454.67MB", + "cpuRam": "250.5MB", + "gpuVram": "475.52MB", } `); }); diff --git a/test/modelDependent/llama3.2/promptCompletion.test.ts b/test/modelDependent/llama3.2/promptCompletion.test.ts new file mode 100644 index 00000000..574524d9 --- /dev/null +++ b/test/modelDependent/llama3.2/promptCompletion.test.ts @@ -0,0 +1,112 @@ +import {describe, expect, test} from "vitest"; +import {LlamaChatSession, resolveChatWrapper} from "../../../src/index.js"; +import {getModelFile} from 
"../../utils/modelFiles.js"; +import {getTestLlama} from "../../utils/getTestLlama.js"; +import {LlamaText} from "../../../src/utils/LlamaText.js"; + +describe("llama 3.2", () => { + describe("prompt completion", () => { + test("prompt completion isn't kept in the next evaluation", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Llama-3.2-3B-Instruct.Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 4096 + }); + const context2 = await model.createContext({ + contextSize: 4096 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence(), + chatWrapper: resolveChatWrapper(model, { + customWrapperSettings: { + "llama3.2-lightweight": { + todayDate: new Date("2025-01-01T00:00:00Z") + } + } + }) + }); + const chatSession2 = new LlamaChatSession({ + contextSequence: context2.getSequence(), + chatWrapper: resolveChatWrapper(model, { + customWrapperSettings: { + "llama3.2-lightweight": { + todayDate: new Date("2025-01-01T00:00:00Z") + } + } + }) + }); + + const promptCompletion = await chatSession.completePrompt("Hi there!", { + maxTokens: 50 + }); + expect(promptCompletion).toMatchInlineSnapshot("\" I'm looking for a new phone case. I need a case that can protect your phone from scratches and drops.\""); + expect(LlamaText.fromTokens(model.tokenizer, chatSession.sequence.contextTokens)).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|start_header_id|>"), + "system", + new SpecialTokensText("<|end_header_id|>"), + " + + Cutting Knowledge Date: December 2023", + new SpecialToken("NL"), + "Today Date: 1 Jan 2025 + + You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "user", + new SpecialTokensText("<|end_header_id|>"), + " + + Hi there! I'm looking for a new phone case. I need a case that can protect your phone from scratches and drops.", + ]) + `); + + const res = await chatSession.prompt("Hi there!", { + maxTokens: 50 + }); + expect(res).toMatchInlineSnapshot("\"Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat for a bit?\""); + expect(LlamaText.fromTokens(model.tokenizer, chatSession.sequence.contextTokens)).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|start_header_id|>"), + "system", + new SpecialTokensText("<|end_header_id|>"), + " + + Cutting Knowledge Date: December 2023", + new SpecialToken("NL"), + "Today Date: 1 Jan 2025 + + You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "user", + new SpecialTokensText("<|end_header_id|>"), + " + + Hi there!", + new SpecialToken("EOT"), + new SpecialTokensText("<|start_header_id|>"), + "assistant", + new SpecialTokensText("<|end_header_id|>"), + " + + Hello! It's nice to meet you. 
Is there something I can help you with, or would you like to chat for a bit?", + ]) + `); + + const res2 = await chatSession2.prompt("Hi there!", { + maxTokens: 50 + }); + expect(res2).to.eql(res); + }); + }); +}); diff --git a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index 151fc4f3..e6267045 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -34,10 +34,10 @@ describe("llama 3.2", () => { res1, res2 ] = await Promise.all([ - chatSession1.prompt("Remember: locks are not doors", {maxTokens: 6}), + chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}), chatSession2.prompt("Remember: giraffes are not elephants", {maxTokens: 5}) ]); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever phrase.\""); + expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); expect(res2).to.toMatchInlineSnapshot('"I appreciate the reminder."'); @@ -47,8 +47,8 @@ describe("llama 3.2", () => { test.onTestFinished(() => fs.remove(stateFile1Path)); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); - expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.49MB"'); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); + expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot("\"11.27MB\""); const stateFile2Path = await getTempTestFilePath("state2"); @@ -68,7 +68,7 @@ describe("llama 3.2", () => { expect(contextSequence1TokensState1).toMatchInlineSnapshot(` { "usedInputTokens": 99, - "usedOutputTokens": 6, + "usedOutputTokens": 4, } `); @@ -91,7 +91,7 @@ describe("llama 3.2", () => { await contextSequence1.loadStateFromFile(stateFile1Path, {acceptRisk: true}); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("105"); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); const contextSequence1TokensState3 = contextSequence1.tokenMeter.getState(); expect(TokenMeter.diff(contextSequence1TokensState3, contextSequence1TokensState2)).toMatchInlineSnapshot(` diff --git a/test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts b/test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts new file mode 100644 index 00000000..78cf5480 --- /dev/null +++ b/test/modelDependent/qwen3-0.6b/reasoningBudget.test.ts @@ -0,0 +1,95 @@ +import {describe, expect, test} from "vitest"; +import {LlamaChatSession, isChatModelResponseSegment} from "../../../src/index.js"; +import {getModelFile} from "../../utils/modelFiles.js"; +import {getTestLlama} from "../../utils/getTestLlama.js"; + +describe("qwen3 0.6b", () => { + describe("reasoning budget", () => { + test("doesn't exceed reasoning budget", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("Qwen3-0.6B-Q8_0.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 512 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + const initialChatHistory = chatSession.getChatHistory(); + + async function promptWithBudget({ + prompt, maxTokens, reasoningBudget + }: { + prompt: string, maxTokens: number, reasoningBudget?: number + }) { + let thoughtTokens = 0; + let totalTokens = 0; + + 
chatSession.setChatHistory(initialChatHistory); + const {responseText, response} = await chatSession.promptWithMeta(prompt, { + maxTokens, + budgets: { + thoughtTokens: reasoningBudget + }, + onResponseChunk(chunk) { + if (chunk.type === "segment" && chunk.segmentType === "thought") { + thoughtTokens += chunk.tokens.length; + } + + totalTokens += chunk.tokens.length; + } + }); + + return { + thoughtTokens, + totalTokens, + responseText, + thoughts: response + .filter((item) => isChatModelResponseSegment(item)) + .filter((item) => item.segmentType === "thought") + .map((item) => item.text) + }; + } + + const res1 = await promptWithBudget({ + prompt: "Where do llamas come from?", + reasoningBudget: 10, + maxTokens: 20 + }); + expect(res1.thoughtTokens).to.be.gt(1); + expect(res1.thoughtTokens).to.be.lte(10); + expect(res1.totalTokens).to.be.gte(16); + expect(res1.totalTokens).to.be.lte(20); + + const res2 = await promptWithBudget({ + prompt: "Where do llamas come from?", + reasoningBudget: 0, + maxTokens: 20 + }); + expect(res2.thoughtTokens).to.be.eq(0); + expect(res2.totalTokens).to.be.gte(16); + expect(res2.totalTokens).to.be.lte(20); + + const res3 = await promptWithBudget({ + prompt: "Where do llamas come from?", + reasoningBudget: 20, + maxTokens: 20 + }); + expect(res3.thoughtTokens).to.be.eq(res3.totalTokens); + expect(res3.totalTokens).to.be.gte(16); + expect(res3.totalTokens).to.be.lte(20); + + const res4 = await promptWithBudget({ + prompt: "Where do llamas come from?", + maxTokens: 20 + }); + expect(res4.thoughtTokens).to.be.eq(res4.totalTokens); + expect(res4.totalTokens).to.be.gte(16); + expect(res4.totalTokens).to.be.lte(20); + }); + }); +}); diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index 43145a6d..c2ad773f 100644 --- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -111,7 +111,7 @@ describe("stableCode", () => { freeVram: s1GB * 3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("7177"); + expect(res.contextSize).to.toMatchInlineSnapshot("8064"); } try { await resolveGpuLayers(16, { @@ -137,12 +137,12 @@ describe("stableCode", () => { // play with this number to make the test pass, it should be low enough so that there won't be any VRAM left // to create a context - freeVram: s1GB * 0.2, + freeVram: s1GB * 1.4, ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("133"); + expect(res.contextSize).to.toMatchInlineSnapshot("138"); } @@ -174,7 +174,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } try { await resolveGpuLayers(32, { @@ -192,7 +192,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { @@ -223,7 +223,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } try { await resolveGpuLayers(33, { @@ -241,7 +241,7 @@ describe("stableCode", () => { 
ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { @@ -303,7 +303,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("94"); + expect(res.contextSize).to.toMatchInlineSnapshot("48"); } { const res = await resolveGpuLayers("max", { @@ -311,7 +311,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers("max", { @@ -319,7 +319,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6866"); + expect(res.contextSize).to.toMatchInlineSnapshot("6979"); } { const res = await resolveGpuLayers("max", { @@ -327,7 +327,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7931"); + expect(res.contextSize).to.toMatchInlineSnapshot("8072"); } }); @@ -345,24 +345,24 @@ describe("stableCode", () => { totalVram: s1GB * 6, freeVram: s1GB * 0.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + expect(res.contextSize).to.toMatchInlineSnapshot("10864"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 0.8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); - expect(res.contextSize).to.toMatchInlineSnapshot("8724"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 1.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("6203"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("5"); + expect(res.contextSize).to.toMatchInlineSnapshot("8368"); } { const res = await resolveGpuLayers("auto", { @@ -370,7 +370,7 @@ describe("stableCode", () => { freeVram: s1GB * 2.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("1544"); + expect(res.contextSize).to.toMatchInlineSnapshot("1518"); } { const res = await resolveGpuLayers("auto", { @@ -378,7 +378,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3407"); + expect(res.contextSize).to.toMatchInlineSnapshot("3429"); } { const res = await resolveGpuLayers("auto", { @@ -386,7 +386,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3939"); + expect(res.contextSize).to.toMatchInlineSnapshot("3976"); } { const res = await resolveGpuLayers("auto", { @@ -394,7 +394,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4471"); + expect(res.contextSize).to.toMatchInlineSnapshot("4522"); } { const res = await resolveGpuLayers("auto", { @@ -402,7 +402,7 @@ describe("stableCode", () => { freeVram: 
s1GB * 3.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5270"); + expect(res.contextSize).to.toMatchInlineSnapshot("5341"); } { const res = await resolveGpuLayers("auto", { @@ -410,7 +410,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers("auto", { @@ -418,7 +418,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6600"); + expect(res.contextSize).to.toMatchInlineSnapshot("6706"); } { const res = await resolveGpuLayers("auto", { @@ -426,7 +426,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7133"); + expect(res.contextSize).to.toMatchInlineSnapshot("7252"); } { const res = await resolveGpuLayers("auto", { @@ -434,7 +434,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7931"); + expect(res.contextSize).to.toMatchInlineSnapshot("8072"); } { const res = await resolveGpuLayers("auto", { @@ -442,7 +442,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.2 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("8995"); + expect(res.contextSize).to.toMatchInlineSnapshot("9164"); } { const res = await resolveGpuLayers("auto", { @@ -450,7 +450,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("10592"); + expect(res.contextSize).to.toMatchInlineSnapshot("10802"); } { const res = await resolveGpuLayers("auto", { @@ -458,7 +458,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("11125"); + expect(res.contextSize).to.toMatchInlineSnapshot("11348"); } }); @@ -504,7 +504,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("11658"); + expect(res.contextSize).to.toMatchInlineSnapshot("13255"); } try { await resolveGpuLayers({min: 16}, { @@ -522,7 +522,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -531,8 +531,8 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); - expect(res.contextSize).to.toMatchInlineSnapshot("8160"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); + expect(res.contextSize).to.toMatchInlineSnapshot("8249"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -542,7 +542,7 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("7177"); + 
expect(res.contextSize).to.toMatchInlineSnapshot("8064"); } }); @@ -565,7 +565,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5802"); + expect(res.contextSize).to.toMatchInlineSnapshot("5887"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -574,8 +574,8 @@ describe("stableCode", () => { totalVram: s1GB * 2, freeVram: s1GB * 1 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("9426"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("3"); + expect(res.contextSize).to.toMatchInlineSnapshot("5933"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -584,8 +584,8 @@ describe("stableCode", () => { totalVram: s1GB * 6, freeVram: s1GB * 4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); - expect(res.contextSize).to.toMatchInlineSnapshot("9167"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); + expect(res.contextSize).to.toMatchInlineSnapshot("9208"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -595,7 +595,7 @@ describe("stableCode", () => { freeVram: s1GB * 1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("9426"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); expect(res.contextSize).to.be.gte(contextSize); } { diff --git a/test/utils/modelFiles.ts b/test/utils/modelFiles.ts index bcc6a6c0..fa307dc6 100644 --- a/test/utils/modelFiles.ts +++ b/test/utils/modelFiles.ts @@ -20,7 +20,8 @@ const supportedModels = { "codegemma-2b-Q4_K_M.gguf": "https://huggingface.co/bartowski/codegemma-2b-GGUF/resolve/main/codegemma-2b-Q4_K_M.gguf?download=true", "Llama-3.2-3B-Instruct.Q4_K_M.gguf": "https://huggingface.co/mradermacher/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct.Q4_K_M.gguf?download=true", "nomic-embed-text-v1.5.Q4_K_M.gguf": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q4_K_M.gguf?download=true", - "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true" + "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true", + "Qwen3-0.6B-Q8_0.gguf": "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true" } as const; export async function getModelFile(modelName: keyof typeof supportedModels) {