diff --git a/llama/addon.cpp b/llama/addon.cpp index 117621dd..03bd3ece 100644 --- a/llama/addon.cpp +++ b/llama/addon.cpp @@ -987,6 +987,10 @@ class AddonContext : public Napi::ObjectWrap { context_params.embeddings = options.Get("embeddings").As().Value(); } + if (options.Has("flashAttention")) { + context_params.flash_attn = options.Get("flashAttention").As().Value(); + } + if (options.Has("threads")) { const auto n_threads = options.Get("threads").As().Uint32Value(); const auto resolved_n_threads = n_threads == 0 ? std::thread::hardware_concurrency() : n_threads; diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index fc5de2fb..b71fb3b9 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -20,6 +20,7 @@ export type BindingModule = { contextSize?: number, batchSize?: number, sequences?: number, + flashAttention?: boolean, logitsAll?: boolean, embeddings?: boolean, threads?: number diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index e165a463..bb70d754 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -7,7 +7,7 @@ import {GbnfJsonSchema} from "../utils/gbnfJson/types.js"; import {LlamaJsonSchemaGrammar} from "../evaluator/LlamaJsonSchemaGrammar.js"; import {LlamaGrammar, LlamaGrammarOptions} from "../evaluator/LlamaGrammar.js"; import {BindingModule} from "./AddonTypes.js"; -import {BuildGpu, BuildMetadataFile, LlamaLocks, LlamaLogLevel} from "./types.js"; +import {BuildGpu, BuildMetadataFile, LlamaGpuType, LlamaLocks, LlamaLogLevel} from "./types.js"; import {MemoryOrchestrator, MemoryReservation} from "./utils/MemoryOrchestrator.js"; const LlamaLogLevelToAddonLogLevel: ReadonlyMap = new Map([ @@ -31,7 +31,7 @@ export class Llama { /** @internal */ public readonly _vramOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _vramPadding: MemoryReservation; /** @internal */ public readonly _debug: boolean; - /** @internal */ private readonly _gpu: BuildGpu; + /** @internal */ private readonly _gpu: LlamaGpuType; /** @internal */ private readonly _buildType: "localBuild" | "prebuilt"; /** @internal */ private readonly _cmakeOptions: Readonly>; /** @internal */ private readonly _supportsGpuOffloading: boolean; @@ -244,7 +244,10 @@ export class Llama { await this._bindings.init(); } - /** @internal */ + /** + * Log messages related to the Llama instance + * @internal + */ public _log(level: LlamaLogLevel, message: string) { this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? defaultLogLevel, message + "\n"); } diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index 29aa8cad..3938d638 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -16,7 +16,7 @@ import { } from "./utils/compileLLamaCpp.js"; import {getLastBuildInfo} from "./utils/lastBuildInfo.js"; import {getClonedLlamaCppRepoReleaseInfo, isLlamaCppRepoCloned} from "./utils/cloneLlamaCppRepo.js"; -import {BuildGpu, BuildMetadataFile, BuildOptions, LlamaLogLevel} from "./types.js"; +import {BuildGpu, BuildMetadataFile, BuildOptions, LlamaGpuType, LlamaLogLevel} from "./types.js"; import {BinaryPlatform, getPlatform} from "./utils/getPlatform.js"; import {getBuildFolderNameForBuildOptions} from "./utils/getBuildFolderNameForBuildOptions.js"; import {resolveCustomCmakeOptions} from "./utils/resolveCustomCmakeOptions.js"; @@ -46,7 +46,10 @@ export type LlamaOptions = { * * `"auto"` by default. 
*/ - gpu?: "auto" | "metal" | "cuda" | "vulkan" | false, + gpu?: "auto" | LlamaGpuType | { + type: "auto", + exclude?: LlamaGpuType[] + }, /** * Set the minimum log level for llama.cpp. @@ -298,6 +301,9 @@ export async function getLlamaForOptions({ } } + if (buildGpusToTry.length === 0) + throw new Error("No GPU types available to try building with"); + if (build === "auto" || build === "never") { for (let i = 0; i < buildGpusToTry.length; i++) { const gpu = buildGpusToTry[i]; diff --git a/src/bindings/types.ts b/src/bindings/types.ts index 204b00a5..691e1005 100644 --- a/src/bindings/types.ts +++ b/src/bindings/types.ts @@ -3,6 +3,7 @@ import {BinaryPlatform} from "./utils/getPlatform.js"; import {BinaryPlatformInfo} from "./utils/getPlatformInfo.js"; export const buildGpuOptions = ["metal", "cuda", "vulkan", false] as const; +export type LlamaGpuType = "metal" | "cuda" | "vulkan" | false; export const nodeLlamaCppGpuOptions = [ "auto", ...buildGpuOptions diff --git a/src/bindings/utils/getGpuTypesToUseForOption.ts b/src/bindings/utils/getGpuTypesToUseForOption.ts index d5a1b9cd..f171525d 100644 --- a/src/bindings/utils/getGpuTypesToUseForOption.ts +++ b/src/bindings/utils/getGpuTypesToUseForOption.ts @@ -1,28 +1,41 @@ import process from "process"; import {BuildGpu, buildGpuOptions} from "../types.js"; +import {LlamaOptions} from "../getLlama.js"; import {BinaryPlatform, getPlatform} from "./getPlatform.js"; import {getBestComputeLayersAvailable} from "./getBestComputeLayersAvailable.js"; -export async function getGpuTypesToUseForOption(gpu: BuildGpu | "auto", { +export async function getGpuTypesToUseForOption(gpu: Required["gpu"], { platform = getPlatform(), arch = process.arch }: { platform?: BinaryPlatform, arch?: typeof process.arch } = {}): Promise { - const resolvedGpu = resolveValidGpuOptionForPlatform(gpu, { + const resolvedGpuOption = typeof gpu === "object" + ? gpu.type + : gpu; + + function withExcludedGpuTypesRemoved(gpuTypes: BuildGpu[]) { + const resolvedExcludeTypes = typeof gpu === "object" + ? new Set(gpu.exclude ?? []) + : new Set(); + + return gpuTypes.filter(gpuType => !resolvedExcludeTypes.has(gpuType)); + } + + const resolvedGpu = resolveValidGpuOptionForPlatform(resolvedGpuOption, { platform, arch }); if (resolvedGpu === "auto") { if (arch === process.arch) - return await getBestComputeLayersAvailable(); + return withExcludedGpuTypesRemoved(await getBestComputeLayersAvailable()); - return [false]; + return withExcludedGpuTypesRemoved([false]); } - return [resolvedGpu]; + return withExcludedGpuTypesRemoved([resolvedGpu]); } export function resolveValidGpuOptionForPlatform(gpu: BuildGpu | "auto", { diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 684a62d7..c9c9d85b 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -41,6 +41,7 @@ type ChatCommand = { noJinja?: boolean, contextSize?: number, batchSize?: number, + flashAttention?: boolean, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], jsonSchemaGrammarFile?: string, @@ -149,6 +150,12 @@ export const ChatCommand: CommandModule = { type: "number", description: "Batch size to use for the model context. 
The default value is the context size" }) + .option("flashAttention", { + alias: "fa", + type: "boolean", + default: false, + description: "Enable flash attention" + }) .option("noTrimWhitespace", { type: "boolean", alias: ["noTrim"], @@ -269,7 +276,7 @@ export const ChatCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, - promptFile, wrapper, noJinja, contextSize, batchSize, + promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, @@ -278,9 +285,9 @@ export const ChatCommand: CommandModule = { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, gpuLayers, - lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, - noHistory, environmentFunctions, debug, meter, printTimings + batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, + gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, + maxTokens, noHistory, environmentFunctions, debug, meter, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -293,9 +300,9 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, - minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, - repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, debug, meter, printTimings + contextSize, batchSize, flashAttention, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, + threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, debug, meter, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; @@ -360,6 +367,7 @@ async function RunChat({ : contextSize != null ? 
{fitContext: {contextSize}} : undefined, + defaultContextFlashAttention: flashAttention, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index b642d7eb..78f434ca 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -28,6 +28,7 @@ type CompleteCommand = { textFile?: string, contextSize?: number, batchSize?: number, + flashAttention?: boolean, threads: number, temperature: number, minP: number, @@ -104,6 +105,12 @@ export const CompleteCommand: CommandModule = { type: "number", description: "Batch size to use for the model context. The default value is the context size" }) + .option("flashAttention", { + alias: "fa", + type: "boolean", + default: false, + description: "Enable flash attention" + }) .option("threads", { type: "number", default: 6, @@ -194,14 +201,14 @@ export const CompleteCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - threads, temperature, minP, topK, + flashAttention, threads, temperature, minP, topK, topP, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings }) { try { await RunCompletion({ - modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, + modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings @@ -216,7 +223,7 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, + modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings @@ -276,6 +283,7 @@ async function RunCompletion({ : contextSize != null ? {fitContext: {contextSize}} : undefined, + defaultContextFlashAttention: flashAttention, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 84485fb3..c350556f 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -30,6 +30,7 @@ type InfillCommand = { suffixFile?: string, contextSize?: number, batchSize?: number, + flashAttention?: boolean, threads: number, temperature: number, minP: number, @@ -114,6 +115,12 @@ export const InfillCommand: CommandModule = { type: "number", description: "Batch size to use for the model context. 
The default value is the context size" }) + .option("flashAttention", { + alias: "fa", + type: "boolean", + default: false, + description: "Enable flash attention" + }) .option("threads", { type: "number", default: 6, @@ -204,14 +211,14 @@ export const InfillCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - threads, temperature, minP, topK, + flashAttention, threads, temperature, minP, topK, topP, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings }) { try { await RunInfill({ - modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, + modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings @@ -226,7 +233,7 @@ export const InfillCommand: CommandModule = { async function RunInfill({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, + modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings @@ -300,6 +307,7 @@ async function RunInfill({ : contextSize != null ? {fitContext: {contextSize}} : undefined, + defaultContextFlashAttention: flashAttention, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index d9920726..37df63d7 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -28,6 +28,7 @@ type InspectMeasureCommand = { maxLayers?: number, minContextSize: number, maxContextSize?: number, + flashAttention?: boolean, measures: number, printHeaderBeforeEachLayer?: boolean, evaluateText?: string, @@ -93,6 +94,12 @@ export const InspectMeasureCommand: CommandModule defaultDescription: "Train context size", description: "Maximum context size" }) + .option("flashAttention", { + alias: "fa", + type: "boolean", + default: false, + description: "Enable flash attention for the context" + }) .option("measures", { alias: "n", type: "number", @@ -118,7 +125,7 @@ export const InspectMeasureCommand: CommandModule }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, measures = 10, + modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; @@ -175,6 +182,7 @@ export const InspectMeasureCommand: CommandModule initialMaxContextSize: previousContextSizeCheck, maxContextSize, minContextSize, + flashAttention, tests: measures, evaluateText: evaluateText == null ? 
undefined @@ -233,7 +241,8 @@ export const InspectMeasureCommand: CommandModule ? undefined : ggufInsights.estimateContextResourceRequirements({ contextSize: previousContextSizeCheck, - modelGpuLayers: lastGpuLayers + modelGpuLayers: lastGpuLayers, + flashAttention }).gpuVram; const contextVramEstimationDiffBytes = (result.contextVramUsage == null || contextVramEstimation == null) ? undefined @@ -365,7 +374,8 @@ const detectedFileName = path.basename(__filename); const expectedFileName = "InspectMeasureCommand"; async function measureModel({ - modelPath, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, evaluateText, onInfo + modelPath, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, evaluateText, + onInfo }: { modelPath: string, gpu?: BuildGpu | "auto", @@ -375,6 +385,7 @@ async function measureModel({ minContextSize?: number, maxGpuLayers: number, minGpuLayers?: number, + flashAttention?: boolean, evaluateText?: string, onInfo(data: { gpuLayers: number, @@ -475,6 +486,7 @@ async function measureModel({ minContextSize, maxGpuLayers, minGpuLayers, + flashAttention, evaluateText } satisfies ParentToChildMessage); @@ -567,9 +579,11 @@ async function runTestWorkerLogic() { process.send(info); } - async function testContextSizes({model, modelVramUsage, startContextSize, maxContextSize, minContextSize, tests, evaluateText}: { + async function testContextSizes({ + model, modelVramUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText + }: { model: LlamaModel, modelVramUsage: number, startContextSize?: number, maxContextSize?: number, minContextSize?: number, - tests: number, evaluateText?: string + tests: number, flashAttention?: boolean, evaluateText?: string }) { const contextSizeCheckPlan = getContextSizesCheckPlan( maxContextSize != null @@ -591,7 +605,8 @@ async function runTestWorkerLogic() { const preContextVramUsage = (await llama.getVramState()).used; const context = await model.createContext({ contextSize: currentContextSizeCheck ?? 
undefined, - ignoreMemorySafetyChecks: currentContextSizeCheck != null + ignoreMemorySafetyChecks: currentContextSizeCheck != null, + flashAttention }); if (evaluateText != null && evaluateText != "") { @@ -633,15 +648,18 @@ async function runTestWorkerLogic() { } } - async function testWithGpuLayers({modelPath, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, evaluateText}: { + async function testWithGpuLayers({ + modelPath, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText + }: { modelPath: string, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, minContextSize?: number, - evaluateText?: string + flashAttention?: boolean, evaluateText?: string }) { try { const preModelVramUsage = (await llama.getVramState()).used; const model = await llama.loadModel({ modelPath, gpuLayers, + defaultContextFlashAttention: flashAttention, ignoreMemorySafetyChecks: true }); const postModelVramUsage = (await llama.getVramState()).used; @@ -659,6 +677,7 @@ async function runTestWorkerLogic() { startContextSize, maxContextSize, minContextSize, + flashAttention, tests, evaluateText }); @@ -685,6 +704,7 @@ async function runTestWorkerLogic() { : undefined, maxContextSize: message.maxContextSize, minContextSize: message.minContextSize, + flashAttention: message.flashAttention, evaluateText: message.evaluateText }); } @@ -766,6 +786,7 @@ type ParentToChildMessage = { tests: number, maxGpuLayers: number, minGpuLayers?: number, + flashAttention?: boolean, initialMaxContextSize?: number, maxContextSize?: number, minContextSize?: number, diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts index 2d60250c..9421b29c 100644 --- a/src/cli/utils/printCommonInfoLines.ts +++ b/src/cli/utils/printCommonInfoLines.ts @@ -85,6 +85,10 @@ export async function printCommonInfoLines({ show: logBatchSize, title: "Batch size", value: bytes(context.batchSize) + }, { + show: context.flashAttention, + title: "Flash attention", + value: "enabled" }, { show: tokenMeterEnabled, title: "Token meter", diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index ccb8bab8..76cd8c7c 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -1,7 +1,7 @@ -import {DisposeAggregator, EventRelay, withLock, DisposedError, AsyncDisposeAggregator} from "lifecycle-utils"; +import {AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock} from "lifecycle-utils"; import {removeNullFields} from "../../utils/removeNullFields.js"; import {Token} from "../../types.js"; -import {BatchLogitIndex, AddonContext} from "../../bindings/AddonTypes.js"; +import {AddonContext, BatchLogitIndex} from "../../bindings/AddonTypes.js"; import {LlamaGrammarEvaluationState} from "../LlamaGrammarEvaluationState.js"; import {compareTokens} from "../../utils/compareTokens.js"; import {DisposalPreventionHandle, DisposeGuard} from "../../utils/DisposeGuard.js"; @@ -25,6 +25,7 @@ export class LlamaContext { /** @internal */ private readonly _model: LlamaModel; /** @internal */ private readonly _contextSize: number; /** @internal */ private readonly _batchSize: number; + /** @internal */ private readonly _flashAttention: boolean; /** @internal */ private readonly _totalSequences: number; /** @internal */ private readonly _unusedSequenceIds: number[] = []; /** @internal */ private readonly _batchingOptions: Required; @@ -50,6 +51,7 
@@ export class LlamaContext { seed = null, contextSize, batchSize, + flashAttention = _model.defaultContextFlashAttention, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", @@ -60,7 +62,8 @@ export class LlamaContext { }: LlamaContextOptions & { sequences: number, contextSize: number, - batchSize: number + batchSize: number, + flashAttention: boolean }) { if (_model.disposed) throw new DisposedError(); @@ -72,11 +75,13 @@ export class LlamaContext { this._totalSequences = Math.max(1, Math.floor(sequences)); this._contextSize = Math.max(2, contextSize); this._batchSize = Math.max(batchSize, this._totalSequences); + this._flashAttention = flashAttention; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined, contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells batchSize: this._batchSize, sequences: this._totalSequences, + flashAttention: this._flashAttention, threads: Math.max(0, Math.floor(threads)), embeddings: _embeddings, noSeed: _noSeed @@ -136,6 +141,10 @@ export class LlamaContext { return this._batchSize; } + public get flashAttention(): boolean { + return this._flashAttention; + } + /** * The actual size of the state in the memory in bytes. * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context. @@ -541,11 +550,15 @@ export class LlamaContext { _model: LlamaModel }): Promise { const sequences = options.sequences ?? getDefaultContextSequences(); + const flashAttention = _model.flashAttentionSupported + ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) + : false; const contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, { batchSize: options.batchSize, sequences: sequences, modelGpuLayers: _model.gpuLayers, modelTrainContextSize: _model.trainContextSize, + flashAttention, getVramState: () => _model._llama._vramOrchestrator.getMemoryState(), llamaGpu: _model._llama.gpu, ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks, @@ -557,10 +570,11 @@ export class LlamaContext { sequences, isEmbeddingContext: options._embeddings, modelGpuLayers: _model.gpuLayers, - batchSize + batchSize, + flashAttention }).gpuVram; - const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences}); + const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention}); const {createSignal} = options; const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks ? null diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 4e01516e..c5a30cf4 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -40,6 +40,21 @@ export type LlamaContextOptions = { */ batchSize?: number, + /** + * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. + * + * The support for flash attention is currently experimental and may not always work as expected. + * Use with caution. + * + * This option will be ignored if flash attention is not supported by the model. + * + * Defaults to `false` (inherited from the model option `defaultContextFlashAttention`). 
+ * + * Upon flash attention exiting the experimental status, the default value will become `true` + * (the inherited value from the model option `defaultContextFlashAttention` will become `true`). + */ + flashAttention?: boolean, + /** * number of threads to use to evaluate tokens. * set to 0 to use the maximum threads supported by the current machine hardware. diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 41a5d3b5..2973187c 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -15,6 +15,7 @@ import {getReadablePath} from "../../cli/utils/getReadablePath.js"; import {LlamaContextOptions} from "../LlamaContext/types.js"; import {LlamaContext} from "../LlamaContext/LlamaContext.js"; import {LlamaEmbeddingContext, LlamaEmbeddingContextOptions} from "../LlamaEmbeddingContext.js"; +import {GgufArchitectureType} from "../../gguf/types/GgufMetadataTypes.js"; import {TokenAttribute, TokenAttributes} from "./utils/TokenAttributes.js"; import type {Llama} from "../../bindings/Llama.js"; import type {BuiltinSpecialTokenValue} from "../../utils/LlamaText.js"; @@ -106,6 +107,27 @@ export type LlamaModelOptions = { onLoadProgress?(loadProgress: number): void }, + /** + * Enable flash attention by default for contexts created with this model. + * Only works with models that support flash attention. + * + * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. + * + * The support for flash attention is currently experimental and may not always work as expected. + * Use with caution. + * + * This option will be ignored if flash attention is not supported by the model. + * + * Enabling this affects the calculations of default values for the model and contexts created with it + * as flash attention reduces the amount of memory required, + * which allows for more layers to be offloaded to the GPU and for context sizes to be bigger. + * + * Defaults to `false`. + * + * Upon flash attention exiting the experimental status, the default value will become `true`. + */ + defaultContextFlashAttention?: boolean, + /** * Called with the load percentage when the model is being loaded. 
* > **Note:** This progress does not include the progress of loading the provided LoRA adapters (when `lora` is used) @@ -128,6 +150,7 @@ export type LlamaModelOptions = { const defaultLoraThreads = 6; const defaultLoraScale = 1; const defaultUseMmap = true; +const defaultContextFlashAttentionEnabled = false; export class LlamaModel { /** @internal */ public readonly _llama: Llama; @@ -142,6 +165,9 @@ export class LlamaModel { /** @internal */ private readonly _disposedState: DisposedState = {disposed: false}; /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); /** @internal */ private readonly _llamaPreventDisposalHandle: DisposalPreventionHandle; + /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; + /** @internal */ private readonly _defaultContextFlashAttention: boolean; + /** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private _typeDescription?: ModelTypeDescription; /** @internal */ private _trainContextSize?: number; /** @internal */ private _embeddingVectorSize?: number; @@ -157,11 +183,17 @@ export class LlamaModel { }, { _llama, _fileInfo, - _fileInsights + _fileInsights, + _defaultContextFlashAttentionOptionEnabled, + _defaultContextFlashAttention, + _flashAttentionSupported }: { _llama: Llama, _fileInfo: GgufFileInfo, - _fileInsights: GgufInsights + _fileInsights: GgufInsights, + _defaultContextFlashAttentionOptionEnabled: boolean, + _defaultContextFlashAttention: boolean, + _flashAttentionSupported: boolean }) { this._llama = _llama; this._fileInfo = _fileInfo; @@ -170,6 +202,9 @@ export class LlamaModel { this._gpuLayers = gpuLayers; this._backendModelDisposeGuard = new DisposeGuard([this._llama._backendDisposeGuard]); this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle(); + this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled; + this._defaultContextFlashAttention = _defaultContextFlashAttention; + this._flashAttentionSupported = _flashAttentionSupported; this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({ addonExports: this._llama._bindings, gpuLayers, @@ -270,6 +305,14 @@ export class LlamaModel { return this._model.getModelSize(); } + public get flashAttentionSupported() { + return this._flashAttentionSupported; + } + + public get defaultContextFlashAttention() { + return this._defaultContextFlashAttention; + } + /** * Transform text into tokens that can be fed to the model * @param text - the text to tokenize @@ -485,6 +528,26 @@ export class LlamaModel { // do nothing } + try { + if (this._defaultContextFlashAttentionOptionEnabled && !this._flashAttentionSupported) { + if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.grok) + warnings.push("Flash attention is incompatible with Grok and thus was turned off"); + else if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) + warnings.push("Flash attention is incompatible with Gemma2 and thus was turned off"); + else { + const nHead = this.fileInfo.architectureMetadata?.attention?.head_count ?? 0; + const nEmbd = this.fileInfo.architectureMetadata?.embedding_length ?? 0; + const nEmbdHeadK = this.fileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead)); + const nEmbdHeadV = this.fileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 
0 : nEmbd / nHead); + + if (nEmbdHeadK !== nEmbdHeadV) + warnings.push("Flash attention is incompatible with this model and thus was turned off"); + } + } + } catch (err) { + // do nothing + } + return warnings; } @@ -562,7 +625,7 @@ export class LlamaModel { }: { _llama: Llama }) { - const {loadSignal} = modelOptions; + const {loadSignal, defaultContextFlashAttention} = modelOptions; let useMmap = modelOptions.useMmap ?? defaultUseMmap; const loraOptions: LlamaModelOptions["lora"] = typeof modelOptions.lora === "string" ? {adapters: [{loraFilePath: modelOptions.lora}]} @@ -576,12 +639,24 @@ export class LlamaModel { signal: loadSignal }); const ggufInsights = await GgufInsights.from(fileInfo, _llama); + const flashAttentionSupported = ggufInsights.flashAttentionSupported; + const resolvedDefaultContextFlashAttention = flashAttentionSupported + ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) + : false; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { - ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks + ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, + defaultContextFlashAttention: resolvedDefaultContextFlashAttention }); const vramRequiredEstimate = ggufInsights.estimateModelResourceRequirements({gpuLayers: gpuLayers}).gpuVram; - const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, {_fileInfo: fileInfo, _fileInsights: ggufInsights, _llama}); + const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, { + _fileInfo: fileInfo, + _fileInsights: ggufInsights, + _llama, + _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false, + _flashAttentionSupported: flashAttentionSupported, + _defaultContextFlashAttention: resolvedDefaultContextFlashAttention + }); const modelCreationMemoryReservation = modelOptions.ignoreMemorySafetyChecks ? null : _llama._vramOrchestrator.reserveMemory(vramRequiredEstimate); diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 51240709..0c7c0075 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -84,6 +84,26 @@ export class GgufInsights { return this._modelSize; } + public get flashAttentionSupported() { + // source: `llama_new_context_with_model` in `llama.cpp` + + if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.grok) + return false; + else if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) + return false; + else { + const nHead = this._ggufFileInfo.architectureMetadata?.attention?.head_count ?? 0; + const nEmbd = this._ggufFileInfo.architectureMetadata?.embedding_length ?? 0; + const nEmbdHeadK = this._ggufFileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead)); + const nEmbdHeadV = this._ggufFileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead); + + if (nEmbdHeadK !== nEmbdHeadV) + return false; + } + + return true; + } + public estimateModelResourceRequirements({gpuLayers}: {gpuLayers: number}): GgufInsightsResourceRequirements { const {cpu, gpu} = this._getTensorResourceSplit(gpuLayers); @@ -99,10 +119,10 @@ export class GgufInsights { * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now. 
*/ public estimateContextResourceRequirements({ - contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false }: { contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, - includeGraphOverhead?: boolean + flashAttention?: boolean, includeGraphOverhead?: boolean }): GgufInsightsResourceRequirements { if (sequences == null) sequences = getDefaultContextSequences(); if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); @@ -261,7 +281,7 @@ export class GgufInsights { return (totalElements * 77.655 * (actualContextSize / 4096)) + defaultCalculationAdjustment; }; - const graphOverheadMemory = !includeGraphOverhead + const graphOverheadMemory = (flashAttention || !includeGraphOverhead) ? 0 : estimateGraphOverheadMemory(); diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index 09fb56a0..a0f35777 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -35,10 +35,12 @@ export class GgufInsightsConfigurationResolver { */ public async scoreModelConfigurationCompatibility({ contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), - embeddingContext = false + embeddingContext = false, + flashAttention = false }: { contextSize?: number, - embeddingContext?: boolean + embeddingContext?: boolean, + flashAttention?: boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => ({total: os.totalmem(), free: os.freemem()})), @@ -117,7 +119,8 @@ export class GgufInsightsConfigurationResolver { const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({ contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, - modelGpuLayers: resolvedGpuLayers + modelGpuLayers: resolvedGpuLayers, + flashAttention }); const rankPoints = { @@ -184,10 +187,11 @@ export class GgufInsightsConfigurationResolver { ignoreMemorySafetyChecks = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, - llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading + llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, + defaultContextFlashAttention = false }: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, - llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean + llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean } = {}) { return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, @@ -195,7 +199,8 @@ export class GgufInsightsConfigurationResolver { getVramState, llamaVramPaddingSize, llamaGpu, - llamaSupportsGpuOffloading + llamaSupportsGpuOffloading, + defaultContextFlashAttention }); } @@ -203,6 +208,7 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers, batchSize, modelTrainContextSize, + flashAttention = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), llamaGpu = 
this._ggufInsights._llama.gpu, ignoreMemorySafetyChecks = false, @@ -211,6 +217,7 @@ export class GgufInsightsConfigurationResolver { }: { modelGpuLayers: number, modelTrainContextSize: number, + flashAttention?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, getVramState?(): Promise<{total: number, free: number}>, @@ -225,6 +232,7 @@ export class GgufInsightsConfigurationResolver { modelFileInsights: this._ggufInsights, modelGpuLayers, modelTrainContextSize, + flashAttention, getVramState, llamaGpu, ignoreMemorySafetyChecks, diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index 5bf200c6..c4bb5fcf 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -5,7 +5,7 @@ import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, getVramState, llamaGpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false }: { contextSize?: LlamaContextOptions["contextSize"], @@ -14,6 +14,7 @@ export async function resolveContextContextSizeOption({ modelFileInsights: GgufInsights, modelGpuLayers: number, modelTrainContextSize: number, + flashAttention: boolean, getVramState(): Promise<{total: number, free: number}>, llamaGpu: BuildGpu, ignoreMemorySafetyChecks?: boolean, @@ -34,6 +35,7 @@ export async function resolveContextContextSizeOption({ batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, + flashAttention, isEmbeddingContext }).gpuVram; @@ -74,6 +76,7 @@ export async function resolveContextContextSizeOption({ batchSize: batchSize ?? 
getDefaultContextBatchSize({contextSize: testContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, + flashAttention, isEmbeddingContext }).gpuVram; diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 7a3a70a0..d9dc4369 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -11,11 +11,11 @@ const fitContextExtraMemoryPaddingPercentage = 0.5; export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention }: { ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean }): Promise { if (gpuLayers == null) gpuLayers = "auto"; @@ -35,7 +35,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" const maxLayersRequirements = getVramRequiredForGpuLayers({ gpuLayers: resolvedGpuLayers, ggufInsights, - currentVram: vramState.free + currentVram: vramState.free, + defaultContextFlashAttention }); if (maxLayersRequirements == null) @@ -69,7 +70,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" : undefined, maxGpuLayers: typeof gpuLayers === "object" ? gpuLayers.max - : undefined + : undefined, + defaultContextFlashAttention }); const hasGpuLayersRequirements = typeof gpuLayers === "object" && @@ -89,13 +91,15 @@ function getBestGpuLayersForFreeVram({ freeVram, fitContext, minGpuLayers, - maxGpuLayers + maxGpuLayers, + defaultContextFlashAttention }: { ggufInsights: GgufInsights, freeVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, minGpuLayers?: number, - maxGpuLayers?: number + maxGpuLayers?: number, + defaultContextFlashAttention: boolean }) { return findBestOption({ *generator() { @@ -113,7 +117,8 @@ function getBestGpuLayersForFreeVram({ gpuLayers: option.gpuLayers, ggufInsights, currentVram: freeVram, - fitContext + fitContext, + defaultContextFlashAttention }); if (layersRequirements == null) @@ -172,9 +177,10 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer } function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false }: { - gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean} + gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, + defaultContextFlashAttention: boolean }) { const modelVram = ggufInsights.estimateModelResourceRequirements({gpuLayers}).gpuVram; @@ -187,7 +193,8 @@ function getVramRequiredForGpuLayers({ batchSize: getDefaultContextBatchSize({contextSize: fitContext.contextSize, sequences: 1}), modelGpuLayers: gpuLayers, sequences: 1, - isEmbeddingContext: fitContext.embeddingContext ?? false + isEmbeddingContext: fitContext.embeddingContext ?? 
false, + flashAttention: defaultContextFlashAttention }).gpuVram; const totalVram = modelVram + contextVram; @@ -205,7 +212,8 @@ function getVramRequiredForGpuLayers({ gpuLayers, ggufInsights, vram: currentVram - modelVram, - isEmbeddingContext: fitContext?.embeddingContext ?? false + isEmbeddingContext: fitContext?.embeddingContext ?? false, + flashAttention: defaultContextFlashAttention }); if (maxContext == null || modelVram + maxContext.vram > currentVram) @@ -218,8 +226,8 @@ function getVramRequiredForGpuLayers({ }; } -function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext}: { - gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean +function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention}: { + gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -229,7 +237,8 @@ function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmb batchSize: getDefaultContextBatchSize({contextSize, sequences: 1}), modelGpuLayers: gpuLayers, sequences: 1, - isEmbeddingContext + isEmbeddingContext, + flashAttention }).gpuVram; if (contextVram <= vram) diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index fbeacf20..46208120 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -1,31 +1,44 @@ export const enum GgufArchitectureType { llama = "llama", falcon = "falcon", + grok = "grok", gpt2 = "gpt2", gptj = "gptj", gptneox = "gptneox", mpt = "mpt", baichuan = "baichuan", starcoder = "starcoder", - persimmon = "persimmon", refact = "refact", bert = "bert", nomicBert = "nomic-bert", + jinaBertV2 = "jina-bert-v2", bloom = "bloom", stablelm = "stablelm", qwen = "qwen", qwen2 = "qwen2", + qwen2moe = "qwen2moe", phi2 = "phi2", + phi3 = "phi3", plamo = "plamo", codeshell = "codeshell", orion = "orion", internlm2 = "internlm2", minicpm = "minicpm", gemma = "gemma", + gemma2 = "gemma2", starcoder2 = "starcoder2", mamba = "mamba", + xverse = "xverse", commandR = "command-r", - rwkv = "rwkv" + dbrx = "dbrx", + olmo = "olmo", + openelm = "openelm", + arctic = "arctic", + deepseek2 = "deepseek2", + bitnet = "bitnet", + t5 = "t5", + jais = "jais", + unknown = "(unknown)" } export type GgufMetadata = { @@ -53,8 +66,7 @@ export type GgufMetadataLlmToType = { [GgufArchitectureType.gpt2]: GgufMetadataGPT2, [GgufArchitectureType.bloom]: GgufMetadataBloom, [GgufArchitectureType.falcon]: GgufMetadataFalcon, - [GgufArchitectureType.mamba]: GgufMetadataMamba, - [GgufArchitectureType.rwkv]: GgufMetadataRWKV + [GgufArchitectureType.mamba]: GgufMetadataMamba }; // source: `enum llama_ftype` in `llama.h` in the `llama.cpp` source code @@ -415,15 +427,6 @@ export type GgufMetadataMamba = { } }; -// source: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#rwkv -export type GgufMetadataRWKV = { - readonly architecture_version: 4 | number, - readonly context_length: number, - readonly block_count: number, - readonly embedding_length: number, - readonly feed_forward_length: number -}; - export function isGgufMetadataOfArchitectureType( metadata: GgufMetadata, type: A ): metadata is GgufMetadata { diff --git a/src/index.ts b/src/index.ts index ac90bc3d..1a5de8e8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,7 +2,9 @@ import 
{DisposedError} from "lifecycle-utils"; import {Llama} from "./bindings/Llama.js"; import {getLlama, type LlamaOptions, type LastBuildOptions} from "./bindings/getLlama.js"; import {NoBinaryFoundError} from "./bindings/utils/NoBinaryFoundError.js"; -import {LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, LlamaVocabularyType} from "./bindings/types.js"; +import { + type LlamaGpuType, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, LlamaVocabularyType +} from "./bindings/types.js"; import {LlamaModel, LlamaModelInfillTokens, type LlamaModelOptions, LlamaModelTokens} from "./evaluator/LlamaModel/LlamaModel.js"; import {TokenAttributes} from "./evaluator/LlamaModel/utils/TokenAttributes.js"; import {LlamaGrammar, type LlamaGrammarOptions} from "./evaluator/LlamaGrammar.js"; @@ -84,7 +86,7 @@ import { type GgufMetadata, type GgufMetadataLlmToType, GgufArchitectureType, GgufFileType, GgufMetadataTokenizerTokenType, GgufMetadataArchitecturePoolingType, type GgufMetadataGeneral, type GgufMetadataTokenizer, type GgufMetadataDefaultArchitectureType, type GgufMetadataLlmLLaMA, type GgufMetadataMPT, type GgufMetadataGPTNeoX, type GgufMetadataGPTJ, type GgufMetadataGPT2, - type GgufMetadataBloom, type GgufMetadataFalcon, type GgufMetadataMamba, type GgufMetadataRWKV, isGgufMetadataOfArchitectureType + type GgufMetadataBloom, type GgufMetadataFalcon, type GgufMetadataMamba, isGgufMetadataOfArchitectureType } from "./gguf/types/GgufMetadataTypes.js"; import {GgmlType, type GgufTensorInfo} from "./gguf/types/GgufTensorInfoTypes.js"; @@ -94,6 +96,7 @@ export { getLlama, type LlamaOptions, type LastBuildOptions, + type LlamaGpuType, LlamaLogLevel, NoBinaryFoundError, LlamaModel, @@ -238,7 +241,6 @@ export { type GgufMetadataBloom, type GgufMetadataFalcon, type GgufMetadataMamba, - type GgufMetadataRWKV, GgmlType, isGgufMetadataOfArchitectureType, GgufInsights,
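Usage sketch for the flash attention options introduced in this patch, assuming the public `getLlama`/`loadModel`/`createContext` API re-exported from the package index; the model path is a placeholder:

```ts
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();

const model = await llama.loadModel({
    modelPath: "path/to/model.gguf", // placeholder path
    // Opt contexts created from this model into flash attention by default.
    // Ignored when the model architecture doesn't support it.
    defaultContextFlashAttention: true
});

console.log("Flash attention supported:", model.flashAttentionSupported);

const context = await model.createContext({
    // Can also be set per context; when omitted it falls back to the model's default.
    flashAttention: true
});
console.log("Flash attention enabled:", context.flashAttention);
```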
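A brief sketch of the extended `gpu` option of `getLlama` added here, which now also accepts an object form that auto-detects a backend while excluding specific GPU types; excluding Vulkan is just an illustrative choice:

```ts
import {getLlama} from "node-llama-cpp";

// Auto-detect the best available compute layer, but never try the Vulkan backend.
const llama = await getLlama({
    gpu: {
        type: "auto",
        exclude: ["vulkan"]
    }
});

// With Vulkan excluded, this resolves to "metal", "cuda", or false (CPU only).
console.log("Using GPU type:", llama.gpu);
```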
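And a sketch of how the new `flashAttention` flag feeds into the `GgufInsights` VRAM estimation (per the change to `estimateContextResourceRequirements`, enabling it drops the graph overhead term); `model.fileInsights` and `model.gpuLayers` are assumed to be publicly accessible, as they are used that way inside this patch:

```ts
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const insights = model.fileInsights;

if (insights.flashAttentionSupported) {
    // Compare the estimated context VRAM with and without flash attention;
    // with it enabled, the graph overhead estimate is skipped entirely.
    const withFa = insights.estimateContextResourceRequirements({
        contextSize: 4096,
        modelGpuLayers: model.gpuLayers,
        flashAttention: true
    });
    const withoutFa = insights.estimateContextResourceRequirements({
        contextSize: 4096,
        modelGpuLayers: model.gpuLayers,
        flashAttention: false
    });
    console.log("Estimated context VRAM:", withFa.gpuVram, "vs", withoutFa.gpuVram);
}
```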