diff --git a/llama/addon.cpp b/llama/addon.cpp index 117621dd..03bd3ece 100644 --- a/llama/addon.cpp +++ b/llama/addon.cpp @@ -987,6 +987,10 @@ class AddonContext : public Napi::ObjectWrap { context_params.embeddings = options.Get("embeddings").As().Value(); } + if (options.Has("flashAttention")) { + context_params.flash_attn = options.Get("flashAttention").As().Value(); + } + if (options.Has("threads")) { const auto n_threads = options.Get("threads").As().Uint32Value(); const auto resolved_n_threads = n_threads == 0 ? std::thread::hardware_concurrency() : n_threads; diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index fc5de2fb..b71fb3b9 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -20,6 +20,7 @@ export type BindingModule = { contextSize?: number, batchSize?: number, sequences?: number, + flashAttention?: boolean, logitsAll?: boolean, embeddings?: boolean, threads?: number diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index e165a463..bb70d754 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -7,7 +7,7 @@ import {GbnfJsonSchema} from "../utils/gbnfJson/types.js"; import {LlamaJsonSchemaGrammar} from "../evaluator/LlamaJsonSchemaGrammar.js"; import {LlamaGrammar, LlamaGrammarOptions} from "../evaluator/LlamaGrammar.js"; import {BindingModule} from "./AddonTypes.js"; -import {BuildGpu, BuildMetadataFile, LlamaLocks, LlamaLogLevel} from "./types.js"; +import {BuildGpu, BuildMetadataFile, LlamaGpuType, LlamaLocks, LlamaLogLevel} from "./types.js"; import {MemoryOrchestrator, MemoryReservation} from "./utils/MemoryOrchestrator.js"; const LlamaLogLevelToAddonLogLevel: ReadonlyMap = new Map([ @@ -31,7 +31,7 @@ export class Llama { /** @internal */ public readonly _vramOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _vramPadding: MemoryReservation; /** @internal */ public readonly _debug: boolean; - /** @internal */ private readonly _gpu: BuildGpu; + /** @internal */ private readonly _gpu: LlamaGpuType; /** @internal */ private readonly _buildType: "localBuild" | "prebuilt"; /** @internal */ private readonly _cmakeOptions: Readonly>; /** @internal */ private readonly _supportsGpuOffloading: boolean; @@ -244,7 +244,10 @@ export class Llama { await this._bindings.init(); } - /** @internal */ + /** + * Log messages related to the Llama instance + * @internal + */ public _log(level: LlamaLogLevel, message: string) { this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? defaultLogLevel, message + "\n"); } diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index 29aa8cad..3938d638 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -16,7 +16,7 @@ import { } from "./utils/compileLLamaCpp.js"; import {getLastBuildInfo} from "./utils/lastBuildInfo.js"; import {getClonedLlamaCppRepoReleaseInfo, isLlamaCppRepoCloned} from "./utils/cloneLlamaCppRepo.js"; -import {BuildGpu, BuildMetadataFile, BuildOptions, LlamaLogLevel} from "./types.js"; +import {BuildGpu, BuildMetadataFile, BuildOptions, LlamaGpuType, LlamaLogLevel} from "./types.js"; import {BinaryPlatform, getPlatform} from "./utils/getPlatform.js"; import {getBuildFolderNameForBuildOptions} from "./utils/getBuildFolderNameForBuildOptions.js"; import {resolveCustomCmakeOptions} from "./utils/resolveCustomCmakeOptions.js"; @@ -46,7 +46,10 @@ export type LlamaOptions = { * * `"auto"` by default. 
*/ - gpu?: "auto" | "metal" | "cuda" | "vulkan" | false, + gpu?: "auto" | LlamaGpuType | { + type: "auto", + exclude?: LlamaGpuType[] + }, /** * Set the minimum log level for llama.cpp. @@ -298,6 +301,9 @@ export async function getLlamaForOptions({ } } + if (buildGpusToTry.length === 0) + throw new Error("No GPU types available to try building with"); + if (build === "auto" || build === "never") { for (let i = 0; i < buildGpusToTry.length; i++) { const gpu = buildGpusToTry[i]; diff --git a/src/bindings/types.ts b/src/bindings/types.ts index 204b00a5..691e1005 100644 --- a/src/bindings/types.ts +++ b/src/bindings/types.ts @@ -3,6 +3,7 @@ import {BinaryPlatform} from "./utils/getPlatform.js"; import {BinaryPlatformInfo} from "./utils/getPlatformInfo.js"; export const buildGpuOptions = ["metal", "cuda", "vulkan", false] as const; +export type LlamaGpuType = "metal" | "cuda" | "vulkan" | false; export const nodeLlamaCppGpuOptions = [ "auto", ...buildGpuOptions diff --git a/src/bindings/utils/getGpuTypesToUseForOption.ts b/src/bindings/utils/getGpuTypesToUseForOption.ts index d5a1b9cd..f171525d 100644 --- a/src/bindings/utils/getGpuTypesToUseForOption.ts +++ b/src/bindings/utils/getGpuTypesToUseForOption.ts @@ -1,28 +1,41 @@ import process from "process"; import {BuildGpu, buildGpuOptions} from "../types.js"; +import {LlamaOptions} from "../getLlama.js"; import {BinaryPlatform, getPlatform} from "./getPlatform.js"; import {getBestComputeLayersAvailable} from "./getBestComputeLayersAvailable.js"; -export async function getGpuTypesToUseForOption(gpu: BuildGpu | "auto", { +export async function getGpuTypesToUseForOption(gpu: Required["gpu"], { platform = getPlatform(), arch = process.arch }: { platform?: BinaryPlatform, arch?: typeof process.arch } = {}): Promise { - const resolvedGpu = resolveValidGpuOptionForPlatform(gpu, { + const resolvedGpuOption = typeof gpu === "object" + ? gpu.type + : gpu; + + function withExcludedGpuTypesRemoved(gpuTypes: BuildGpu[]) { + const resolvedExcludeTypes = typeof gpu === "object" + ? new Set(gpu.exclude ?? []) + : new Set(); + + return gpuTypes.filter(gpuType => !resolvedExcludeTypes.has(gpuType)); + } + + const resolvedGpu = resolveValidGpuOptionForPlatform(resolvedGpuOption, { platform, arch }); if (resolvedGpu === "auto") { if (arch === process.arch) - return await getBestComputeLayersAvailable(); + return withExcludedGpuTypesRemoved(await getBestComputeLayersAvailable()); - return [false]; + return withExcludedGpuTypesRemoved([false]); } - return [resolvedGpu]; + return withExcludedGpuTypesRemoved([resolvedGpu]); } export function resolveValidGpuOptionForPlatform(gpu: BuildGpu | "auto", { diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 684a62d7..c9c9d85b 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -41,6 +41,7 @@ type ChatCommand = { noJinja?: boolean, contextSize?: number, batchSize?: number, + flashAttention?: boolean, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], jsonSchemaGrammarFile?: string, @@ -149,6 +150,12 @@ export const ChatCommand: CommandModule = { type: "number", description: "Batch size to use for the model context. 
The default value is the context size" }) + .option("flashAttention", { + alias: "fa", + type: "boolean", + default: false, + description: "Enable flash attention" + }) .option("noTrimWhitespace", { type: "boolean", alias: ["noTrim"], @@ -269,7 +276,7 @@ export const ChatCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, - promptFile, wrapper, noJinja, contextSize, batchSize, + promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, @@ -278,9 +285,9 @@ export const ChatCommand: CommandModule = { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, gpuLayers, - lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, - noHistory, environmentFunctions, debug, meter, printTimings + batchSize, flashAttention, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, + gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, + maxTokens, noHistory, environmentFunctions, debug, meter, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -293,9 +300,9 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, - minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, - repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, debug, meter, printTimings + contextSize, batchSize, flashAttention, noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, + threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, + repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, noHistory, environmentFunctions, debug, meter, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; @@ -360,6 +367,7 @@ async function RunChat({ : contextSize != null ? 
{fitContext: {contextSize}} : undefined, + defaultContextFlashAttention: flashAttention, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index b642d7eb..78f434ca 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -28,6 +28,7 @@ type CompleteCommand = { textFile?: string, contextSize?: number, batchSize?: number, + flashAttention?: boolean, threads: number, temperature: number, minP: number, @@ -104,6 +105,12 @@ export const CompleteCommand: CommandModule = { type: "number", description: "Batch size to use for the model context. The default value is the context size" }) + .option("flashAttention", { + alias: "fa", + type: "boolean", + default: false, + description: "Enable flash attention" + }) .option("threads", { type: "number", default: 6, @@ -194,14 +201,14 @@ export const CompleteCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - threads, temperature, minP, topK, + flashAttention, threads, temperature, minP, topK, topP, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings }) { try { await RunCompletion({ - modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, + modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings @@ -216,7 +223,7 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, + modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings @@ -276,6 +283,7 @@ async function RunCompletion({ : contextSize != null ? {fitContext: {contextSize}} : undefined, + defaultContextFlashAttention: flashAttention, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 84485fb3..c350556f 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -30,6 +30,7 @@ type InfillCommand = { suffixFile?: string, contextSize?: number, batchSize?: number, + flashAttention?: boolean, threads: number, temperature: number, minP: number, @@ -114,6 +115,12 @@ export const InfillCommand: CommandModule = { type: "number", description: "Batch size to use for the model context. 
The default value is the context size" }) + .option("flashAttention", { + alias: "fa", + type: "boolean", + default: false, + description: "Enable flash attention" + }) .option("threads", { type: "number", default: 6, @@ -204,14 +211,14 @@ export const InfillCommand: CommandModule = { }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - threads, temperature, minP, topK, + flashAttention, threads, temperature, minP, topK, topP, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings }) { try { await RunInfill({ - modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, + modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings @@ -226,7 +233,7 @@ export const InfillCommand: CommandModule = { async function RunInfill({ - modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, + modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, threads, temperature, minP, topK, topP, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, debug, meter, printTimings @@ -300,6 +307,7 @@ async function RunInfill({ : contextSize != null ? {fitContext: {contextSize}} : undefined, + defaultContextFlashAttention: flashAttention, ignoreMemorySafetyChecks: gpuLayers != null, onLoadProgress(loadProgress: number) { progressUpdater.setProgress(loadProgress); diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index d9920726..37df63d7 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -28,6 +28,7 @@ type InspectMeasureCommand = { maxLayers?: number, minContextSize: number, maxContextSize?: number, + flashAttention?: boolean, measures: number, printHeaderBeforeEachLayer?: boolean, evaluateText?: string, @@ -93,6 +94,12 @@ export const InspectMeasureCommand: CommandModule defaultDescription: "Train context size", description: "Maximum context size" }) + .option("flashAttention", { + alias: "fa", + type: "boolean", + default: false, + description: "Enable flash attention for the context" + }) .option("measures", { alias: "n", type: "number", @@ -118,7 +125,7 @@ export const InspectMeasureCommand: CommandModule }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, measures = 10, + modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, measures = 10, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; @@ -175,6 +182,7 @@ export const InspectMeasureCommand: CommandModule initialMaxContextSize: previousContextSizeCheck, maxContextSize, minContextSize, + flashAttention, tests: measures, evaluateText: evaluateText == null ? 
undefined @@ -233,7 +241,8 @@ export const InspectMeasureCommand: CommandModule ? undefined : ggufInsights.estimateContextResourceRequirements({ contextSize: previousContextSizeCheck, - modelGpuLayers: lastGpuLayers + modelGpuLayers: lastGpuLayers, + flashAttention }).gpuVram; const contextVramEstimationDiffBytes = (result.contextVramUsage == null || contextVramEstimation == null) ? undefined @@ -365,7 +374,8 @@ const detectedFileName = path.basename(__filename); const expectedFileName = "InspectMeasureCommand"; async function measureModel({ - modelPath, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, evaluateText, onInfo + modelPath, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, flashAttention, evaluateText, + onInfo }: { modelPath: string, gpu?: BuildGpu | "auto", @@ -375,6 +385,7 @@ async function measureModel({ minContextSize?: number, maxGpuLayers: number, minGpuLayers?: number, + flashAttention?: boolean, evaluateText?: string, onInfo(data: { gpuLayers: number, @@ -475,6 +486,7 @@ async function measureModel({ minContextSize, maxGpuLayers, minGpuLayers, + flashAttention, evaluateText } satisfies ParentToChildMessage); @@ -567,9 +579,11 @@ async function runTestWorkerLogic() { process.send(info); } - async function testContextSizes({model, modelVramUsage, startContextSize, maxContextSize, minContextSize, tests, evaluateText}: { + async function testContextSizes({ + model, modelVramUsage, startContextSize, maxContextSize, minContextSize, tests, flashAttention, evaluateText + }: { model: LlamaModel, modelVramUsage: number, startContextSize?: number, maxContextSize?: number, minContextSize?: number, - tests: number, evaluateText?: string + tests: number, flashAttention?: boolean, evaluateText?: string }) { const contextSizeCheckPlan = getContextSizesCheckPlan( maxContextSize != null @@ -591,7 +605,8 @@ async function runTestWorkerLogic() { const preContextVramUsage = (await llama.getVramState()).used; const context = await model.createContext({ contextSize: currentContextSizeCheck ?? 
undefined, - ignoreMemorySafetyChecks: currentContextSizeCheck != null + ignoreMemorySafetyChecks: currentContextSizeCheck != null, + flashAttention }); if (evaluateText != null && evaluateText != "") { @@ -633,15 +648,18 @@ async function runTestWorkerLogic() { } } - async function testWithGpuLayers({modelPath, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, evaluateText}: { + async function testWithGpuLayers({ + modelPath, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, evaluateText + }: { modelPath: string, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, minContextSize?: number, - evaluateText?: string + flashAttention?: boolean, evaluateText?: string }) { try { const preModelVramUsage = (await llama.getVramState()).used; const model = await llama.loadModel({ modelPath, gpuLayers, + defaultContextFlashAttention: flashAttention, ignoreMemorySafetyChecks: true }); const postModelVramUsage = (await llama.getVramState()).used; @@ -659,6 +677,7 @@ async function runTestWorkerLogic() { startContextSize, maxContextSize, minContextSize, + flashAttention, tests, evaluateText }); @@ -685,6 +704,7 @@ async function runTestWorkerLogic() { : undefined, maxContextSize: message.maxContextSize, minContextSize: message.minContextSize, + flashAttention: message.flashAttention, evaluateText: message.evaluateText }); } @@ -766,6 +786,7 @@ type ParentToChildMessage = { tests: number, maxGpuLayers: number, minGpuLayers?: number, + flashAttention?: boolean, initialMaxContextSize?: number, maxContextSize?: number, minContextSize?: number, diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts index 2d60250c..9421b29c 100644 --- a/src/cli/utils/printCommonInfoLines.ts +++ b/src/cli/utils/printCommonInfoLines.ts @@ -85,6 +85,10 @@ export async function printCommonInfoLines({ show: logBatchSize, title: "Batch size", value: bytes(context.batchSize) + }, { + show: context.flashAttention, + title: "Flash attention", + value: "enabled" }, { show: tokenMeterEnabled, title: "Token meter", diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index ccb8bab8..76cd8c7c 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -1,7 +1,7 @@ -import {DisposeAggregator, EventRelay, withLock, DisposedError, AsyncDisposeAggregator} from "lifecycle-utils"; +import {AsyncDisposeAggregator, DisposeAggregator, DisposedError, EventRelay, withLock} from "lifecycle-utils"; import {removeNullFields} from "../../utils/removeNullFields.js"; import {Token} from "../../types.js"; -import {BatchLogitIndex, AddonContext} from "../../bindings/AddonTypes.js"; +import {AddonContext, BatchLogitIndex} from "../../bindings/AddonTypes.js"; import {LlamaGrammarEvaluationState} from "../LlamaGrammarEvaluationState.js"; import {compareTokens} from "../../utils/compareTokens.js"; import {DisposalPreventionHandle, DisposeGuard} from "../../utils/DisposeGuard.js"; @@ -25,6 +25,7 @@ export class LlamaContext { /** @internal */ private readonly _model: LlamaModel; /** @internal */ private readonly _contextSize: number; /** @internal */ private readonly _batchSize: number; + /** @internal */ private readonly _flashAttention: boolean; /** @internal */ private readonly _totalSequences: number; /** @internal */ private readonly _unusedSequenceIds: number[] = []; /** @internal */ private readonly _batchingOptions: Required; @@ -50,6 +51,7 
@@ export class LlamaContext { seed = null, contextSize, batchSize, + flashAttention = _model.defaultContextFlashAttention, threads = 6, batching: { dispatchSchedule: batchingDispatchSchedule = "nextTick", @@ -60,7 +62,8 @@ export class LlamaContext { }: LlamaContextOptions & { sequences: number, contextSize: number, - batchSize: number + batchSize: number, + flashAttention: boolean }) { if (_model.disposed) throw new DisposedError(); @@ -72,11 +75,13 @@ export class LlamaContext { this._totalSequences = Math.max(1, Math.floor(sequences)); this._contextSize = Math.max(2, contextSize); this._batchSize = Math.max(batchSize, this._totalSequences); + this._flashAttention = flashAttention; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ seed: seed != null ? Math.max(-1, Math.floor(seed)) : undefined, contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells batchSize: this._batchSize, sequences: this._totalSequences, + flashAttention: this._flashAttention, threads: Math.max(0, Math.floor(threads)), embeddings: _embeddings, noSeed: _noSeed @@ -136,6 +141,10 @@ export class LlamaContext { return this._batchSize; } + public get flashAttention(): boolean { + return this._flashAttention; + } + /** * The actual size of the state in the memory in bytes. * This value is provided by `llama.cpp` and doesn't include all the memory overhead of the context. @@ -541,11 +550,15 @@ export class LlamaContext { _model: LlamaModel }): Promise { const sequences = options.sequences ?? getDefaultContextSequences(); + const flashAttention = _model.flashAttentionSupported + ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) + : false; const contextSize = await _model.fileInsights.configurationResolver.resolveContextContextSize(options.contextSize, { batchSize: options.batchSize, sequences: sequences, modelGpuLayers: _model.gpuLayers, modelTrainContextSize: _model.trainContextSize, + flashAttention, getVramState: () => _model._llama._vramOrchestrator.getMemoryState(), llamaGpu: _model._llama.gpu, ignoreMemorySafetyChecks: options.ignoreMemorySafetyChecks, @@ -557,10 +570,11 @@ export class LlamaContext { sequences, isEmbeddingContext: options._embeddings, modelGpuLayers: _model.gpuLayers, - batchSize + batchSize, + flashAttention }).gpuVram; - const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences}); + const context = new LlamaContext({_model}, {...options, contextSize, batchSize, sequences, flashAttention}); const {createSignal} = options; const contextCreationMemoryReservation = options.ignoreMemorySafetyChecks ? null diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 4e01516e..c5a30cf4 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -40,6 +40,21 @@ export type LlamaContextOptions = { */ batchSize?: number, + /** + * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. + * + * The support for flash attention is currently experimental and may not always work as expected. + * Use with caution. + * + * This option will be ignored if flash attention is not supported by the model. + * + * Defaults to `false` (inherited from the model option `defaultContextFlashAttention`). 
+ * + * Upon flash attention exiting the experimental status, the default value will become `true` + * (the inherited value from the model option `defaultContextFlashAttention` will become `true`). + */ + flashAttention?: boolean, + /** * number of threads to use to evaluate tokens. * set to 0 to use the maximum threads supported by the current machine hardware. diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 41a5d3b5..2973187c 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -15,6 +15,7 @@ import {getReadablePath} from "../../cli/utils/getReadablePath.js"; import {LlamaContextOptions} from "../LlamaContext/types.js"; import {LlamaContext} from "../LlamaContext/LlamaContext.js"; import {LlamaEmbeddingContext, LlamaEmbeddingContextOptions} from "../LlamaEmbeddingContext.js"; +import {GgufArchitectureType} from "../../gguf/types/GgufMetadataTypes.js"; import {TokenAttribute, TokenAttributes} from "./utils/TokenAttributes.js"; import type {Llama} from "../../bindings/Llama.js"; import type {BuiltinSpecialTokenValue} from "../../utils/LlamaText.js"; @@ -106,6 +107,27 @@ export type LlamaModelOptions = { onLoadProgress?(loadProgress: number): void }, + /** + * Enable flash attention by default for contexts created with this model. + * Only works with models that support flash attention. + * + * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. + * + * The support for flash attention is currently experimental and may not always work as expected. + * Use with caution. + * + * This option will be ignored if flash attention is not supported by the model. + * + * Enabling this affects the calculations of default values for the model and contexts created with it + * as flash attention reduces the amount of memory required, + * which allows for more layers to be offloaded to the GPU and for context sizes to be bigger. + * + * Defaults to `false`. + * + * Upon flash attention exiting the experimental status, the default value will become `true`. + */ + defaultContextFlashAttention?: boolean, + /** * Called with the load percentage when the model is being loaded. 
* > **Note:** This progress does not include the progress of loading the provided LoRA adapters (when `lora` is used) @@ -128,6 +150,7 @@ export type LlamaModelOptions = { const defaultLoraThreads = 6; const defaultLoraScale = 1; const defaultUseMmap = true; +const defaultContextFlashAttentionEnabled = false; export class LlamaModel { /** @internal */ public readonly _llama: Llama; @@ -142,6 +165,9 @@ export class LlamaModel { /** @internal */ private readonly _disposedState: DisposedState = {disposed: false}; /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); /** @internal */ private readonly _llamaPreventDisposalHandle: DisposalPreventionHandle; + /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; + /** @internal */ private readonly _defaultContextFlashAttention: boolean; + /** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private _typeDescription?: ModelTypeDescription; /** @internal */ private _trainContextSize?: number; /** @internal */ private _embeddingVectorSize?: number; @@ -157,11 +183,17 @@ export class LlamaModel { }, { _llama, _fileInfo, - _fileInsights + _fileInsights, + _defaultContextFlashAttentionOptionEnabled, + _defaultContextFlashAttention, + _flashAttentionSupported }: { _llama: Llama, _fileInfo: GgufFileInfo, - _fileInsights: GgufInsights + _fileInsights: GgufInsights, + _defaultContextFlashAttentionOptionEnabled: boolean, + _defaultContextFlashAttention: boolean, + _flashAttentionSupported: boolean }) { this._llama = _llama; this._fileInfo = _fileInfo; @@ -170,6 +202,9 @@ export class LlamaModel { this._gpuLayers = gpuLayers; this._backendModelDisposeGuard = new DisposeGuard([this._llama._backendDisposeGuard]); this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle(); + this._defaultContextFlashAttentionOptionEnabled = _defaultContextFlashAttentionOptionEnabled; + this._defaultContextFlashAttention = _defaultContextFlashAttention; + this._flashAttentionSupported = _flashAttentionSupported; this._model = new this._llama._bindings.AddonModel(this._modelPath, removeNullFields({ addonExports: this._llama._bindings, gpuLayers, @@ -270,6 +305,14 @@ export class LlamaModel { return this._model.getModelSize(); } + public get flashAttentionSupported() { + return this._flashAttentionSupported; + } + + public get defaultContextFlashAttention() { + return this._defaultContextFlashAttention; + } + /** * Transform text into tokens that can be fed to the model * @param text - the text to tokenize @@ -485,6 +528,26 @@ export class LlamaModel { // do nothing } + try { + if (this._defaultContextFlashAttentionOptionEnabled && !this._flashAttentionSupported) { + if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.grok) + warnings.push("Flash attention is incompatible with Grok and thus was turned off"); + else if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) + warnings.push("Flash attention is incompatible with Gemma2 and thus was turned off"); + else { + const nHead = this.fileInfo.architectureMetadata?.attention?.head_count ?? 0; + const nEmbd = this.fileInfo.architectureMetadata?.embedding_length ?? 0; + const nEmbdHeadK = this.fileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead)); + const nEmbdHeadV = this.fileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 
0 : nEmbd / nHead); + + if (nEmbdHeadK !== nEmbdHeadV) + warnings.push("Flash attention is incompatible with this model and thus was turned off"); + } + } + } catch (err) { + // do nothing + } + return warnings; } @@ -562,7 +625,7 @@ export class LlamaModel { }: { _llama: Llama }) { - const {loadSignal} = modelOptions; + const {loadSignal, defaultContextFlashAttention} = modelOptions; let useMmap = modelOptions.useMmap ?? defaultUseMmap; const loraOptions: LlamaModelOptions["lora"] = typeof modelOptions.lora === "string" ? {adapters: [{loraFilePath: modelOptions.lora}]} @@ -576,12 +639,24 @@ export class LlamaModel { signal: loadSignal }); const ggufInsights = await GgufInsights.from(fileInfo, _llama); + const flashAttentionSupported = ggufInsights.flashAttentionSupported; + const resolvedDefaultContextFlashAttention = flashAttentionSupported + ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) + : false; const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { - ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks + ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, + defaultContextFlashAttention: resolvedDefaultContextFlashAttention }); const vramRequiredEstimate = ggufInsights.estimateModelResourceRequirements({gpuLayers: gpuLayers}).gpuVram; - const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, {_fileInfo: fileInfo, _fileInsights: ggufInsights, _llama}); + const model = new LlamaModel({...modelOptions, gpuLayers, useMmap}, { + _fileInfo: fileInfo, + _fileInsights: ggufInsights, + _llama, + _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false, + _flashAttentionSupported: flashAttentionSupported, + _defaultContextFlashAttention: resolvedDefaultContextFlashAttention + }); const modelCreationMemoryReservation = modelOptions.ignoreMemorySafetyChecks ? null : _llama._vramOrchestrator.reserveMemory(vramRequiredEstimate); diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 51240709..0c7c0075 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -84,6 +84,26 @@ export class GgufInsights { return this._modelSize; } + public get flashAttentionSupported() { + // source: `llama_new_context_with_model` in `llama.cpp` + + if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.grok) + return false; + else if (this._ggufFileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) + return false; + else { + const nHead = this._ggufFileInfo.architectureMetadata?.attention?.head_count ?? 0; + const nEmbd = this._ggufFileInfo.architectureMetadata?.embedding_length ?? 0; + const nEmbdHeadK = this._ggufFileInfo.architectureMetadata?.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead)); + const nEmbdHeadV = this._ggufFileInfo.architectureMetadata?.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead); + + if (nEmbdHeadK !== nEmbdHeadV) + return false; + } + + return true; + } + public estimateModelResourceRequirements({gpuLayers}: {gpuLayers: number}): GgufInsightsResourceRequirements { const {cpu, gpu} = this._getTensorResourceSplit(gpuLayers); @@ -99,10 +119,10 @@ export class GgufInsights { * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now. 
*/ public estimateContextResourceRequirements({ - contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false }: { contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, - includeGraphOverhead?: boolean + flashAttention?: boolean, includeGraphOverhead?: boolean }): GgufInsightsResourceRequirements { if (sequences == null) sequences = getDefaultContextSequences(); if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); @@ -261,7 +281,7 @@ export class GgufInsights { return (totalElements * 77.655 * (actualContextSize / 4096)) + defaultCalculationAdjustment; }; - const graphOverheadMemory = !includeGraphOverhead + const graphOverheadMemory = (flashAttention || !includeGraphOverhead) ? 0 : estimateGraphOverheadMemory(); diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index 09fb56a0..a0f35777 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -35,10 +35,12 @@ export class GgufInsightsConfigurationResolver { */ public async scoreModelConfigurationCompatibility({ contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 4096), - embeddingContext = false + embeddingContext = false, + flashAttention = false }: { contextSize?: number, - embeddingContext?: boolean + embeddingContext?: boolean, + flashAttention?: boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => ({total: os.totalmem(), free: os.freemem()})), @@ -117,7 +119,8 @@ export class GgufInsightsConfigurationResolver { const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({ contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, - modelGpuLayers: resolvedGpuLayers + modelGpuLayers: resolvedGpuLayers, + flashAttention }); const rankPoints = { @@ -184,10 +187,11 @@ export class GgufInsightsConfigurationResolver { ignoreMemorySafetyChecks = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, - llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading + llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, + defaultContextFlashAttention = false }: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, - llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean + llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean } = {}) { return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, @@ -195,7 +199,8 @@ export class GgufInsightsConfigurationResolver { getVramState, llamaVramPaddingSize, llamaGpu, - llamaSupportsGpuOffloading + llamaSupportsGpuOffloading, + defaultContextFlashAttention }); } @@ -203,6 +208,7 @@ export class GgufInsightsConfigurationResolver { modelGpuLayers, batchSize, modelTrainContextSize, + flashAttention = false, getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), llamaGpu = 
this._ggufInsights._llama.gpu, ignoreMemorySafetyChecks = false, @@ -211,6 +217,7 @@ export class GgufInsightsConfigurationResolver { }: { modelGpuLayers: number, modelTrainContextSize: number, + flashAttention?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, getVramState?(): Promise<{total: number, free: number}>, @@ -225,6 +232,7 @@ export class GgufInsightsConfigurationResolver { modelFileInsights: this._ggufInsights, modelGpuLayers, modelTrainContextSize, + flashAttention, getVramState, llamaGpu, ignoreMemorySafetyChecks, diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index 5bf200c6..c4bb5fcf 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -5,7 +5,7 @@ import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, getVramState, llamaGpu, + contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, getVramState, llamaGpu, ignoreMemorySafetyChecks = false, isEmbeddingContext = false }: { contextSize?: LlamaContextOptions["contextSize"], @@ -14,6 +14,7 @@ export async function resolveContextContextSizeOption({ modelFileInsights: GgufInsights, modelGpuLayers: number, modelTrainContextSize: number, + flashAttention: boolean, getVramState(): Promise<{total: number, free: number}>, llamaGpu: BuildGpu, ignoreMemorySafetyChecks?: boolean, @@ -34,6 +35,7 @@ export async function resolveContextContextSizeOption({ batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, + flashAttention, isEmbeddingContext }).gpuVram; @@ -74,6 +76,7 @@ export async function resolveContextContextSizeOption({ batchSize: batchSize ?? 
getDefaultContextBatchSize({contextSize: testContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, + flashAttention, isEmbeddingContext }).gpuVram; diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 7a3a70a0..d9dc4369 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -11,11 +11,11 @@ const fitContextExtraMemoryPaddingPercentage = 0.5; export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention }: { ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean }): Promise { if (gpuLayers == null) gpuLayers = "auto"; @@ -35,7 +35,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" const maxLayersRequirements = getVramRequiredForGpuLayers({ gpuLayers: resolvedGpuLayers, ggufInsights, - currentVram: vramState.free + currentVram: vramState.free, + defaultContextFlashAttention }); if (maxLayersRequirements == null) @@ -69,7 +70,8 @@ export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions[" : undefined, maxGpuLayers: typeof gpuLayers === "object" ? gpuLayers.max - : undefined + : undefined, + defaultContextFlashAttention }); const hasGpuLayersRequirements = typeof gpuLayers === "object" && @@ -89,13 +91,15 @@ function getBestGpuLayersForFreeVram({ freeVram, fitContext, minGpuLayers, - maxGpuLayers + maxGpuLayers, + defaultContextFlashAttention }: { ggufInsights: GgufInsights, freeVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, minGpuLayers?: number, - maxGpuLayers?: number + maxGpuLayers?: number, + defaultContextFlashAttention: boolean }) { return findBestOption({ *generator() { @@ -113,7 +117,8 @@ function getBestGpuLayersForFreeVram({ gpuLayers: option.gpuLayers, ggufInsights, currentVram: freeVram, - fitContext + fitContext, + defaultContextFlashAttention }); if (layersRequirements == null) @@ -172,9 +177,10 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer } function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false }: { - gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean} + gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, + defaultContextFlashAttention: boolean }) { const modelVram = ggufInsights.estimateModelResourceRequirements({gpuLayers}).gpuVram; @@ -187,7 +193,8 @@ function getVramRequiredForGpuLayers({ batchSize: getDefaultContextBatchSize({contextSize: fitContext.contextSize, sequences: 1}), modelGpuLayers: gpuLayers, sequences: 1, - isEmbeddingContext: fitContext.embeddingContext ?? false + isEmbeddingContext: fitContext.embeddingContext ?? 
false, + flashAttention: defaultContextFlashAttention }).gpuVram; const totalVram = modelVram + contextVram; @@ -205,7 +212,8 @@ function getVramRequiredForGpuLayers({ gpuLayers, ggufInsights, vram: currentVram - modelVram, - isEmbeddingContext: fitContext?.embeddingContext ?? false + isEmbeddingContext: fitContext?.embeddingContext ?? false, + flashAttention: defaultContextFlashAttention }); if (maxContext == null || modelVram + maxContext.vram > currentVram) @@ -218,8 +226,8 @@ function getVramRequiredForGpuLayers({ }; } -function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext}: { - gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean +function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention}: { + gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -229,7 +237,8 @@ function findMaxPossibleContextSizeForVram({gpuLayers, ggufInsights, vram, isEmb batchSize: getDefaultContextBatchSize({contextSize, sequences: 1}), modelGpuLayers: gpuLayers, sequences: 1, - isEmbeddingContext + isEmbeddingContext, + flashAttention }).gpuVram; if (contextVram <= vram) diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index fbeacf20..46208120 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -1,31 +1,44 @@ export const enum GgufArchitectureType { llama = "llama", falcon = "falcon", + grok = "grok", gpt2 = "gpt2", gptj = "gptj", gptneox = "gptneox", mpt = "mpt", baichuan = "baichuan", starcoder = "starcoder", - persimmon = "persimmon", refact = "refact", bert = "bert", nomicBert = "nomic-bert", + jinaBertV2 = "jina-bert-v2", bloom = "bloom", stablelm = "stablelm", qwen = "qwen", qwen2 = "qwen2", + qwen2moe = "qwen2moe", phi2 = "phi2", + phi3 = "phi3", plamo = "plamo", codeshell = "codeshell", orion = "orion", internlm2 = "internlm2", minicpm = "minicpm", gemma = "gemma", + gemma2 = "gemma2", starcoder2 = "starcoder2", mamba = "mamba", + xverse = "xverse", commandR = "command-r", - rwkv = "rwkv" + dbrx = "dbrx", + olmo = "olmo", + openelm = "openelm", + arctic = "arctic", + deepseek2 = "deepseek2", + bitnet = "bitnet", + t5 = "t5", + jais = "jais", + unknown = "(unknown)" } export type GgufMetadata = { @@ -53,8 +66,7 @@ export type GgufMetadataLlmToType = { [GgufArchitectureType.gpt2]: GgufMetadataGPT2, [GgufArchitectureType.bloom]: GgufMetadataBloom, [GgufArchitectureType.falcon]: GgufMetadataFalcon, - [GgufArchitectureType.mamba]: GgufMetadataMamba, - [GgufArchitectureType.rwkv]: GgufMetadataRWKV + [GgufArchitectureType.mamba]: GgufMetadataMamba }; // source: `enum llama_ftype` in `llama.h` in the `llama.cpp` source code @@ -415,15 +427,6 @@ export type GgufMetadataMamba = { } }; -// source: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#rwkv -export type GgufMetadataRWKV = { - readonly architecture_version: 4 | number, - readonly context_length: number, - readonly block_count: number, - readonly embedding_length: number, - readonly feed_forward_length: number -}; - export function isGgufMetadataOfArchitectureType( metadata: GgufMetadata, type: A ): metadata is GgufMetadata { diff --git a/src/index.ts b/src/index.ts index ac90bc3d..1a5de8e8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,7 +2,9 @@ import 
{DisposedError} from "lifecycle-utils"; import {Llama} from "./bindings/Llama.js"; import {getLlama, type LlamaOptions, type LastBuildOptions} from "./bindings/getLlama.js"; import {NoBinaryFoundError} from "./bindings/utils/NoBinaryFoundError.js"; -import {LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, LlamaVocabularyType} from "./bindings/types.js"; +import { + type LlamaGpuType, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaLogLevelGreaterThanOrEqual, LlamaVocabularyType +} from "./bindings/types.js"; import {LlamaModel, LlamaModelInfillTokens, type LlamaModelOptions, LlamaModelTokens} from "./evaluator/LlamaModel/LlamaModel.js"; import {TokenAttributes} from "./evaluator/LlamaModel/utils/TokenAttributes.js"; import {LlamaGrammar, type LlamaGrammarOptions} from "./evaluator/LlamaGrammar.js"; @@ -84,7 +86,7 @@ import { type GgufMetadata, type GgufMetadataLlmToType, GgufArchitectureType, GgufFileType, GgufMetadataTokenizerTokenType, GgufMetadataArchitecturePoolingType, type GgufMetadataGeneral, type GgufMetadataTokenizer, type GgufMetadataDefaultArchitectureType, type GgufMetadataLlmLLaMA, type GgufMetadataMPT, type GgufMetadataGPTNeoX, type GgufMetadataGPTJ, type GgufMetadataGPT2, - type GgufMetadataBloom, type GgufMetadataFalcon, type GgufMetadataMamba, type GgufMetadataRWKV, isGgufMetadataOfArchitectureType + type GgufMetadataBloom, type GgufMetadataFalcon, type GgufMetadataMamba, isGgufMetadataOfArchitectureType } from "./gguf/types/GgufMetadataTypes.js"; import {GgmlType, type GgufTensorInfo} from "./gguf/types/GgufTensorInfoTypes.js"; @@ -94,6 +96,7 @@ export { getLlama, type LlamaOptions, type LastBuildOptions, + type LlamaGpuType, LlamaLogLevel, NoBinaryFoundError, LlamaModel, @@ -238,7 +241,6 @@ export { type GgufMetadataBloom, type GgufMetadataFalcon, type GgufMetadataMamba, - type GgufMetadataRWKV, GgmlType, isGgufMetadataOfArchitectureType, GgufInsights,
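Usage sketch for the flash attention options introduced in this patch, assuming the public `getLlama`/`loadModel`/`createContext` API re-exported from the package index; the model path is a placeholder:

```ts
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();

const model = await llama.loadModel({
    modelPath: "path/to/model.gguf", // placeholder path
    // Opt contexts created from this model into flash attention by default.
    // Ignored when the model architecture doesn't support it.
    defaultContextFlashAttention: true
});

console.log("Flash attention supported:", model.flashAttentionSupported);

const context = await model.createContext({
    // Can also be set per context; when omitted it falls back to the model's default.
    flashAttention: true
});
console.log("Flash attention enabled:", context.flashAttention);
```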
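A brief sketch of the extended `gpu` option of `getLlama` added here, which now also accepts an object form that auto-detects a backend while excluding specific GPU types; excluding Vulkan is just an illustrative choice:

```ts
import {getLlama} from "node-llama-cpp";

// Auto-detect the best available compute layer, but never try the Vulkan backend.
const llama = await getLlama({
    gpu: {
        type: "auto",
        exclude: ["vulkan"]
    }
});

// With Vulkan excluded, this resolves to "metal", "cuda", or false (CPU only).
console.log("Using GPU type:", llama.gpu);
```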
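And a sketch of how the new `flashAttention` flag feeds into the `GgufInsights` VRAM estimation (per the change to `estimateContextResourceRequirements`, enabling it drops the graph overhead term); `model.fileInsights` and `model.gpuLayers` are assumed to be publicly accessible, as they are used that way inside this patch:

```ts
import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const insights = model.fileInsights;

if (insights.flashAttentionSupported) {
    // Compare the estimated context VRAM with and without flash attention;
    // with it enabled, the graph overhead estimate is skipped entirely.
    const withFa = insights.estimateContextResourceRequirements({
        contextSize: 4096,
        modelGpuLayers: model.gpuLayers,
        flashAttention: true
    });
    const withoutFa = insights.estimateContextResourceRequirements({
        contextSize: 4096,
        modelGpuLayers: model.gpuLayers,
        flashAttention: false
    });
    console.log("Estimated context VRAM:", withFa.gpuVram, "vs", withoutFa.gpuVram);
}
```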