53 changes: 53 additions & 0 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -156,6 +156,7 @@ namespace npuw {
namespace llm {
enum class PrefillHint { DYNAMIC, STATIC };
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
enum class AttentionHint { DYNAMIC, STATIC };
} // namespace llm
} // namespace npuw

@@ -203,6 +204,58 @@ struct NPUW_LLM_PREFILL_HINT final : OptionBase<NPUW_LLM_PREFILL_HINT, ::intel_n
}
};

struct ATTN_HINT_BASE : OptionBase<ATTN_HINT_BASE, ::intel_npu::npuw::llm::AttentionHint> {
static constexpr std::string_view getTypeName() {
return "::intel_npu::npuw::llm::AttentionHint";
}

static ::intel_npu::npuw::llm::AttentionHint defaultValue() {
return ::intel_npu::npuw::llm::AttentionHint::STATIC;
}

static ::intel_npu::npuw::llm::AttentionHint parse(std::string_view val) {
if (val == "DYNAMIC") {
return ::intel_npu::npuw::llm::AttentionHint::DYNAMIC;
} else if (val == "STATIC") {
return ::intel_npu::npuw::llm::AttentionHint::STATIC;
}
OPENVINO_THROW("Unsupported attention hint provided: ", val);
return {};
}

static std::string toString(const ::intel_npu::npuw::llm::AttentionHint& val) {
switch (val) {
case ::intel_npu::npuw::llm::AttentionHint::DYNAMIC:
return "DYNAMIC";
case ::intel_npu::npuw::llm::AttentionHint::STATIC:
return "STATIC";
default:
OPENVINO_THROW("Can't convert provided attention hint : ", int(val), " to string.");
}
return {};
}

static OptionMode mode() {
return OptionMode::RunTime;
}

static bool isPublic() {
return false;
}
};

struct NPUW_LLM_GENERATE_ATTENTION_HINT final : ATTN_HINT_BASE {
static std::string_view key() {
return ov::intel_npu::npuw::llm::generate_attn_hint.name();
}
};

struct NPUW_LLM_PREFILL_ATTENTION_HINT final : ATTN_HINT_BASE {
static std::string_view key() {
return ov::intel_npu::npuw::llm::prefill_attn_hint.name();
}
};

struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
static std::string_view key() {
return ov::intel_npu::npuw::llm::generate_hint.name();
@@ -505,6 +505,14 @@ static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFI
*/
static constexpr ov::Property<ov::AnyMap> additional_prefill_config{"++NPUW_LLM_PREFILL_CONFIG"};

/**
* @brief
* Type: std::string.
* Hint for attention handling in the prefill stage. NPUW will choose an optimal configuration based on the passed
* preference. Possible values: "DYNAMIC", "STATIC". Default value: "STATIC".
*/
static constexpr ov::Property<std::string> prefill_attn_hint{"NPUW_LLM_PREFILL_ATTENTION_HINT"};
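
For illustration, a minimal usage sketch (hypothetical, not part of this change): the pipeline-enabling keys "NPU_USE_NPUW" and "NPUW_LLM", the model path, and the function name are assumptions for the example.

#include <openvino/openvino.hpp>

// Hypothetical sketch: request dynamic attention handling for the prefill stage.
void prefill_attention_hint_example() {
    ov::Core core;
    auto model = core.read_model("llm.xml");  // assumed model path
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},  // assumed pipeline-enabling key
                                        {"NPUW_LLM", "YES"},      // assumed pipeline-enabling key
                                        {"NPUW_LLM_PREFILL_ATTENTION_HINT", "DYNAMIC"}});
}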

/**
* @brief
* Type: std::string.
@@ -536,6 +544,15 @@ static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CON
*/
static constexpr ov::Property<ov::AnyMap> additional_generate_config{"++NPUW_LLM_GENERATE_CONFIG"};

/**
* @brief
* Type: std::string.
* Hint for attention handling in the generation stage. NPUW will choose an optimal configuration based on the passed preference.
* Possible values: "DYNAMIC", "STATIC".
* Default value: "STATIC".
*/
static constexpr ov::Property<std::string> generate_attn_hint{"NPUW_LLM_GENERATE_ATTENTION_HINT"};
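
And the generate-stage counterpart; reading the value back via get_property() is expected to work since the option is bound with getString in implement_properties() below. Again a hypothetical sketch with assumed pipeline-enabling keys and model path.

#include <openvino/openvino.hpp>

// Hypothetical sketch: request dynamic attention handling for the generate stage
// and query the effective value back as a string.
void generate_attention_hint_example() {
    ov::Core core;
    auto model = core.read_model("llm.xml");  // assumed model path
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},  // assumed pipeline-enabling key
                                        {"NPUW_LLM", "YES"},      // assumed pipeline-enabling key
                                        {"NPUW_LLM_GENERATE_ATTENTION_HINT", "DYNAMIC"}});
    auto hint = compiled.get_property("NPUW_LLM_GENERATE_ATTENTION_HINT").as<std::string>();
    (void)hint;  // expected to be "DYNAMIC"
}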

/**
* @brief
* Type: bool.
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -69,6 +69,8 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
desc.add<NPUW_LLM_MAX_GENERATION_TOKEN_LEN>();
desc.add<NPUW_LLM_PREFILL_HINT>();
desc.add<NPUW_LLM_GENERATE_HINT>();
desc.add<NPUW_LLM_PREFILL_ATTENTION_HINT>();
desc.add<NPUW_LLM_GENERATE_ATTENTION_HINT>();
desc.add<NPUW_LLM_SHARED_HEAD>();
}

2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/include/properties.hpp
@@ -121,9 +121,11 @@ class Properties final {
ov::intel_npu::npuw::llm::prefill_hint.name(),
ov::intel_npu::npuw::llm::prefill_config.name(),
ov::intel_npu::npuw::llm::additional_prefill_config.name(),
ov::intel_npu::npuw::llm::prefill_attn_hint.name(),
ov::intel_npu::npuw::llm::generate_hint.name(),
ov::intel_npu::npuw::llm::generate_config.name(),
ov::intel_npu::npuw::llm::additional_generate_config.name(),
ov::intel_npu::npuw::llm::generate_attn_hint.name(),
ov::intel_npu::npuw::llm::shared_lm_head_config.name(),
ov::intel_npu::npuw::llm::additional_shared_lm_head_config.name()};

41 changes: 34 additions & 7 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -922,15 +922,23 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
decompose_GQA(prefill_model, true);
decompose_GQA(kvcache_model, false);

const auto prefill_attn_hint = m_cfg.get<::intel_npu::NPUW_LLM_PREFILL_ATTENTION_HINT>();
const auto generate_attn_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_ATTENTION_HINT>();
const bool prefill_attn_dyn = prefill_attn_hint == ::intel_npu::npuw::llm::AttentionHint::DYNAMIC;
const bool generate_attn_dyn = generate_attn_hint == ::intel_npu::npuw::llm::AttentionHint::DYNAMIC;

const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
if (optimize_v_tensors) {
LOG_DEBUG("Check and apply opt layout");
LOG_BLOCK();
if (ov::npuw::util::optimize_value_tensors(kvcache_model, false)) {
NPUW_ASSERT(ov::npuw::util::optimize_value_tensors(prefill_model, true));
m_kvcache_desc.v_tensors_transposed = true;
} else {
LOG_DEBUG("vtensors optimisation not applied");
// Only optimize V-tensors when the attention hint is STATIC
if (!generate_attn_dyn && ov::npuw::util::optimize_value_tensors(kvcache_model, false)) {
LOG_DEBUG("V-tensors transposed in generate model");
m_kvcache_desc.v_tensors_transposed_gen = true;
}
if (!prefill_attn_dyn && ov::npuw::util::optimize_value_tensors(prefill_model, true)) {
LOG_DEBUG("V-tensors transposed in prefill model");
m_kvcache_desc.v_tensors_transposed_pre = true;
@AsyaPronina (Contributor) commented on Oct 3, 2025:

This changes the default behaviour a bit: previously, if the transpose wasn't needed, we applied the SDPA unroll to the generate model but not to prefill (since the transpose returned false, we didn't go on to apply the same transformations to prefill).

Now we apply the unroll to both generate and prefill, regardless of whether the transpose transformation for generate returned true or false.

Is this expected? It seems more correct now, by the way.

@dmatveev (Contributor, Author) replied on Oct 3, 2025:

I am not sure what you are referring to as the previous behavior.

SDPA unroll is a part of the optimize_value_tensors routine. If the transpose wasn't needed (I'd rather call it "cancelled", as it is "needed" by default), neither transformation was called. Where did we apply the SDPA unroll for the generate model in this case? Am I missing something?

When it was actually "needed", we applied the transformation first to the kvcache model and, IF IT WAS applied, required it (via assert) to be applied to the prefill model as well.

Or do you refer to the fact that optimize_value_tensors could unroll SDPA in one model but return false because it couldn't transpose the v-tensor? That behavior was an inconsistency rather than something we should keep at all costs. It is still the case, by the way: we can unroll the SDPA for no actual reason (so v-tensors won't be transposed).

Probably a way to work around this would be to clone the model within the pass and then return either the original model if the transpose pass failed, or the transformed one if it was actually applied. That's probably a thing: EISW-186959. Thanks!
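
For reference, a condensed sketch of the control flow before and after this diff (reconstructed from the hunk above, names abbreviated; not a verbatim excerpt):

// Before: a single flag; prefill was forced (via assert) to follow the generate model.
if (optimize_v_tensors) {
    if (optimize_value_tensors(kvcache_model, false)) {
        NPUW_ASSERT(optimize_value_tensors(prefill_model, true));
        m_kvcache_desc.v_tensors_transposed = true;
    }
}

// After: independent per-model flags, each additionally gated by its attention hint.
if (optimize_v_tensors) {
    if (!generate_attn_dyn && optimize_value_tensors(kvcache_model, false)) {
        m_kvcache_desc.v_tensors_transposed_gen = true;
    }
    if (!prefill_attn_dyn && optimize_value_tensors(prefill_model, true)) {
        m_kvcache_desc.v_tensors_transposed_pre = true;
    }
}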

}
} else {
LOG_DEBUG("Check and apply opt layout --- SKIPPED");
@@ -972,6 +980,21 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
merge_config_with(prefill_config, prefill_config_addition_value);
merge_config_with(generate_config, generate_config_addition_value);

// Handle attention hints. FIXME: Maybe it makes sense to make those
// mutually exclusive with the precise configuration sections as well
const ov::AnyMap dyn_attn_opts = {
{"NPUW_ONLINE_PIPELINE", "REP"},
{"NPUW_ONLINE_ISOLATE", "ATTN"},
{"NPUW_ONLINE_KEEP_BLOCK_SIZE", "4"},
{"NPUW_UNFOLD_IREQS", "NO"},
};
if (prefill_attn_dyn) {
merge_config_with(prefill_config, dyn_attn_opts);
}
if (generate_attn_dyn) {
merge_config_with(generate_config, dyn_attn_opts);
}

if (m_cfg.get<::intel_npu::NPUW_LLM_CACHE_ROPE>()) {
LOG_DEBUG("Caching preROPE ");
const uint32_t CACHE_ROPE_START = 2048;
@@ -1109,7 +1132,8 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw:
write(model_stream, m_kvcache_desc.num_stored_tokens);
write(model_stream, m_kvcache_desc.dim);
write(model_stream, m_kvcache_desc.max_generation_token_len);
write(model_stream, m_kvcache_desc.v_tensors_transposed);
write(model_stream, m_kvcache_desc.v_tensors_transposed_pre);
write(model_stream, m_kvcache_desc.v_tensors_transposed_gen); // FIXME: bump required
write(model_stream, m_prefill_chunk_size);
write(model_stream, m_use_chunk_prefill);
write(model_stream, m_max_lora_rank);
@@ -1318,7 +1342,8 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
read(model_stream, compiled->m_kvcache_desc.num_stored_tokens);
read(model_stream, compiled->m_kvcache_desc.dim);
read(model_stream, compiled->m_kvcache_desc.max_generation_token_len);
read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed);
read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed_pre);
read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed_gen); // FIXME: bump required!
read(model_stream, compiled->m_prefill_chunk_size);
read(model_stream, compiled->m_use_chunk_prefill);
read(model_stream, compiled->m_max_lora_rank);
@@ -1409,6 +1434,8 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
BIND(npuw::llm::prefill_chunk_size, NPUW_LLM_PREFILL_CHUNK_SIZE, get),
BIND(npuw::llm::prefill_hint, NPUW_LLM_PREFILL_HINT, getString),
BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString),
BIND(npuw::llm::prefill_attn_hint, NPUW_LLM_PREFILL_ATTENTION_HINT, getString),
BIND(npuw::llm::generate_attn_hint, NPUW_LLM_GENERATE_ATTENTION_HINT, getString),
BIND(npuw::llm::shared_lm_head, NPUW_LLM_SHARED_HEAD, get)});
#undef BIND
}
@@ -25,7 +25,8 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
uint32_t num_stored_tokens = 0u;
uint32_t dim = 0u;
uint32_t max_generation_token_len = 0u;
bool v_tensors_transposed = false;
bool v_tensors_transposed_pre = false; // prefill
bool v_tensors_transposed_gen = false; // generate
};

LLMCompiledModel(const std::shared_ptr<ov::Model>& model,