53 changes: 53 additions & 0 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -156,6 +156,7 @@ namespace npuw {
namespace llm {
enum class PrefillHint { DYNAMIC, STATIC };
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
enum class AttentionHint { DYNAMIC, STATIC };
} // namespace llm
} // namespace npuw

@@ -203,6 +204,58 @@ struct NPUW_LLM_PREFILL_HINT final : OptionBase<NPUW_LLM_PREFILL_HINT, ::intel_n
}
};

struct ATTN_HINT_BASE : OptionBase<ATTN_HINT_BASE, ::intel_npu::npuw::llm::AttentionHint> {
static constexpr std::string_view getTypeName() {
return "::intel_npu::npuw::llm::AttentionHint";
}

static ::intel_npu::npuw::llm::AttentionHint defaultValue() {
return ::intel_npu::npuw::llm::AttentionHint::STATIC;
}

static ::intel_npu::npuw::llm::AttentionHint parse(std::string_view val) {
if (val == "DYNAMIC") {
return ::intel_npu::npuw::llm::AttentionHint::DYNAMIC;
} else if (val == "STATIC") {
return ::intel_npu::npuw::llm::AttentionHint::STATIC;
}
OPENVINO_THROW("Unsupported attention hint provided: ", val);
return {};
}

static std::string toString(const ::intel_npu::npuw::llm::AttentionHint& val) {
switch (val) {
case ::intel_npu::npuw::llm::AttentionHint::DYNAMIC:
return "DYNAMIC";
case ::intel_npu::npuw::llm::AttentionHint::STATIC:
return "STATIC";
default:
OPENVINO_THROW("Can't convert provided attention hint : ", int(val), " to string.");
}
return {};
}

static OptionMode mode() {
return OptionMode::RunTime;
}

static bool isPublic() {
return false;
}
};

struct NPUW_LLM_GENERATE_ATTENTION_HINT final : ATTN_HINT_BASE {
static std::string_view key() {
return ov::intel_npu::npuw::llm::generate_attn_hint.name();
}
};

struct NPUW_LLM_PREFILL_ATTENTION_HINT final : ATTN_HINT_BASE {
static std::string_view key() {
return ov::intel_npu::npuw::llm::prefill_attn_hint.name();
}
};

struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
static std::string_view key() {
return ov::intel_npu::npuw::llm::generate_hint.name();
@@ -505,6 +505,14 @@ static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFI
*/
static constexpr ov::Property<ov::AnyMap> additional_prefill_config{"++NPUW_LLM_PREFILL_CONFIG"};

/**
* @brief
* Type: std::string.
* Hint for attention handling in the prefill stage. NPUW will choose an optimal configuration based on the passed
* preference. Possible values: "DYNAMIC", "STATIC". Default value: "STATIC".
*/
static constexpr ov::Property<std::string> prefill_attn_hint{"NPUW_LLM_PREFILL_ATTENTION_HINT"};
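
For illustration, a minimal usage sketch (hypothetical, not part of this change): the pipeline-enabling keys "NPU_USE_NPUW" and "NPUW_LLM", the model path, and the function name are assumptions for the example.

#include <openvino/openvino.hpp>

// Hypothetical sketch: request dynamic attention handling for the prefill stage.
void prefill_attention_hint_example() {
    ov::Core core;
    auto model = core.read_model("llm.xml");  // assumed model path
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},  // assumed pipeline-enabling key
                                        {"NPUW_LLM", "YES"},      // assumed pipeline-enabling key
                                        {"NPUW_LLM_PREFILL_ATTENTION_HINT", "DYNAMIC"}});
}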

/**
* @brief
* Type: std::string.
@@ -536,6 +544,15 @@ static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CON
*/
static constexpr ov::Property<ov::AnyMap> additional_generate_config{"++NPUW_LLM_GENERATE_CONFIG"};

/**
* @brief
* Type: std::string.
* Hint for attention handling in the generation stage. NPUW will choose an optimal configuration based on the passed preference.
* Possible values: "DYNAMIC", "STATIC".
* Default value: "STATIC".
*/
static constexpr ov::Property<std::string> generate_attn_hint{"NPUW_LLM_GENERATE_ATTENTION_HINT"};
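
And the generate-stage counterpart; reading the value back via get_property() is expected to work since the option is bound with getString in implement_properties() below. Again a hypothetical sketch with assumed pipeline-enabling keys and model path.

#include <openvino/openvino.hpp>

// Hypothetical sketch: request dynamic attention handling for the generate stage
// and query the effective value back as a string.
void generate_attention_hint_example() {
    ov::Core core;
    auto model = core.read_model("llm.xml");  // assumed model path
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},  // assumed pipeline-enabling key
                                        {"NPUW_LLM", "YES"},      // assumed pipeline-enabling key
                                        {"NPUW_LLM_GENERATE_ATTENTION_HINT", "DYNAMIC"}});
    auto hint = compiled.get_property("NPUW_LLM_GENERATE_ATTENTION_HINT").as<std::string>();
    (void)hint;  // expected to be "DYNAMIC"
}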

/**
* @brief
* Type: bool.
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -69,6 +69,8 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
desc.add<NPUW_LLM_MAX_GENERATION_TOKEN_LEN>();
desc.add<NPUW_LLM_PREFILL_HINT>();
desc.add<NPUW_LLM_GENERATE_HINT>();
desc.add<NPUW_LLM_PREFILL_ATTENTION_HINT>();
desc.add<NPUW_LLM_GENERATE_ATTENTION_HINT>();
desc.add<NPUW_LLM_SHARED_HEAD>();
}

2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/include/properties.hpp
@@ -121,9 +121,11 @@ class Properties final {
ov::intel_npu::npuw::llm::prefill_hint.name(),
ov::intel_npu::npuw::llm::prefill_config.name(),
ov::intel_npu::npuw::llm::additional_prefill_config.name(),
ov::intel_npu::npuw::llm::prefill_attn_hint.name(),
ov::intel_npu::npuw::llm::generate_hint.name(),
ov::intel_npu::npuw::llm::generate_config.name(),
ov::intel_npu::npuw::llm::additional_generate_config.name(),
ov::intel_npu::npuw::llm::generate_attn_hint.name(),
ov::intel_npu::npuw::llm::shared_lm_head_config.name(),
ov::intel_npu::npuw::llm::additional_shared_lm_head_config.name()};

41 changes: 34 additions & 7 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -922,15 +922,23 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
decompose_GQA(prefill_model, true);
decompose_GQA(kvcache_model, false);

const auto prefill_attn_hint = m_cfg.get<::intel_npu::NPUW_LLM_PREFILL_ATTENTION_HINT>();
const auto generate_attn_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_ATTENTION_HINT>();
const bool prefill_attn_dyn = prefill_attn_hint == ::intel_npu::npuw::llm::AttentionHint::DYNAMIC;
const bool generate_attn_dyn = generate_attn_hint == ::intel_npu::npuw::llm::AttentionHint::DYNAMIC;

const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
if (optimize_v_tensors) {
LOG_DEBUG("Check and apply opt layout");
LOG_BLOCK();
if (ov::npuw::util::optimize_value_tensors(kvcache_model, false)) {
NPUW_ASSERT(ov::npuw::util::optimize_value_tensors(prefill_model, true));
m_kvcache_desc.v_tensors_transposed = true;
} else {
LOG_DEBUG("vtensors optimisation not applied");
// Only optimize V-tensors when the attention hint is STATIC
if (!generate_attn_dyn && ov::npuw::util::optimize_value_tensors(kvcache_model, false)) {
LOG_DEBUG("V-tensors transposed in generate model");
m_kvcache_desc.v_tensors_transposed_gen = true;
}
if (!prefill_attn_dyn && ov::npuw::util::optimize_value_tensors(prefill_model, true)) {
LOG_DEBUG("V-tensors transposed in prefill model");
m_kvcache_desc.v_tensors_transposed_pre = true;
@AsyaPronina (Contributor) commented on Oct 3, 2025:

This changes the default behaviour a bit: previously, if the transpose wasn't needed, we applied the SDPA unroll to the generate model but not to prefill (since the transpose returned false, we didn't go on to apply the same transformations to prefill).

Now we apply the unroll to both generate and prefill, regardless of whether the transpose transformation for generate returned true or false.

Is this expected? It seems more correct now, by the way.

@dmatveev (Contributor, Author) replied on Oct 3, 2025:

I am not sure what you are referring to as the previous behavior.

SDPA unroll is a part of the optimize_value_tensors routine. If the transpose wasn't needed (I'd rather call it "cancelled", as it is "needed" by default), neither transformation was called. Where did we apply the SDPA unroll for the generate model in this case? Am I missing something?

When it was actually "needed", we applied the transformation first to the kvcache model and, IF IT WAS applied, required it (via assert) to be applied to the prefill model as well.

Or do you refer to the fact that optimize_value_tensors could unroll SDPA in one model but return false because it couldn't transpose the v-tensor? That behavior was an inconsistency rather than something we should keep at all costs. It is still the case, by the way: we can unroll the SDPA for no actual reason (so v-tensors won't be transposed).

Probably a way to work around this would be to clone the model within the pass and then return either the original model if the transpose pass failed, or the transformed one if it was actually applied. That's probably a thing: EISW-186959. Thanks!
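
For reference, a condensed sketch of the control flow before and after this diff (reconstructed from the hunk above, names abbreviated; not a verbatim excerpt):

// Before: a single flag; prefill was forced (via assert) to follow the generate model.
if (optimize_v_tensors) {
    if (optimize_value_tensors(kvcache_model, false)) {
        NPUW_ASSERT(optimize_value_tensors(prefill_model, true));
        m_kvcache_desc.v_tensors_transposed = true;
    }
}

// After: independent per-model flags, each additionally gated by its attention hint.
if (optimize_v_tensors) {
    if (!generate_attn_dyn && optimize_value_tensors(kvcache_model, false)) {
        m_kvcache_desc.v_tensors_transposed_gen = true;
    }
    if (!prefill_attn_dyn && optimize_value_tensors(prefill_model, true)) {
        m_kvcache_desc.v_tensors_transposed_pre = true;
    }
}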

}
} else {
LOG_DEBUG("Check and apply opt layout --- SKIPPED");
@@ -972,6 +980,21 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
merge_config_with(prefill_config, prefill_config_addition_value);
merge_config_with(generate_config, generate_config_addition_value);

// Handle attention hints. FIXME: Maybe it makes sense to make those
// mutually exclusive with the precise configuration sections as well
const ov::AnyMap dyn_attn_opts = {
{"NPUW_ONLINE_PIPELINE", "REP"},
{"NPUW_ONLINE_ISOLATE", "ATTN"},
{"NPUW_ONLINE_KEEP_BLOCK_SIZE", "4"},
{"NPUW_UNFOLD_IREQS", "NO"},
};
if (prefill_attn_dyn) {
merge_config_with(prefill_config, dyn_attn_opts);
}
if (generate_attn_dyn) {
merge_config_with(generate_config, dyn_attn_opts);
}

if (m_cfg.get<::intel_npu::NPUW_LLM_CACHE_ROPE>()) {
LOG_DEBUG("Caching preROPE ");
const uint32_t CACHE_ROPE_START = 2048;
@@ -1109,7 +1132,8 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw:
write(model_stream, m_kvcache_desc.num_stored_tokens);
write(model_stream, m_kvcache_desc.dim);
write(model_stream, m_kvcache_desc.max_generation_token_len);
write(model_stream, m_kvcache_desc.v_tensors_transposed);
write(model_stream, m_kvcache_desc.v_tensors_transposed_pre);
write(model_stream, m_kvcache_desc.v_tensors_transposed_gen); // FIXME: bump required
write(model_stream, m_prefill_chunk_size);
write(model_stream, m_use_chunk_prefill);
write(model_stream, m_max_lora_rank);
@@ -1318,7 +1342,8 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
read(model_stream, compiled->m_kvcache_desc.num_stored_tokens);
read(model_stream, compiled->m_kvcache_desc.dim);
read(model_stream, compiled->m_kvcache_desc.max_generation_token_len);
read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed);
read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed_pre);
read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed_gen); // FIXME: bump required!
read(model_stream, compiled->m_prefill_chunk_size);
read(model_stream, compiled->m_use_chunk_prefill);
read(model_stream, compiled->m_max_lora_rank);
@@ -1409,6 +1434,8 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
BIND(npuw::llm::prefill_chunk_size, NPUW_LLM_PREFILL_CHUNK_SIZE, get),
BIND(npuw::llm::prefill_hint, NPUW_LLM_PREFILL_HINT, getString),
BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString),
BIND(npuw::llm::prefill_attn_hint, NPUW_LLM_PREFILL_ATTENTION_HINT, getString),
BIND(npuw::llm::generate_attn_hint, NPUW_LLM_GENERATE_ATTENTION_HINT, getString),
BIND(npuw::llm::shared_lm_head, NPUW_LLM_SHARED_HEAD, get)});
#undef BIND
}
@@ -25,7 +25,8 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
uint32_t num_stored_tokens = 0u;
uint32_t dim = 0u;
uint32_t max_generation_token_len = 0u;
bool v_tensors_transposed = false;
bool v_tensors_transposed_pre = false; // prefill
bool v_tensors_transposed_gen = false; // generate
};

LLMCompiledModel(const std::shared_ptr<ov::Model>& model,