NPUW: Introduce attention hints, allow different kvcache layouts #32284
@@ -922,15 +922,23 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     decompose_GQA(prefill_model, true);
     decompose_GQA(kvcache_model, false);
 
+    const auto prefill_attn_hint = m_cfg.get<::intel_npu::NPUW_LLM_PREFILL_ATTENTION_HINT>();
+    const auto generate_attn_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_ATTENTION_HINT>();
+    const bool prefill_attn_dyn = prefill_attn_hint == ::intel_npu::npuw::llm::AttentionHint::DYNAMIC;
+    const bool generate_attn_dyn = generate_attn_hint == ::intel_npu::npuw::llm::AttentionHint::DYNAMIC;
+
     const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
     if (optimize_v_tensors) {
         LOG_DEBUG("Check and apply opt layout");
         LOG_BLOCK();
-        if (ov::npuw::util::optimize_value_tensors(kvcache_model, false)) {
-            NPUW_ASSERT(ov::npuw::util::optimize_value_tensors(prefill_model, true));
-            m_kvcache_desc.v_tensors_transposed = true;
-        } else {
-            LOG_DEBUG("vtensors optimisation not applied");
+        // Only optimize V tensors for static attention types
+        if (!generate_attn_dyn && ov::npuw::util::optimize_value_tensors(kvcache_model, false)) {
+            LOG_DEBUG("V-tensors transposed in generate model");
+            m_kvcache_desc.v_tensors_transposed_gen = true;
+        }
+        if (!prefill_attn_dyn && ov::npuw::util::optimize_value_tensors(prefill_model, true)) {
+            LOG_DEBUG("V-tensors transposed in prefill model");
+            m_kvcache_desc.v_tensors_transposed_pre = true;
Review thread on this change:

Reviewer: This changes the default behaviour a bit: previously, if the transpose wasn't needed, we applied the SDPA unroll to the generate model but not to the prefill model (as the transpose returned `false`). Now we apply the unroll to both generate and prefill, regardless of whether the transpose transformation for generate returned `false`. Is that expected? It seems more correct now, by the way.

Author: I am not sure what you refer to as the previous behavior. SDPA unroll is a part of the `optimize_value_tensors` transformation. When it was actually "needed", we applied the transformation first to the kvcache model and, IF IT WAS applied, required it (via assert) to be applied to the prefill model as well. Or do you refer to the fact that the model is modified even when the transformation returns `false`? Probably a way to work around this would be to clone the model within the pass and then return either the original model if the transpose pass has failed, or the transformed one if it was actually applied. That's probably a thing.
         }
     } else {
         LOG_DEBUG("Check and apply opt layout --- SKIPPED");
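For orientation, the hints above resolve to the ::intel_npu::npuw::llm::AttentionHint enum. Below is a minimal sketch of the gating idea; the local enum, the "STATIC"/"DYNAMIC" string values, the parsing helper, and the flags struct are illustrative assumptions rather than the actual NPUW headers.

```cpp
// Sketch only: stand-ins for the real ::intel_npu::npuw::llm::AttentionHint
// and the per-model layout flags kept in the KV-cache descriptor.
#include <stdexcept>
#include <string>

enum class AttentionHint { STATIC, DYNAMIC };

inline AttentionHint parse_attention_hint(const std::string& value) {
    if (value == "STATIC") {
        return AttentionHint::STATIC;
    }
    if (value == "DYNAMIC") {
        return AttentionHint::DYNAMIC;
    }
    throw std::invalid_argument("Unsupported attention hint: " + value);
}

// Prefill and generate each keep their own flag, so the two models may end
// up with different KV-cache (V-tensor) layouts. A flag is set only when the
// corresponding hint is not DYNAMIC and optimize_value_tensors() actually
// applied the transpose to that model.
struct VTensorLayoutFlags {
    bool transposed_pre = false;  // prefill model
    bool transposed_gen = false;  // generate (kvcache) model
};
```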
@@ -972,6 +980,21 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     merge_config_with(prefill_config, prefill_config_addition_value);
     merge_config_with(generate_config, generate_config_addition_value);
 
+    // Handle attention hints. FIXME: Maybe it makes sense to make those
+    // mutually exclusive with the precise configuration sections as well
+    const ov::AnyMap dyn_attn_opts = {
+        {"NPUW_ONLINE_PIPELINE", "REP"},
+        {"NPUW_ONLINE_ISOLATE", "ATTN"},
+        {"NPUW_ONLINE_KEEP_BLOCK_SIZE", "4"},
+        {"NPUW_UNFOLD_IREQS", "NO"},
+    };
+    if (prefill_attn_dyn) {
+        merge_config_with(prefill_config, dyn_attn_opts);
+    }
+    if (generate_attn_dyn) {
+        merge_config_with(generate_config, dyn_attn_opts);
+    }
+
     if (m_cfg.get<::intel_npu::NPUW_LLM_CACHE_ROPE>()) {
         LOG_DEBUG("Caching preROPE ");
         const uint32_t CACHE_ROPE_START = 2048;
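merge_config_with() already exists in this file; the hunk does not show its conflict policy, so the sketch below assumes one plausible semantic (injected defaults never override keys the user already set) and uses a suffixed name to mark it as hypothetical.

```cpp
// Hypothetical sketch of folding the dynamic-attention options into a config;
// the real merge_config_with() may resolve key conflicts differently.
#include <openvino/core/any.hpp>

static void merge_config_with_sketch(ov::AnyMap& config, const ov::AnyMap& addition) {
    for (const auto& [key, value] : addition) {
        // Assumption: options the caller already set win over injected defaults.
        config.emplace(key, value);  // std::map::emplace keeps an existing entry
    }
}

// Usage mirroring the hunk above: only the config of a model compiled with a
// DYNAMIC attention hint receives the extra partitioning/runtime options.
static void apply_dyn_attention(ov::AnyMap& generate_config, bool generate_attn_dyn) {
    const ov::AnyMap dyn_attn_opts = {
        {"NPUW_ONLINE_PIPELINE", "REP"},
        {"NPUW_ONLINE_ISOLATE", "ATTN"},
        {"NPUW_ONLINE_KEEP_BLOCK_SIZE", "4"},
        {"NPUW_UNFOLD_IREQS", "NO"},
    };
    if (generate_attn_dyn) {
        merge_config_with_sketch(generate_config, dyn_attn_opts);
    }
}
```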
@@ -1109,7 +1132,8 @@ void ov::npuw::LLMCompiledModel::serialize(std::ostream& stream, const ov::npuw:
         write(model_stream, m_kvcache_desc.num_stored_tokens);
         write(model_stream, m_kvcache_desc.dim);
         write(model_stream, m_kvcache_desc.max_generation_token_len);
-        write(model_stream, m_kvcache_desc.v_tensors_transposed);
+        write(model_stream, m_kvcache_desc.v_tensors_transposed_pre);
+        write(model_stream, m_kvcache_desc.v_tensors_transposed_gen);  // FIXME: bump required
         write(model_stream, m_prefill_chunk_size);
         write(model_stream, m_use_chunk_prefill);
         write(model_stream, m_max_lora_rank);
@@ -1318,7 +1342,8 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
         read(model_stream, compiled->m_kvcache_desc.num_stored_tokens);
         read(model_stream, compiled->m_kvcache_desc.dim);
         read(model_stream, compiled->m_kvcache_desc.max_generation_token_len);
-        read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed);
+        read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed_pre);
+        read(model_stream, compiled->m_kvcache_desc.v_tensors_transposed_gen);  // FIXME: bump required!
         read(model_stream, compiled->m_prefill_chunk_size);
         read(model_stream, compiled->m_use_chunk_prefill);
         read(model_stream, compiled->m_max_lora_rank);
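Both FIXME comments point at the same issue: the serialized blob now carries two layout flags instead of one, so the blob version has to be bumped and older blobs handled on import. Here is a hedged sketch of what backward-compatible reading could look like; the version constant, the raw read helper, and the choice to apply a legacy single flag to both models are assumptions, not the actual NPUW serializer.

```cpp
// Sketch only: illustrates why the FIXME asks for a serialization version
// bump. Constants and the read helper are stand-ins, not the NPUW ones.
#include <cstdint>
#include <istream>

template <typename T>
void read_raw(std::istream& stream, T& value) {
    stream.read(reinterpret_cast<char*>(&value), sizeof(T));
}

// Hypothetical version starting from which the split pre/gen flags are stored.
constexpr uint64_t BLOB_VERSION_SPLIT_V_FLAGS = 2;

void read_v_transposed_flags(std::istream& model_stream,
                             uint64_t blob_version,
                             bool& v_tensors_transposed_pre,
                             bool& v_tensors_transposed_gen) {
    if (blob_version >= BLOB_VERSION_SPLIT_V_FLAGS) {
        // New blobs: one flag per model, matching the fields written above.
        read_raw(model_stream, v_tensors_transposed_pre);
        read_raw(model_stream, v_tensors_transposed_gen);
    } else {
        // Old blobs stored a single flag; apply it to both models.
        bool legacy = false;
        read_raw(model_stream, legacy);
        v_tensors_transposed_pre = legacy;
        v_tensors_transposed_gen = legacy;
    }
}
```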
@@ -1409,6 +1434,8 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
         BIND(npuw::llm::prefill_chunk_size, NPUW_LLM_PREFILL_CHUNK_SIZE, get),
         BIND(npuw::llm::prefill_hint, NPUW_LLM_PREFILL_HINT, getString),
         BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString),
+        BIND(npuw::llm::prefill_attn_hint, NPUW_LLM_PREFILL_ATTENTION_HINT, getString),
+        BIND(npuw::llm::generate_attn_hint, NPUW_LLM_GENERATE_ATTENTION_HINT, getString),
         BIND(npuw::llm::shared_lm_head, NPUW_LLM_SHARED_HEAD, get)}});
 #undef BIND
 }
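With the BINDs in place, the two hints are exposed as regular compile-time properties. A hedged usage sketch follows; the two hint property keys come from the BIND list above, while the NPUW/LLM enablement keys, the accepted "DYNAMIC"/"STATIC" values, and the model path are assumptions for illustration only.

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // "model.xml" is a placeholder; the NPU_USE_NPUW / NPUW_LLM enablement
    // keys and the hint values are assumptions here.
    auto compiled = core.compile_model("model.xml",
                                       "NPU",
                                       {{"NPU_USE_NPUW", "YES"},
                                        {"NPUW_LLM", "YES"},
                                        {"NPUW_LLM_PREFILL_ATTENTION_HINT", "DYNAMIC"},
                                        {"NPUW_LLM_GENERATE_ATTENTION_HINT", "STATIC"}});
    return 0;
}
```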