
Commit 52c17fe

Optimize first token of C++ NPU by adding npu_dpu_groups (#12443)
* add npu_dpu_groups
* add check for env
* fix style
1 parent 66bd7ab commit 52c17fe

File tree: 2 files changed (+11, -2 lines)

python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -23,7 +23,8 @@
 import numpy as np
 
 
-def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True):
+def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True,
+                                       npu_dpu_groups=None):
     xml_path = os.path.join(dir, model_name + ".xml")
     bin_path = os.path.join(dir, model_name + ".bin")
     model.save(xml_path)
@@ -35,6 +36,11 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
     core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                               "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
     core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
+    if (
+        npu_dpu_groups is not None
+        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
+    ):
+        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})
 
     model = core.read_model(xml_path)
     inputs = model.inputs
```
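In isolation, the new guard amounts to the following. This is a minimal sketch, assuming OpenVINO's Python `Core` API; the `configure_npu_for_export` helper name is made up for illustration and does not exist in the repository:

```python
import os

from openvino.runtime import Core  # OpenVINO Python API


def configure_npu_for_export(core: Core, npu_dpu_groups=None):
    """Apply NPU compile-time properties before blob export (illustrative helper)."""
    core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                              "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
    core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
    # New in this commit: request a specific number of DPU groups, unless the
    # optimization is explicitly disabled through the environment variable.
    if (
        npu_dpu_groups is not None
        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
    ):
        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})


core = Core()
configure_npu_for_export(core, npu_dpu_groups=6)  # prefill passes 6; decode passes None
```

Setting `IPEX_LLM_NPU_DISABLE_COMPILE_OPT=1` in the environment skips the extra property and keeps the previous compile behavior.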

python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -272,11 +272,13 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
         keep_position_ids = True
+        npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
         layernorm_const = False
         keep_position_ids = False
+        npu_dpu_groups = 6
 
     single_decoder = LowBitLlamaMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
@@ -303,7 +305,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False)
+                                                        True, False,
+                                                        npu_dpu_groups=npu_dpu_groups)
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
```
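The net effect is that only the prefill decoder blob is compiled with 6 DPU groups (the change the commit title credits for the faster first token), while the per-layer decode blobs keep the default grouping. If the setting needs to be switched off on a particular setup, the environment variable from the first file can be set before conversion runs; a minimal sketch:

```python
import os

# Opt out of the new NPU_DPU_GROUPS compile optimization. This must be set
# before the blobs are exported, i.e. before convert_llama_layer is called.
os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1"
```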
