
Commit 52c17fe

Optimize first token of C++ NPU by adding npu_dpu_groups (#12443)
* add npu_dpu_groups
* add check for env
* fix style
1 parent 66bd7ab commit 52c17fe

File tree: 2 files changed (+11, -2 lines)

python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -23,7 +23,8 @@
 import numpy as np
 
 
-def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True):
+def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True,
+                                       npu_dpu_groups=None):
     xml_path = os.path.join(dir, model_name + ".xml")
     bin_path = os.path.join(dir, model_name + ".bin")
     model.save(xml_path)
@@ -35,6 +36,11 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
     core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                               "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
     core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
+    if (
+        npu_dpu_groups is not None
+        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
+    ):
+        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})
 
     model = core.read_model(xml_path)
     inputs = model.inputs
```
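In isolation, the new guard amounts to the following. This is a minimal sketch, assuming OpenVINO's Python `Core` API; the `configure_npu_for_export` helper name is made up for illustration and does not exist in the repository:

```python
import os

from openvino.runtime import Core  # OpenVINO Python API


def configure_npu_for_export(core: Core, npu_dpu_groups=None):
    """Apply NPU compile-time properties before blob export (illustrative helper)."""
    core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                              "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
    core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
    # New in this commit: request a specific number of DPU groups, unless the
    # optimization is explicitly disabled through the environment variable.
    if (
        npu_dpu_groups is not None
        and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1"
    ):
        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})


core = Core()
configure_npu_for_export(core, npu_dpu_groups=6)  # prefill passes 6; decode passes None
```

Setting `IPEX_LLM_NPU_DISABLE_COMPILE_OPT=1` in the environment skips the extra property and keeps the previous compile behavior.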

python/llm/src/ipex_llm/transformers/npu_pipeline_model/llama.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -272,11 +272,13 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
         keep_position_ids = True
+        npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
         layernorm_const = False
         keep_position_ids = False
+        npu_dpu_groups = 6
 
     single_decoder = LowBitLlamaMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
@@ -303,7 +305,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False)
+                                                        True, False,
+                                                        npu_dpu_groups=npu_dpu_groups)
 
     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
```
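The net effect is that only the prefill decoder blob is compiled with 6 DPU groups (the change the commit title credits for the faster first token), while the per-layer decode blobs keep the default grouping. If the setting needs to be switched off on a particular setup, the environment variable from the first file can be set before conversion runs; a minimal sketch:

```python
import os

# Opt out of the new NPU_DPU_GROUPS compile optimization. This must be set
# before the blobs are exported, i.e. before convert_llama_layer is called.
os.environ["IPEX_LLM_NPU_DISABLE_COMPILE_OPT"] = "1"
```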
