@@ -23,7 +23,8 @@
 import numpy as np


-def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True):
+def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True, keep_ir=True,
+                                       npu_dpu_groups=None):
     xml_path = os.path.join(dir, model_name + ".xml")
     bin_path = os.path.join(dir, model_name + ".bin")
     model.save(xml_path)
@@ -35,6 +36,8 @@ def update_names_of_IR_and_export_blob(model, model_name, dir, compile_blob=True
     core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                               "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
     core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})
+    if npu_dpu_groups is not None:
+        core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})

     model = core.read_model(xml_path)
     inputs = model.inputs
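For context, a minimal standalone sketch of what the new keyword does once it reaches OpenVINO: the DPU-group count is set as the NPU_DPU_GROUPS plugin property before the IR is read and compiled. The openvino import style, the IR path, and the use of core.compile_model below are illustrative assumptions, not code from this PR.

import openvino as ov

core = ov.Core()
# Same NPU properties the helper sets unconditionally.
core.set_property("NPU", {"NPU_COMPILATION_MODE_PARAMS":
                          "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"})
core.set_property("NPU", {"PERFORMANCE_HINT": "LATENCY"})

npu_dpu_groups = 6  # prefill value used later in this PR; decode passes None
if npu_dpu_groups is not None:
    # The property value is passed as a string, hence str(...) in the helper as well.
    core.set_property("NPU", {"NPU_DPU_GROUPS": str(npu_dpu_groups)})

model = core.read_model("decoder_layer_prefill.xml")  # placeholder IR path
compiled_model = core.compile_model(model, "NPU")     # compiled with the DPU-group hint applied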
@@ -272,11 +272,13 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
         input_len = 1
         decoder_name = f"decoder_layer_{layer_idx}"
         keep_position_ids = True
+        npu_dpu_groups = None
     else:
         input_len = kv_len
         decoder_name = "decoder_layer_prefill"
         layernorm_const = False
         keep_position_ids = False
+        npu_dpu_groups = 6

     single_decoder = LowBitLlamaMultiDecoderlayer(
         [1, input_len, num_heads * head_dim],
@@ -303,7 +305,8 @@ def convert_llama_layer(model, layer_idx, n_splits_linear, n_splits_down_proj,
     rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                         decoder_name,
                                                         temp_dir,
-                                                        True, False)
+                                                        True, False,
+                                                        npu_dpu_groups=npu_dpu_groups)

     if mode == "decode":
         if hasattr(curr_layer.self_attn.rotary_emb, "cos_cached"):
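Net effect of the second file: decode layers keep npu_dpu_groups=None, so the property is never set and the compiler default is preserved, while the prefill decoder blob is compiled with 6 DPU groups. A sketch of the resulting call site, with mode, single_decoder, decoder_name, and temp_dir assumed from the surrounding diff context:

npu_dpu_groups = None if mode == "decode" else 6  # 6 only for the prefill decoder
rest_blob_path = update_names_of_IR_and_export_blob(single_decoder,
                                                    decoder_name,
                                                    temp_dir,
                                                    True, False,  # compile_blob=True, keep_ir=False
                                                    npu_dpu_groups=npu_dpu_groups)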