@@ -3367,6 +3367,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_PLAMO2:
                 {
+                    // mamba parameters
+                    const uint32_t d_conv = hparams.ssm_d_conv;
+                    const uint32_t d_state = hparams.ssm_d_state;
+                    const uint32_t num_heads = hparams.ssm_dt_rank;
+                    const uint32_t intermediate_size = hparams.ssm_d_inner;
+                    const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
+
+                    // attention parameters
+                    const uint32_t qk_dim = hparams.n_embd_head_k;
+                    const uint32_t v_dim = hparams.n_embd_head_v;
+
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
@@ -3385,12 +3396,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
                         if (is_mamba_layer) {
-                            const uint32_t d_conv = hparams.ssm_d_conv;
-                            const uint32_t d_state = hparams.ssm_d_state;
-                            const uint32_t num_heads = hparams.ssm_dt_rank;
-                            const uint32_t intermediate_size = hparams.ssm_d_inner;
-                            const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
-
                             layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2 * intermediate_size}, 0);
                             layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, intermediate_size}, 0);
@@ -3407,9 +3412,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                             layer.ssm_b_norm = create_tensor(tn(LLM_TENSOR_SSM_B_NORM, i), {d_state}, 0);
                             layer.ssm_c_norm = create_tensor(tn(LLM_TENSOR_SSM_C_NORM, i), {d_state}, 0);
                         } else {
-                            const uint32_t head_dim = hparams.n_embd_head_k;
-                            const uint32_t qk_dim = head_dim;
-                            const uint32_t v_dim = head_dim;
                             const int64_t num_attention_heads = hparams.n_head(i);
                             const int64_t q_num_heads = num_attention_heads;
                             const int64_t num_key_value_heads = hparams.n_head_kv(i);
@@ -3420,8 +3422,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                             const int64_t v_proj_dim = v_num_heads * v_dim;
 
                             layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}, 0);
-                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim, num_attention_heads}, 0);
-                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim, k_num_heads}, 0);
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {qk_dim, num_attention_heads}, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {qk_dim, k_num_heads}, 0);
                             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {q_num_heads * v_dim, n_embd}, 0);
                         }
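
For context, a minimal standalone sketch (not part of the patch) of the shape arithmetic that the unchanged create_tensor calls rely on once qk_dim and v_dim are hoisted to the top of the case: the fused QKV projection is created as {n_embd, q_proj_dim + k_proj_dim + v_proj_dim} and the output projection as {q_num_heads * v_dim, n_embd}. The model width and head counts below are illustrative assumptions, not values read from a real PLaMo-2 GGUF.

    // Standalone sketch: how qk_dim / v_dim (now taken from hparams.n_embd_head_k /
    // hparams.n_embd_head_v) combine with the head counts to give the projection sizes.
    // All numeric values are illustrative assumptions.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_embd = 4096;   // assumed model width
        const uint32_t qk_dim = 128;    // stands in for hparams.n_embd_head_k
        const uint32_t v_dim  = 128;    // stands in for hparams.n_embd_head_v

        const int64_t q_num_heads = 32; // stands in for hparams.n_head(i)
        const int64_t k_num_heads = 4;  // stands in for hparams.n_head_kv(i)
        const int64_t v_num_heads = k_num_heads;

        const int64_t q_proj_dim = q_num_heads * qk_dim;
        const int64_t k_proj_dim = k_num_heads * qk_dim;
        const int64_t v_proj_dim = v_num_heads * v_dim;

        // wqkv shape: {n_embd, q_proj_dim + k_proj_dim + v_proj_dim}
        // wo shape:   {q_num_heads * v_dim, n_embd}
        std::printf("wqkv: {%u, %lld}\n", n_embd, (long long) (q_proj_dim + k_proj_dim + v_proj_dim));
        std::printf("wo:   {%lld, %u}\n", (long long) (q_num_heads * v_dim), n_embd);
        return 0;
    }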