@@ -14857,12 +14857,13 @@ struct llm_build_context {
14857
14857
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
14858
14858
14859
14859
// sliding window switch pattern
14860
- const int32_t n_layer_switch = 4;
14860
+ const int32_t sliding_window_pattern = 4;
14861
14861
14862
14862
for (int il = 0; il < n_layer; ++il) {
14863
14863
// the first three layers of every four use sliding-window attention (window size 4096) with RoPE
14864
14864
// every fourth layer uses global attention without positional embeddings
14865
- struct ggml_tensor * KQ_mask_l = (il % n_layer_switch < (n_layer_switch - 1)) ? KQ_mask_swa : KQ_mask;
14865
+ const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
14866
+ struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
14866
14867
14867
14868
// norm
14868
14869
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
@@ -14871,6 +14872,9 @@ struct llm_build_context {
14871
14872
14872
14873
// self-attention
14873
14874
{
14875
+ // rope freq factors for 128k context
14876
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
14877
+
14874
14878
// compute Q and K and RoPE them
14875
14879
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
14876
14880
cb(Qcur, "Qcur", il);
@@ -14893,15 +14897,24 @@ struct llm_build_context {
14893
14897
cb(Vcur, "Vcur", il);
14894
14898
}
14895
14899
14896
- Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
14900
+ if (is_sliding) {
14901
+ Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
14897
14902
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
14898
14903
beta_fast, beta_slow);
14899
- cb(Qcur, "Qcur", il);
14904
+ cb(Qcur, "Qcur", il);
14900
14905
14901
- Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
14902
- nullptr , n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
14906
+ Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
14907
+ rope_factors , n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
14903
14908
attn_factor, beta_fast, beta_slow);
14904
- cb(Kcur, "Kcur", il);
14909
+ cb(Kcur, "Kcur", il);
14910
+ } else {
14911
+ // For non-sliding layers, just reshape without applying RoPE
14912
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
14913
+ cb(Qcur, "Qcur", il);
14914
+
14915
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
14916
+ cb(Kcur, "Kcur", il);
14917
+ }
14905
14918
14906
14919
cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
14907
14920
KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
0 commit comments