Apply RoPE for SWA layers only.

dranger003 · dranger003 · commit 5999fdc8d17c · 2024-12-19T19:21:51.000-05:00
diff --git a/src/llama.cpp b/src/llama.cpp
@@ -14857,12 +14857,13 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
 
         // sliding window switch pattern
-        const int32_t n_layer_switch = 4;
+        const int32_t sliding_window_pattern = 4;
 
         for (int il = 0; il < n_layer; ++il) {
             // three layers sliding window attention (window size 4096) and ROPE
             // fourth layer uses global attention without positional embeddings
-            struct ggml_tensor * KQ_mask_l = (il % n_layer_switch < (n_layer_switch - 1)) ? KQ_mask_swa : KQ_mask;
+            const bool           is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
+            struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
 
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
@@ -14871,6 +14872,9 @@ struct llm_build_context {
 
             // self-attention
             {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
@@ -14893,15 +14897,24 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                if (is_sliding) {
+                    Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
                                         beta_fast, beta_slow);
-                cb(Qcur, "Qcur", il);
+                    cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                                        nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
+                    Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                                        rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
                                         attn_factor, beta_fast, beta_slow);
-                cb(Kcur, "Kcur", il);
+                    cb(Kcur, "Kcur", il);
+                } else {
+                    // For non-sliding layers, just reshape without applying RoPE
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    cb(Qcur, "Qcur", il);
+    
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
                                    KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);