
Commit 8af9f58

Fix Local Attention off by 1 bug (#25927)
### Description

Previously, the local window size of the GQA op excluded the current token. This does not match standard HuggingFace implementations, where tokens are appended first and local masking is applied afterward; the mismatch can make the mask off by 1 during generation, leading to accuracy issues. This PR corrects the mismatch by including the current token in the window, which in practice effectively decreases the GQA window size by 1.

### Motivation and Context

This helps align our models with HuggingFace models.

---------

Co-authored-by: Kunal Vaishnavi <[email protected]>
1 parent 978bfca commit 8af9f58

9 files changed: +13 −16 lines changed
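A minimal sketch of the semantics change described in the commit message, mirroring the quantities used by the CPU kernel below (`seq_causal_length` counts the current token; positions are 0-based). This is illustrative Python, not ORT code:

```python
# Illustrative only: which key positions stay unmasked for one query token,
# before and after this commit. seq_causal_length counts the current token.

def visible_old(seq_causal_length: int, local_window_size: int) -> range:
    # Old behavior: local_window_size excluded the current token, so up to
    # local_window_size + 1 positions stayed visible.
    if local_window_size < 0 or seq_causal_length <= local_window_size + 1:
        return range(seq_causal_length)
    return range(seq_causal_length - local_window_size - 1, seq_causal_length)

def visible_new(seq_causal_length: int, local_window_size: int) -> range:
    # New behavior: local_window_size includes the current token, matching the
    # "append token, then mask" order of HuggingFace-style implementations.
    if local_window_size < 0 or seq_causal_length <= local_window_size:
        return range(seq_causal_length)
    return range(seq_causal_length - local_window_size, seq_causal_length)

# 8 tokens seen so far (current token at position 7), window size 4:
print(list(visible_old(8, 4)))  # [3, 4, 5, 6, 7] -> 5 visible positions
print(list(visible_new(8, 4)))  # [4, 5, 6, 7]    -> 4 visible positions
```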

onnxruntime/contrib_ops/cpu/bert/attention_parameters.h

Lines changed: 2 additions & 2 deletions

@@ -86,7 +86,7 @@ struct GroupQueryAttentionParameters : AttentionParameters {
   int kv_hidden_size;           // hidden size of key or value
   int seqlen_past_kv_cache;     // sequence length of past kv tensor
   int seqlen_present_kv_cache;  // sequence length of present kv tensor
-  int local_window_size;        // The window size excludes current token. It only includes tokens on the left side.
+  int local_window_size;        // Mask out tokens prior to total_sequence_length - local_window_size
   bool kv_share_buffer;
   bool is_subsequent_prompt;    // indicates whether we have past context and seqlen > 1
   bool is_first_prompt;         // indicates whether this is first decoding step

@@ -106,7 +106,7 @@ struct PagedAttentionParameters : AttentionParameters {
   int block_size;               // block size for kv cache
   int max_num_blocks_per_seq;   // max number of blocks per sequence for kv cache
   int num_blocks;               // number of blocks in kv cache
-  int local_window_size;        // The window size excludes current token. It only includes tokens on the left side.
+  int local_window_size;        // The window size includes new token. It only includes tokens on the left side.
   bool rotary_interleaved;
   float softcap;
 };

onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h

Lines changed: 4 additions & 5 deletions

@@ -297,16 +297,15 @@ class GQAAttentionBase {
       for (size_t seq = 0; seq < sequence_length; seq++) {
         size_t seq_causal_length = past_seqlen + seq + 1;

-        // local_window_size does not include the current query token, while window_size includes it.
         const bool should_apply_local_window = local_window_size_ >= 0 &&
-                                               seq_causal_length > static_cast<size_t>(local_window_size_) + 1;
+                                               seq_causal_length > static_cast<size_t>(local_window_size_);

-        const size_t start_offset = should_apply_local_window ? seq_causal_length - local_window_size_ - 1 : 0;
-        const size_t window_size = should_apply_local_window ? local_window_size_ + 1 : seq_causal_length;
+        const size_t start_offset = should_apply_local_window ? seq_causal_length - local_window_size_ : 0;
+        const size_t window_size = should_apply_local_window ? local_window_size_ : seq_causal_length;

         // Mask everything before local window, if local window should be applied
         if (should_apply_local_window) {
-          for (size_t total_seq_id = 0; total_seq_id < seq_causal_length - local_window_size_ - 1; total_seq_id++) {
+          for (size_t total_seq_id = 0; total_seq_id < seq_causal_length - local_window_size_; total_seq_id++) {
             if constexpr (std::is_same<U, float>::value) {
               output_softmax[total_seq_id] = 0.f;
             } else {

onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h

Lines changed: 1 addition & 3 deletions

@@ -223,9 +223,7 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) {
     }

     p.use_smooth_softmax = params.use_smooth_softmax;
-
-    // local_windows_size in GQA does not include current query token, while windows_size in this kernel includes it.
-    p.window_size = params.local_window_size + 1;
+    p.window_size = params.local_window_size;
   }

   auto kernel_fn = attention_kernel_batched_impl<Attention>;

onnxruntime/contrib_ops/cuda/bert/group_query_attention_impl.cu

Lines changed: 1 addition & 1 deletion

@@ -476,7 +476,7 @@ Status FlashAttention(
       parameters.seqlen_present_kv_cache, kv_sequence_length, parameters.rotary_dim,
       scale, parameters.softcap, is_causal, is_bf16, parameters.use_smooth_softmax, past_bsnh, parameters.num_splits,
       reinterpret_cast<void*>(data.softmax_lse_accum), reinterpret_cast<void*>(data.out_accum),
-      parameters.local_window_size, parameters.rotary_interleaved, parameters.is_packed_qkv));
+      parameters.local_window_size - 1, parameters.rotary_interleaved, parameters.is_packed_qkv));

   // if (parameters.left_padding && parameters.is_first_prompt) {
   //   ORT_RETURN_IF_ERROR(LaunchLeftPadLast(parameters, data, stream, device_prop.maxThreadsPerBlock));

onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu

Lines changed: 1 addition & 1 deletion

@@ -326,7 +326,7 @@ Status FlashAttention(
   ORT_RETURN_IF_ERROR(onnxruntime::flash::mha_varlen_fwd(
       device_prop, stream, q, key_cache, value_cache, output, cumulative_seqlens_q, cumulative_seqlens_kv,
       /*seqused_k*/ nullptr, block_table, softmax_lse, batch_size, num_heads, kv_num_heads, head_size, max_query_len,
-      max_seq_len, token_count, scale, softcap, /*is_causal*/ true, is_bf16, local_window_size, max_num_blocks_per_seq,
+      max_seq_len, token_count, scale, softcap, /*is_causal*/ true, is_bf16, local_window_size - 1, max_num_blocks_per_seq,
       block_size));

   DUMP_TENSOR_INIT();
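Both FlashAttention call sites above (GroupQueryAttention and PagedAttention) now pass `local_window_size - 1`. A hedged reading of why: the flash kernel's left window appears to count only keys strictly to the left of the query, while the op's `local_window_size` now counts the current token as well, so one is subtracted at that boundary. A toy sketch of that assumed conversion (not ORT code; the real call sites simply subtract 1 inline):

```python
# Hypothetical helper for illustration only; assumes a negative value means
# "no local window" on both sides of the boundary.
def to_flash_left_window(gqa_window: int) -> int:
    if gqa_window < 0:       # sliding window disabled in the GQA op
        return gqa_window    # assumed to remain "unlimited" for the flash kernel
    return gqa_window - 1    # drop the current token from the count

assert to_flash_left_window(4) == 3    # 4 keys incl. current -> 3 strictly-left keys
assert to_flash_left_window(-1) == -1  # disabled stays disabled
```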

onnxruntime/contrib_ops/webgpu/bert/attention.cc

Lines changed: 1 addition & 1 deletion

@@ -250,7 +250,7 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
   if (has_sliding_window) {
     // Sliding window
     shader.MainFunctionBody()
-        << "let should_apply_local_window = uniforms.local_window_size >= 0 && seq_causal_length > uniforms.local_window_size + 1;\n"
+        << "let should_apply_local_window = uniforms.local_window_size >= 0 && seq_causal_length > uniforms.local_window_size;\n"
        << "let start_offset = select(0, seq_causal_length - uniforms.local_window_size, should_apply_local_window);\n"
        << "let effective_seq_length = select(seq_causal_length, uniforms.local_window_size, should_apply_local_window);\n";
   } else {

onnxruntime/test/python/transformers/test_gqa.py

Lines changed: 1 addition & 1 deletion

@@ -582,7 +582,7 @@ def construct_local_mask(seqlen_q, seqlen_k, window_size, query_padding_mask, ke
     sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
     return torch.logical_or(
         col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
-        col_idx < row_idx + sk - sq - window_size[0],
+        col_idx <= row_idx + sk - sq - window_size[0],
     )
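For reference, a self-contained sanity check of the updated mask condition, using a simplified variant of the `construct_local_mask` helper above (padding-mask arguments dropped; values are illustrative): with a left window of 4 and the `<=` comparison, exactly 4 keys per query row stay visible, current token included, matching the new GQA semantics.

```python
import torch

# Simplified reference mask (True = masked out); mirrors the updated comparison.
def local_mask(seqlen_q: int, seqlen_k: int, window_size: tuple) -> torch.Tensor:
    row_idx = torch.arange(seqlen_q).view(-1, 1)
    col_idx = torch.arange(seqlen_k)
    sk = torch.full_like(col_idx, seqlen_k)
    sq = seqlen_q
    return torch.logical_or(
        col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
        col_idx <= row_idx + sk - sq - window_size[0],
    )

mask = local_mask(seqlen_q=1, seqlen_k=8, window_size=(4, 0))
visible = (~mask).nonzero(as_tuple=True)[1].tolist()
print(visible)  # [4, 5, 6, 7] -> 4 visible keys, current token (position 7) included
```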

onnxruntime/test/python/transformers/test_gqa_cpu.py

Lines changed: 1 addition & 1 deletion

@@ -1122,7 +1122,7 @@ def construct_local_mask(
     sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
     return torch.logical_or(
         col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
-        col_idx < row_idx + sk - sq - window_size[0],
+        col_idx <= row_idx + sk - sq - window_size[0],
     )

onnxruntime/test/python/transformers/test_paged_attention_cuda.py

Lines changed: 1 addition & 1 deletion

@@ -331,7 +331,7 @@ def construct_local_mask(
     sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk
     return torch.logical_or(
         col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk),
-        col_idx < row_idx + sk - sq - window_size[0],
+        col_idx <= row_idx + sk - sq - window_size[0],
     )
