
Commit de81b7d

Further relax constraints to cuDNN 9.13 for disabling fused attn for kv caching (#2121)
Signed-off-by: Kshitij Lakhani <[email protected]>
1 parent 1e2c68d · commit de81b7d

File tree

  • transformer_engine/pytorch/attention/dot_product_attention

1 file changed: +2 −2 lines

transformer_engine/pytorch/attention/dot_product_attention/utils.py

Lines changed: 2 additions & 2 deletions
@@ -434,8 +434,8 @@ def get_attention_backend(
     # | FP8 | non-paged/paged | sm90 | thd | >= 1
     # Unfused | FP32/FP16/BF16 | non-paged/paged | all | bshd,sbhd,thd | >= 1
     if inference_params is not None:
-        if device_compute_capability == (8, 9) and cudnn_version <= (9, 12, 0):
-            logger.debug("Disabling FusedAttention for KV caching for sm89 and cuDNN <= 9.12")
+        if device_compute_capability == (8, 9) and cudnn_version <= (9, 13, 0):
+            logger.debug("Disabling FusedAttention for KV caching for sm89 and cuDNN <= 9.13")
             use_fused_attention = False
         if context_parallel:
             logger.debug("Disabling all backends for KV caching with context parallelism")
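For context, the sketch below illustrates the tuple-based version gate this commit adjusts: on sm89 (compute capability 8.9, Ada) devices, FusedAttention is skipped for KV caching whenever the detected cuDNN version is at or below 9.13.0. The wrapper function allow_fused_attention_for_kv_cache is hypothetical and only mirrors the comparison shown in the diff; it is not part of Transformer Engine's API.

# Minimal sketch (hypothetical helper, not Transformer Engine's actual API)
# of the version gate changed in this commit. Python compares version tuples
# element-wise, so (9, 13, 0) <= (9, 13, 0) is True while
# (9, 14, 0) <= (9, 13, 0) is False.

def allow_fused_attention_for_kv_cache(device_compute_capability, cudnn_version):
    """Return False when KV caching should fall back from FusedAttention."""
    if device_compute_capability == (8, 9) and cudnn_version <= (9, 13, 0):
        return False
    return True


# sm89 with cuDNN 9.13.0: fused attention is disabled for KV caching
assert not allow_fused_attention_for_kv_cache((8, 9), (9, 13, 0))
# sm89 with a newer cuDNN (e.g. 9.14.0): fused attention stays enabled
assert allow_fused_attention_for_kv_cache((8, 9), (9, 14, 0))
# sm90 (Hopper) is not affected by this check
assert allow_fused_attention_for_kv_cache((9, 0), (9, 13, 0))

Because the comparison uses an inclusive upper bound, relaxing it from (9, 12, 0) to (9, 13, 0) extends the fallback to every cuDNN release up to and including 9.13.0.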

Comments (0)