Skip to content

Commit f736e1a

Browse files
authored
Support softcap in PagedAttention APIs (#3599)
1 parent 8a5b92c commit f736e1a

File tree

7 files changed

+158
-41
lines changed

7 files changed

+158
-41
lines changed

csrc/cpu/aten/PagedAttention.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ void single_query_cached_kv_attention_forward_cpu(
2626
int64_t max_context_len,
2727
const c10::optional<at::Tensor>& alibi_slopes,
2828
const double k_scale,
29-
const double v_scale) {
29+
const double v_scale,
30+
const double softcap) {
3031
return single_query_cached_kv_attention_kernel_stub(
3132
kCPU,
3233
out,
@@ -41,7 +42,8 @@ void single_query_cached_kv_attention_forward_cpu(
4142
max_context_len,
4243
alibi_slopes,
4344
k_scale,
44-
v_scale);
45+
v_scale,
46+
softcap);
4547
}
4648

4749
void reshape_and_cache_cpu(
@@ -70,7 +72,8 @@ void flash_attn_varlen_cpu(
7072
at::Tensor& block_table,
7173
const c10::optional<at::Tensor>& alibi_slopes,
7274
const double k_scale,
73-
const double v_scale) {
75+
const double v_scale,
76+
const double softcap) {
7477
return flash_attn_var_len_kernel_stub(
7578
kCPU,
7679
out,
@@ -86,7 +89,8 @@ void flash_attn_varlen_cpu(
8689
block_table,
8790
alibi_slopes,
8891
k_scale,
89-
v_scale);
92+
v_scale,
93+
softcap);
9094
}
9195

9296
} // namespace cpu

csrc/cpu/aten/PagedAttention.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ void single_query_cached_kv_attention(
2121
int64_t max_context_len,
2222
const c10::optional<at::Tensor>& alibi_slopes,
2323
const double k_scale,
24-
const double v_scale);
24+
const double v_scale,
25+
const double softcap);
2526
}
2627

2728
void reshape_and_cache(
@@ -47,7 +48,8 @@ void flash_attn_varlen(
4748
at::Tensor& block_table,
4849
const c10::optional<at::Tensor>& alibi_slopes,
4950
const double k_scale,
50-
const double v_scale);
51+
const double v_scale,
52+
const double softcap);
5153

5254
using single_query_cached_kv_attention_fn = void (*)(
5355
at::Tensor& out, // [num_seqs, num_heads, head_size]
@@ -62,7 +64,8 @@ using single_query_cached_kv_attention_fn = void (*)(
6264
int64_t max_context_len,
6365
const c10::optional<at::Tensor>& alibi_slopes,
6466
const double k_scale,
65-
const double v_scale);
67+
const double v_scale,
68+
const double softcap);
6669

6770
using reshape_and_cache_fn = void (*)(
6871
at::Tensor& key,
@@ -87,7 +90,8 @@ using flash_attn_var_len_fn = void (*)(
8790
at::Tensor& block_table,
8891
const c10::optional<at::Tensor>& alibi_slopes,
8992
const double k_scale,
90-
const double v_scale);
93+
const double v_scale,
94+
const double softcap);
9195

9296
IPEX_DECLARE_DISPATCH(
9397
single_query_cached_kv_attention_fn,

0 commit comments

Comments (0)