 import torch
 from torch.backends.cuda import (
-    can_use_flash_attention,
-    can_use_efficient_attention,
     can_use_cudnn_attention,
+    can_use_efficient_attention,
+    can_use_flash_attention,
 )
-from torch.nn.attention import SDPBackend, SDPAParams
+from torch.nn.attention import SDPAParams, SDPBackend


 def filter_sdpa_kernels(
@@ -20,9 +20,7 @@ def filter_sdpa_kernels(
     enable_gqa: bool,
     **kwargs,
 ) -> List[SDPBackend]:
-    params = SDPAParams(
-        query, key, value, attn_mask, dropout_p, is_causal, enable_gqa
-    )
+    params = SDPAParams(query, key, value, attn_mask, dropout_p, is_causal, enable_gqa)
     new_kernels = []
     for kernel in sdpa_kernels:
         if kernel == SDPBackend.FLASH_ATTENTION and not can_use_flash_attention(params):
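For context, a minimal usage sketch of the reformatted filter_sdpa_kernels helper follows. The argument order, tensor shapes, and device handling are assumptions inferred from the diff, not part of this commit; torch.nn.attention.sdpa_kernel and torch.nn.functional.scaled_dot_product_attention are standard PyTorch APIs.

# Hypothetical usage sketch; filter_sdpa_kernels is the helper reformatted above,
# and its positional argument order here is an assumption.
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from torch.nn.functional import scaled_dot_product_attention

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
q = torch.randn(2, 8, 128, 64, device=device, dtype=dtype)
k = torch.randn_like(q)
v = torch.randn_like(q)

# Start from a preference list and keep only the backends whose constraints
# (dtype, head size, mask, dropout, ...) are satisfied for these inputs.
preferred = [SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.MATH]
usable = filter_sdpa_kernels(
    preferred, q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True, enable_gqa=False
)

# Restrict SDPA to the surviving backends for this call.
with sdpa_kernel(usable):
    out = scaled_dot_product_attention(q, k, v, is_causal=True)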
@@ -103,7 +101,10 @@ def mask_cache_bool(
 ) -> torch.Tensor:
     # Usual causal mask:
     mask = torch.ones(
-        max_seq_length, max_seq_length, device=device, dtype=dtype,
+        max_seq_length,
+        max_seq_length,
+        device=device,
+        dtype=dtype,
     ).triu(diagonal=1)
     if sliding_window_size is not None:
         mask += torch.ones_like(mask).tril(diagonal=-sliding_window_size)
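The mask_cache_bool change is purely cosmetic; the mask it builds is unchanged. A small standalone illustration of the same triu/tril pattern (not code from the commit) shows what the combination produces:

# Illustrative sketch of the causal + sliding-window pattern above.
# Nonzero entries mark positions that attention is NOT allowed to use.
import torch

max_seq_length = 6
sliding_window_size = 3

# Upper triangle above the diagonal: future positions are masked (causal).
causal = torch.ones(max_seq_length, max_seq_length).triu(diagonal=1)
# Lower band further back than the window: positions too far in the past are masked.
too_old = torch.ones(max_seq_length, max_seq_length).tril(diagonal=-sliding_window_size)
mask = causal + too_old
print(mask)
# Row i is zero only for columns j with i - sliding_window_size < j <= i,
# i.e. each token attends to itself and the previous (sliding_window_size - 1) tokens.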
@@ -147,25 +148,52 @@ def mask_slice_bool(
     tp_dtype = token_positions.dtype
     batch_size, n_query_groups, _ = token_positions.shape
     assert n_head % n_query_groups == 0 and n_head >= n_query_groups
-    token_positions = token_positions.to(device=device).unsqueeze(2).expand(
-        -1, -1, num, -1,
+    token_positions = (
+        token_positions.to(device=device)
+        .unsqueeze(2)
+        .expand(
+            -1,
+            -1,
+            num,
+            -1,
+        )
     )
     kwargs = dict(device=device, dtype=tp_dtype)
-    bool_mask = torch.arange(
-        input_pos, input_pos + num, **kwargs,
-    ).view(1, 1, -1, 1).expand_as(token_positions) < token_positions
-    if sliding_window_size is not None:
-        extra_mask = torch.arange(
-            input_pos - sliding_window_size,
-            input_pos + num - sliding_window_size,
+    bool_mask = (
+        torch.arange(
+            input_pos,
+            input_pos + num,
             **kwargs,
-        ).view(1, 1, -1, 1).expand_as(token_positions) >= token_positions
+        )
+        .view(1, 1, -1, 1)
+        .expand_as(token_positions)
+        < token_positions
+    )
+    if sliding_window_size is not None:
+        extra_mask = (
+            torch.arange(
+                input_pos - sliding_window_size,
+                input_pos + num - sliding_window_size,
+                **kwargs,
+            )
+            .view(1, 1, -1, 1)
+            .expand_as(token_positions)
+            >= token_positions
+        )
         bool_mask |= extra_mask
     if n_head != n_query_groups:
         q_per_kv = n_head // n_query_groups
-        bool_mask = bool_mask.unsqueeze(2).expand(
-            -1, -1, q_per_kv, -1, -1,
-        ).reshape(batch_size, n_head, num, -1)
+        bool_mask = (
+            bool_mask.unsqueeze(2)
+            .expand(
+                -1,
+                -1,
+                q_per_kv,
+                -1,
+                -1,
+            )
+            .reshape(batch_size, n_head, num, -1)
+        )
     return bool_mask

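A hypothetical call sketch for the reformatted mask_slice_bool follows. The keyword names mirror identifiers visible in the diff, but the exact signature, the module it lives in, and the KV-cache layout of token_positions are assumptions.

# Hypothetical usage sketch: decoding num=2 new tokens at positions 8 and 9
# against a KV cache of 8 slots whose stored token positions are 0..7.
import torch

batch_size, n_query_groups, n_head, cache_len = 1, 2, 4, 8
token_positions = torch.arange(cache_len).view(1, 1, -1).expand(batch_size, n_query_groups, -1)

bool_mask = mask_slice_bool(         # assumed importable from the module changed above
    input_pos=8,                     # first query position
    num=2,                           # number of new query tokens
    token_positions=token_positions,
    n_head=n_head,
    device=torch.device("cpu"),
    sliding_window_size=4,
)
print(bool_mask.shape)  # (batch_size, n_head, num, cache_len) -> (1, 4, 2, 8)
# True marks cache slots the new queries must not attend to: slots holding
# positions after the query, or further back than the sliding window.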
@@ -197,7 +225,12 @@ def build_mask_slice(

     """
     bool_mask = mask_slice_bool(
-        input_pos, num, token_positions, n_head, device, sliding_window_size,
+        input_pos,
+        num,
+        token_positions,
+        n_head,
+        device,
+        sliding_window_size,
     )
     mask = torch.zeros(bool_mask.shape, dtype=dtype, device=device)
     mask.masked_fill_(bool_mask, minus_infinity(dtype))
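Finally, a sketch of how an additive mask slice like the one build_mask_slice produces is typically consumed. The shapes are made up, and float("-inf") stands in for the repository's minus_infinity helper, whose definition is not shown in this diff.

# Sketch: a float mask of shape (batch, n_head, num, cache_len), 0 where a slot is
# visible and -inf where it is blocked, can be passed as attn_mask to SDPA.
import torch
from torch.nn.functional import scaled_dot_product_attention

batch, n_head, num, cache_len, head_dim = 1, 4, 2, 8, 16
dtype = torch.float32

bool_mask = torch.zeros(batch, n_head, num, cache_len, dtype=torch.bool)
bool_mask[..., -1] = True  # pretend the last cache slot is not visible
mask = torch.zeros(bool_mask.shape, dtype=dtype)
# Same pattern as build_mask_slice, with float("-inf") in place of minus_infinity(dtype).
mask.masked_fill_(bool_mask, float("-inf"))

q = torch.randn(batch, n_head, num, head_dim, dtype=dtype)
k = torch.randn(batch, n_head, cache_len, head_dim, dtype=dtype)
v = torch.randn(batch, n_head, cache_len, head_dim, dtype=dtype)
out = scaled_dot_product_attention(q, k, v, attn_mask=mask)
print(out.shape)  # torch.Size([1, 4, 2, 16])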