@@ -174,8 +174,6 @@ def from_forward_batch(cls, forward_batch: ForwardBatch):
         )

     def compute_dp_attention_metadata(self):
-        # TODO(ch-wan): gathered_buffer here is larger than the actual required size in draft extend,
-        # we may use a smaller buffer in draft extend.

         cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
         dp_rank = get_attention_dp_rank()
@@ -190,6 +188,19 @@ def compute_dp_attention_metadata(self):
         self.dp_local_start_pos = dp_local_start_pos
         self.dp_local_num_tokens = dp_local_num_tokens

+        if self.global_num_tokens_for_logprob_cpu is not None:
+            # create a smaller buffer to reduce peak memory usage
+            self.gathered_buffer = torch.empty(
+                (
+                    sum(self.global_num_tokens_for_logprob_cpu),
+                    self.gathered_buffer.shape[1],
+                ),
+                dtype=self.gathered_buffer.dtype,
+                device=self.gathered_buffer.device,
+            )
+        else:
+            self.gathered_buffer = torch.empty_like(self.gathered_buffer)
+

 class LogitsProcessor(nn.Module):
     def __init__(
@@ -434,7 +445,7 @@ def _get_logits(
         if self.do_tensor_parallel_all_gather_dp_attn:
             logits_metadata.compute_dp_attention_metadata()
             hidden_states, local_hidden_states = (
-                torch.empty_like(logits_metadata.gathered_buffer),
+                logits_metadata.gathered_buffer,
                 hidden_states,
             )
             dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)
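
A minimal standalone sketch (not part of this diff) of the memory effect: sizing the gather destination from the summed per-rank logprob token counts and reusing it directly avoids both the padded allocation and the extra empty_like() copy previously made in _get_logits. All names and sizes below are hypothetical.

import torch

# Hypothetical shapes for illustration only.
hidden_dim = 4096
num_tokens_for_logprob_per_rank = [3, 1, 2, 5]   # tokens that need logprobs on each DP rank
padded_tokens_per_rank = 8                        # worst-case padded size used by the old buffer

# Old behavior: a padded gather buffer, plus an extra empty_like() copy at gather time.
padded_buffer = torch.empty(
    padded_tokens_per_rank * len(num_tokens_for_logprob_per_rank), hidden_dim
)
old_gather_dst = torch.empty_like(padded_buffer)  # second full-size allocation

# New behavior: one buffer sized to exactly the tokens that will be gathered, reused directly.
new_gather_dst = torch.empty(sum(num_tokens_for_logprob_per_rank), hidden_dim)

print(old_gather_dst.numel(), "vs", new_gather_dst.numel())  # 131072 vs 45056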