@@ -170,8 +170,6 @@ def from_forward_batch(cls, forward_batch: ForwardBatch):
         )

     def compute_dp_attention_metadata(self):
-        # TODO(ch-wan): gathered_buffer here is larger than the actual required size in draft extend,
-        # we may use a smaller buffer in draft extend.

         cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
         dp_rank = get_attention_dp_rank()
@@ -186,6 +184,19 @@ def compute_dp_attention_metadata(self):
         self.dp_local_start_pos = dp_local_start_pos
         self.dp_local_num_tokens = dp_local_num_tokens

+        if self.global_num_tokens_for_logprob_cpu is not None:
+            # create a smaller buffer to reduce peak memory usage
+            self.gathered_buffer = torch.empty(
+                (
+                    sum(self.global_num_tokens_for_logprob_cpu),
+                    self.gathered_buffer.shape[1],
+                ),
+                dtype=self.gathered_buffer.dtype,
+                device=self.gathered_buffer.device,
+            )
+        else:
+            self.gathered_buffer = torch.empty_like(self.gathered_buffer)
+

 class LogitsProcessor(nn.Module):
     def __init__(
@@ -430,7 +441,7 @@ def _get_logits(
         if self.do_tensor_parallel_all_gather_dp_attn:
             logits_metadata.compute_dp_attention_metadata()
             hidden_states, local_hidden_states = (
-                torch.empty_like(logits_metadata.gathered_buffer),
+                logits_metadata.gathered_buffer,
                 hidden_states,
             )
             dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)
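
For context, the change above swaps a worst-case `gathered_buffer` allocation for one sized to the logprob tokens actually present across DP ranks (known ahead of the gather via `global_num_tokens_for_logprob_cpu`), and `_get_logits` then gathers directly into that buffer rather than into a fresh `empty_like` copy. Below is a minimal, self-contained sketch of the right-sizing idea only; the names and sizes (`hidden_size`, `max_tokens_per_rank`, the per-rank counts) are illustrative assumptions, not sglang's API.

```python
import torch

# Sketch: allocate a gather buffer for exactly the rows that will be gathered,
# instead of reusing a worst-case buffer, to reduce peak memory usage.

hidden_size = 8                                 # illustrative hidden dimension
max_tokens_per_rank = 16                        # hypothetical worst-case tokens per rank
global_num_tokens_for_logprob = [3, 5, 2, 4]    # hypothetical per-DP-rank token counts

# Worst-case allocation: rows for the maximum possible tokens on every rank.
worst_case_buffer = torch.empty(
    (max_tokens_per_rank * len(global_num_tokens_for_logprob), hidden_size),
    dtype=torch.float16,
)

# Right-sized allocation: rows only for the tokens that actually need logprobs,
# reusing the dtype and device of the original buffer as the commit does.
right_sized_buffer = torch.empty(
    (sum(global_num_tokens_for_logprob), hidden_size),
    dtype=worst_case_buffer.dtype,
    device=worst_case_buffer.device,
)

print(worst_case_buffer.shape)   # torch.Size([64, 8])
print(right_sized_buffer.shape)  # torch.Size([14, 8])
```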