     Modality,
     MultimodalDataItem,
     MultimodalInputs,
+    global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -55,13 +56,17 @@ def __init__(
         self.quant_config = quant_config

         # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
-        self.has_vision = self._has_vision_weights(config)
-        if not self.has_vision:
+        self.has_vision_weights = self._has_vision_weights(config)
+        if not self.has_vision_weights:
             logger.warning(
                 "No vision weights found in checkpoint. Model will run in text-only mode. "
                 "Multimodal capabilities (image processing) will be unavailable."
             )

+        self.has_vision = (
+            self.has_vision_weights and global_server_args_dict["enable_multimodal"]
+        )
+
         if self.has_vision:
             self.vision_model = Llama4VisionModel(config.vision_config)
             self.multi_modal_projector = Llama4MultiModalProjector(config)
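To make the new gating explicit: below is a minimal standalone sketch (not part of the diff) of how the two flags combine. The helper name is hypothetical; the boolean logic mirrors the diff, where `has_vision_weights` reflects the checkpoint contents and `enable_multimodal` comes from the server args.

# Minimal sketch of the gating introduced above; compute_has_vision is a
# hypothetical helper, but the logic mirrors the diff.
def compute_has_vision(has_vision_weights: bool, enable_multimodal: bool) -> bool:
    # Vision components are built only if the checkpoint ships vision
    # weights AND the server was launched with multimodal enabled.
    return has_vision_weights and enable_multimodal

assert compute_has_vision(True, True) is True
assert compute_has_vision(True, False) is False  # weights present, feature disabled
assert compute_has_vision(False, True) is False  # text-only checkpoint (e.g. modelopt fp8 llama4)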
@@ -269,7 +274,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:

     def _should_skip_weight(self, name: str) -> bool:
         """Check if we should skip loading this weight."""
-        return "vision" in name and not self.has_vision
+        return not self.has_vision and (
+            "vision" in name or "multi_modal_projector" in name
+        )

     def _transform_weight_name(self, name: str) -> str:
         """Transform weight name by adding language_model prefix if needed."""
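For reference, a standalone sketch of the updated skip rule: with vision disabled, both the vision tower and the projector weights are now filtered out at load time (the old check missed `multi_modal_projector`). The weight names in the asserts are illustrative, not taken from a real checkpoint.

# Hedged, standalone version of the updated skip logic; the weight names
# below are hypothetical examples for demonstration only.
def should_skip_weight(name: str, has_vision: bool) -> bool:
    return not has_vision and ("vision" in name or "multi_modal_projector" in name)

assert should_skip_weight("vision_model.layers.0.fc1.weight", has_vision=False)
assert should_skip_weight("multi_modal_projector.linear_1.weight", has_vision=False)  # newly covered
assert not should_skip_weight("language_model.layers.0.gate_proj.weight", has_vision=False)
assert not should_skip_weight("vision_model.layers.0.fc1.weight", has_vision=True)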