     Modality,
     MultimodalDataItem,
     MultimodalInputs,
+    global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -55,13 +56,17 @@ def __init__(
         self.quant_config = quant_config

         # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
-        self.has_vision = self._has_vision_weights(config)
-        if not self.has_vision:
+        self.has_vision_weights = self._has_vision_weights(config)
+        if not self.has_vision_weights:
             logger.warning(
                 "No vision weights found in checkpoint. Model will run in text-only mode. "
                 "Multimodal capabilities (image processing) will be unavailable."
             )

+        self.has_vision = (
+            self.has_vision_weights and global_server_args_dict["enable_multimodal"]
+        )
+
         if self.has_vision:
             self.vision_model = Llama4VisionModel(config.vision_config)
             self.multi_modal_projector = Llama4MultiModalProjector(config)
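To make the new gating explicit: below is a minimal standalone sketch (not part of the diff) of how the two flags combine. The helper name is hypothetical; the boolean logic mirrors the diff, where `has_vision_weights` reflects the checkpoint contents and `enable_multimodal` comes from the server args.

# Minimal sketch of the gating introduced above; compute_has_vision is a
# hypothetical helper, but the logic mirrors the diff.
def compute_has_vision(has_vision_weights: bool, enable_multimodal: bool) -> bool:
    # Vision components are built only if the checkpoint ships vision
    # weights AND the server was launched with multimodal enabled.
    return has_vision_weights and enable_multimodal

assert compute_has_vision(True, True) is True
assert compute_has_vision(True, False) is False  # weights present, feature disabled
assert compute_has_vision(False, True) is False  # text-only checkpoint (e.g. modelopt fp8 llama4)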
@@ -269,7 +274,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:

     def _should_skip_weight(self, name: str) -> bool:
         """Check if we should skip loading this weight."""
-        return "vision" in name and not self.has_vision
+        return not self.has_vision and (
+            "vision" in name or "multi_modal_projector" in name
+        )

     def _transform_weight_name(self, name: str) -> str:
         """Transform weight name by adding language_model prefix if needed."""
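For reference, a standalone sketch of the updated skip rule: with vision disabled, both the vision tower and the projector weights are now filtered out at load time (the old check missed `multi_modal_projector`). The weight names in the asserts are illustrative, not taken from a real checkpoint.

# Hedged, standalone version of the updated skip logic; the weight names
# below are hypothetical examples for demonstration only.
def should_skip_weight(name: str, has_vision: bool) -> bool:
    return not has_vision and ("vision" in name or "multi_modal_projector" in name)

assert should_skip_weight("vision_model.layers.0.fc1.weight", has_vision=False)
assert should_skip_weight("multi_modal_projector.linear_1.weight", has_vision=False)  # newly covered
assert not should_skip_weight("language_model.layers.0.gate_proj.weight", has_vision=False)
assert not should_skip_weight("vision_model.layers.0.fc1.weight", has_vision=True)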