From e79db7d2f8064c7298c09db14e02b1d7a41f9279 Mon Sep 17 00:00:00 2001
From: Xinyuan Tong
Date: Mon, 21 Jul 2025 09:10:17 +0000
Subject: [PATCH 1/2] fix: add method to retrieve token ID by modality in base processor and raise error if no match

Signed-off-by: Xinyuan Tong
---
 .../srt/multimodal/processors/base_processor.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index b79d90b987e..d4e1d89cbb3 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -101,6 +101,13 @@ def get_modality_of_token(self, token: str) -> Optional[Modality]:
 
         return None
 
+    def get_token_id_by_modality(self, modality: Modality) -> Optional[int]:
+        return {
+            Modality.IMAGE: self.image_token_id,
+            Modality.VIDEO: self.video_token_id,
+            Modality.AUDIO: self.audio_token_id,
+        }.get(modality)
+
     def parse_regex(self):
         if self.image_token_regex is None and self.image_token is not None:
             self.image_token_regex = re.compile(re.escape(self.image_token))
@@ -608,14 +615,12 @@ def process_and_combine_mm_data(
 
         # Add offsets to all items
         for mm_item in all_collected_items:
+            mm_token_id = mm_tokens.get_token_id_by_modality(mm_item.modality)
+            if mm_token_id is None:
+                raise ValueError(f"No token id found for modality: {mm_item.modality}")
             mm_item.offsets = self.get_mm_items_offset(
                 input_ids=input_ids,
-                mm_token_id={
-                    Modality.IMAGE: mm_tokens.image_token_id,
-                    Modality.MULTI_IMAGES: mm_tokens.image_token_id,
-                    Modality.VIDEO: mm_tokens.video_token_id,
-                    Modality.AUDIO: mm_tokens.audio_token_id,
-                }.get(mm_item.modality, None),
+                mm_token_id=mm_token_id,
             )
 
         return all_collected_items, input_ids, ret

From bee995a2e6ff6d0ba03bd18c0a839e6c8fc6e161 Mon Sep 17 00:00:00 2001
From: Xinyuan Tong
Date: Mon, 21 Jul 2025 09:18:52 +0000
Subject: [PATCH 2/2] fix: add support for
 MULTI_IMAGES modality in token ID retrieval

Signed-off-by: Xinyuan Tong
---
 python/sglang/srt/multimodal/processors/base_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index d4e1d89cbb3..3d548a19ee9 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -104,6 +104,7 @@ def get_modality_of_token(self, token: str) -> Optional[Modality]:
     def get_token_id_by_modality(self, modality: Modality) -> Optional[int]:
         return {
             Modality.IMAGE: self.image_token_id,
+            Modality.MULTI_IMAGES: self.image_token_id,
             Modality.VIDEO: self.video_token_id,
             Modality.AUDIO: self.audio_token_id,
         }.get(modality)