@@ -203,7 +203,7 @@ class MultimodalDataItem:
203
203
204
204
# the real data, pixel_values or audio_features
205
205
# data: Union[List[torch.Tensor], List[np.ndarray]]
206
- pixel_values : Union [torch .Tensor , np .ndarray ] = None
206
+ pixel_values : Union [torch .Tensor , np .ndarray , "PIL.Image" ] = None
207
207
audio_features : Union [torch .Tensor , np .ndarray ] = None
208
208
audio_feature_lens : Optional [List [torch .Tensor ]] = None
209
209
audio_offsets : Optional [List [Tuple [int , int ]]] = None
@@ -244,15 +244,16 @@ def set_pad_value(self):
244
244
"""
245
245
from sglang .srt .managers .mm_utils import hash_feature
246
246
247
- if self .precomputed_features is not None :
248
- self .hash = hash_feature (self .precomputed_features )
249
- elif self .is_audio ():
250
- if self .audio_features is not None :
251
- self .hash = hash_feature (self .audio_features )
252
- elif self .input_features is not None :
253
- self .hash = hash_feature (self .input_features )
254
- else :
255
- self .hash = hash_feature (self .pixel_values )
247
+ if self .hash is None :
248
+ if self .precomputed_features is not None :
249
+ self .hash = hash_feature (self .precomputed_features )
250
+ elif self .is_audio ():
251
+ if self .audio_features is not None :
252
+ self .hash = hash_feature (self .audio_features )
253
+ elif self .input_features is not None :
254
+ self .hash = hash_feature (self .input_features )
255
+ else :
256
+ self .hash = hash_feature (self .pixel_values )
256
257
257
258
assert self .hash is not None
258
259
self .pad_value = self .hash % (1 << 30 )
@@ -295,6 +296,13 @@ def from_dict(obj: dict):
295
296
ret .validate ()
296
297
return ret
297
298
299
def merge(self, other):
    """Merge ``other`` into this item in place.

    ``other``'s ``pixel_values``, ``image_sizes`` and ``image_offsets`` are
    appended to this item's via ``+=``. The two item hashes are then folded
    into a single combined hash and ``pad_value`` is refreshed from it.

    Args:
        other: The item whose data is folded into ``self``. If its ``hash``
            has not been computed yet, it is computed here (which also sets
            ``other.pad_value``) so that its content takes part in the
            combined hash.
    """
    # set_pad_value() only derives a content hash while ``hash`` is still
    # None. Combining a missing hash would yield a non-None value built from
    # ``None``; the final set_pad_value() would then keep it, so the merged
    # item's hash/pad_value would no longer depend on the data at all.
    if self.hash is None:
        self.set_pad_value()
    if other.hash is None:
        other.set_pad_value()

    # NOTE(review): ``+=`` concatenates for list-like containers only; on a
    # torch.Tensor / np.ndarray it would add element-wise instead. Presumably
    # callers only merge list-valued items -- confirm against call sites.
    self.pixel_values += other.pixel_values
    self.image_sizes += other.image_sizes
    self.image_offsets += other.image_offsets

    # Order-sensitive combination of the two per-item hashes.
    self.hash = hash((self.hash, other.hash))
    # ``hash`` is set, so this only recomputes ``pad_value`` from it.
    self.set_pad_value()
305
+
298
306
299
307
@dataclasses .dataclass
300
308
class MultimodalInputs :
0 commit comments