@@ -14,6 +14,21 @@ namespace {
14
14
15
15
std::string NATIVE_TAG = " <image>./</image>" ;
16
16
17
+ /* *
18
+ * @brief Represents the result of slicing an image into smaller patches.
19
+ *
20
+ * This struct is used in miniCPM inputs embedder to store the sliced image patches
21
+ * and the target size of the processed image.
22
+ *
23
+ * @param slices A tensor containing the sliced image patches.
24
+ * @param target_size The desired size of the image after processing.
25
+ */
26
+ struct ImageSliceResult {
27
+ ov::Tensor slices;
28
+ ImageSize target_size;
29
+ };
30
+
31
+
17
32
int ensure_divide (int length, int patch_size) {
18
33
return std::max (static_cast <int >(std::round (static_cast <float >(length) / patch_size) * patch_size), patch_size);
19
34
}
@@ -279,7 +294,7 @@ ov::Tensor prepare_vis_position_ids(
279
294
return position_ids;
280
295
}
281
296
282
- EncodedImage llava_image_embed_make_with_bytes_slice (clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
297
+ std::pair< EncodedImage, ImageSliceResult> llava_image_embed_make_with_bytes_slice (clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
283
298
clip_image_u8 source = tensor_to_clip_image_u8 (img);
284
299
std::vector<std::vector<clip_image_u8>> imgs = slice_image (source, max_slice_nums, scale_resolution, patch_size, never_split);
285
300
std::vector<std::vector<ov::Tensor>> results;
@@ -379,6 +394,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
379
394
}
380
395
}
381
396
}
397
+ ImageSliceResult image_slice_result;
382
398
ov::Tensor position_ids = prepare_vis_position_ids (pixel_values, patch_attention_mask, tgt_sizes, patch_size, ctx_clip.image_size / patch_size);
383
399
encoder.set_tensor (" position_ids" , position_ids);
384
400
encoder.infer ();
@@ -387,7 +403,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
387
403
if (1 == preprocessed.size ()) {
388
404
ov::Tensor resized_source{ov::element::f32 , output_tensor.get_shape ()};
389
405
output_tensor.copy_to (resized_source);
390
- return {std::move (resized_source), resized_source_size};
406
+ return {{ std::move (resized_source), resized_source_size}, std::move (image_slice_result) };
391
407
}
392
408
393
409
size_t old_hidden_size = output_tensor.get_shape ().at (2 );
@@ -396,13 +412,14 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
396
412
std::copy_n (out, resized_source.get_size (), resized_source.data <float >());
397
413
398
414
size_t n_patches = tgt_sizes.at (1 ).height * tgt_sizes.at (1 ).width ;
399
- ov::Tensor encoded_slices {ov::element::f32 , {preprocessed.size () - 1 , preprocessed.at (1 ).size (), n_patches, old_hidden_size}};
415
+ image_slice_result. slices = ov::Tensor{ov::element::f32 , {preprocessed.size () - 1 , preprocessed.at (1 ).size (), n_patches, old_hidden_size}};
400
416
for (size_t col = 0 ; col < preprocessed.size () - 1 ; ++col) {
401
417
for (size_t row = 0 ; row < preprocessed.at (1 ).size (); ++row) {
402
- std::copy_n (out + (col * preprocessed.at (1 ).size () + row + 1 ) * n_patches * old_hidden_size, n_patches * old_hidden_size, encoded_slices .data <float >() + (col * preprocessed.at (1 ).size () + row) * n_patches * old_hidden_size);
418
+ std::copy_n (out + (col * preprocessed.at (1 ).size () + row + 1 ) * n_patches * old_hidden_size, n_patches * old_hidden_size, image_slice_result. slices .data <float >() + (col * preprocessed.at (1 ).size () + row) * n_patches * old_hidden_size);
403
419
}
404
420
}
405
- return {resized_source, resized_source_size, encoded_slices, tgt_sizes.at (1 )};
421
+ image_slice_result.target_size = tgt_sizes.at (1 );
422
+ return {{std::move (resized_source), resized_source_size}, std::move (image_slice_result)};
406
423
}
407
424
408
425
} // namespace
@@ -416,26 +433,30 @@ EncodedImage VisionEncoderMiniCPM::encode(const ov::Tensor& image, const ov::Any
416
433
ctx_clip.image_size = config.image_size ;
417
434
std::copy (config.norm_mean .begin (), config.norm_mean .end (), ctx_clip.image_mean );
418
435
std::copy (config.norm_std .begin (), config.norm_std .end (), ctx_clip.image_std );
419
- EncodedImage encoded_image = llava_image_embed_make_with_bytes_slice (ctx_clip, image, encoder, config.max_slice_nums , config.scale_resolution , config.patch_size , 0 == config.max_slice_nums );
420
- encoded_image.resampled_image = resample_encoded_image (encoded_image);
436
+
437
+ auto [encoded_image, image_slice_result] = llava_image_embed_make_with_bytes_slice (ctx_clip, image, encoder, config.max_slice_nums , config.scale_resolution , config.patch_size , 0 == config.max_slice_nums );
438
+ encoded_image.resampled_image = resample_encoded_image (encoded_image, image_slice_result.slices , image_slice_result.target_size );
439
+ if (image_slice_result.slices ) {
440
+ encoded_image.slices_shape = image_slice_result.slices .get_shape ();
441
+ }
421
442
return encoded_image;
422
443
}
423
444
424
- ResampledImage VisionEncoderMiniCPM::resample_encoded_image (const EncodedImage& encoded_image) {
445
+ ResampledImage VisionEncoderMiniCPM::resample_encoded_image (const EncodedImage& encoded_image, const ov::Tensor& slices, const ImageSize& target_size ) {
425
446
const ov::Tensor& resampled_source = resample (encoded_image.resized_source , {encoded_image.resized_source_size });
426
447
std::vector<std::vector<ov::Tensor>> vision_embed_tensors;
427
- if (encoded_image. slices ) {
448
+ if (slices) {
428
449
size_t token_idx = 0 ;
429
- const ov::Shape& slices_shape = encoded_image. slices .get_shape ();
450
+ const ov::Shape& slices_shape = slices.get_shape ();
430
451
vision_embed_tensors.resize (slices_shape.at (0 ));
431
452
for (size_t i = 0 ; i < slices_shape.at (0 ); ++i) {
432
453
std::vector<ov::Tensor> vision_embeds;
433
454
vision_embeds.resize (slices_shape.at (1 ));
434
455
for (size_t ja = 0 ; ja < slices_shape.at (1 ); ++ja) {
435
456
size_t d2 = slices_shape.at (2 );
436
457
size_t d3 = slices_shape.at (3 );
437
- ov::Tensor encoded_view{ov::element::f32 , {1 , d2, d3}, encoded_image. slices .data <float >() + (i * slices_shape.at (1 ) + ja) * d2 * d3};
438
- vision_embeds[ja] = resample (encoded_view, {encoded_image. slices_size });
458
+ ov::Tensor encoded_view{ov::element::f32 , {1 , d2, d3}, slices.data <float >() + (i * slices_shape.at (1 ) + ja) * d2 * d3};
459
+ vision_embeds[ja] = resample (encoded_view, {target_size });
439
460
}
440
461
vision_embed_tensors[i] = vision_embeds;
441
462
}
@@ -591,8 +612,8 @@ std::pair<std::string, std::vector<size_t>> InputsEmbedderMiniCPM::normalize_pro
591
612
expanded_tag += m_vlm_config.im_id_start + std::to_string (new_image_id) + m_vlm_config.im_id_end ;
592
613
}
593
614
expanded_tag += m_vlm_config.im_start + unk64 + m_vlm_config.im_end ;
594
- if ( encoded_image.slices ) {
595
- ov::Shape slices_shape = encoded_image. slices . get_shape ();
615
+ ov::Shape slices_shape = encoded_image.slices_shape ;
616
+ if ( slices_shape. size ()) {
596
617
for (size_t row_idx = 0 ; row_idx < slices_shape.at (0 ); ++row_idx) {
597
618
for (size_t col_idx = 0 ; col_idx < slices_shape.at (1 ); ++col_idx) {
598
619
expanded_tag += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end ;
@@ -652,9 +673,9 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& unified_p
652
673
++ids;
653
674
std::copy_n (emb, resampled_source.get_size (), inputs_embeds_data + std::distance (begin, ids) * m_vlm_config.hidden_size );
654
675
ids += m_vlm_config.query_num ;
655
- if (encoded_image.slices ) {
676
+ ov::Shape slices_shape = encoded_image.slices_shape ;
677
+ if (slices_shape.size ()) {
656
678
size_t token_idx = 0 ;
657
- const ov::Shape& slices_shape = encoded_image.slices .get_shape ();
658
679
for (size_t i = 0 ; i < slices_shape.at (0 ); ++i) {
659
680
for (size_t ja = 0 ; ja < slices_shape.at (1 ); ++ja) {
660
681
const ov::Tensor& vision_embed_tensor_i_j = encoded_image.resampled_image .vision_embed_tensors [i][ja];
0 commit comments