Skip to content

Commit b161e5c

Browse files
popovaanas-suvorov
and authored
Removed 'slices' from EncodedImage (#2258)
`ov::Tensor slices` can be removed from `EncodedImage`, as it is used during resampling, which is currently a part of `encode()`, so there's no need to keep slices in `encode()` output. Ticket: CVS-167405 --------- Co-authored-by: Alexander Suvorov <[email protected]>
1 parent 3007e79 commit b161e5c

File tree

3 files changed

+45
-26
lines changed

3 files changed

+45
-26
lines changed

src/cpp/src/visual_language/minicpm/classes.cpp

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,21 @@ namespace {
1414

1515
std::string NATIVE_TAG = "<image>./</image>";
1616

17+
/**
18+
* @brief Represents the result of slicing an image into smaller patches.
19+
*
20+
* This struct is used in miniCPM inputs embedder to store the sliced image patches
21+
* and the target size of the processed image.
22+
*
23+
* @param slices A tensor containing the sliced image patches.
24+
* @param target_size The desired size of the image after processing.
25+
*/
26+
struct ImageSliceResult {
27+
ov::Tensor slices;
28+
ImageSize target_size;
29+
};
30+
31+
1732
int ensure_divide(int length, int patch_size) {
1833
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
1934
}
@@ -279,7 +294,7 @@ ov::Tensor prepare_vis_position_ids(
279294
return position_ids;
280295
}
281296

282-
EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
297+
std::pair<EncodedImage, ImageSliceResult> llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) {
283298
clip_image_u8 source = tensor_to_clip_image_u8(img);
284299
std::vector<std::vector<clip_image_u8>> imgs = slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split);
285300
std::vector<std::vector<ov::Tensor>> results;
@@ -379,6 +394,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
379394
}
380395
}
381396
}
397+
ImageSliceResult image_slice_result;
382398
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, ctx_clip.image_size / patch_size);
383399
encoder.set_tensor("position_ids", position_ids);
384400
encoder.infer();
@@ -387,7 +403,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
387403
if (1 == preprocessed.size()) {
388404
ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()};
389405
output_tensor.copy_to(resized_source);
390-
return {std::move(resized_source), resized_source_size};
406+
return {{std::move(resized_source), resized_source_size}, std::move(image_slice_result)};
391407
}
392408

393409
size_t old_hidden_size = output_tensor.get_shape().at(2);
@@ -396,13 +412,14 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
396412
std::copy_n(out, resized_source.get_size(), resized_source.data<float>());
397413

398414
size_t n_patches = tgt_sizes.at(1).height * tgt_sizes.at(1).width;
399-
ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
415+
image_slice_result.slices = ov::Tensor{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
400416
for (size_t col = 0; col < preprocessed.size() - 1; ++col) {
401417
for (size_t row = 0; row < preprocessed.at(1).size(); ++row) {
402-
std::copy_n(out + (col * preprocessed.at(1).size() + row + 1) * n_patches * old_hidden_size, n_patches * old_hidden_size, encoded_slices.data<float>() + (col * preprocessed.at(1).size() + row) * n_patches * old_hidden_size);
418+
std::copy_n(out + (col * preprocessed.at(1).size() + row + 1) * n_patches * old_hidden_size, n_patches * old_hidden_size, image_slice_result.slices.data<float>() + (col * preprocessed.at(1).size() + row) * n_patches * old_hidden_size);
403419
}
404420
}
405-
return {resized_source, resized_source_size, encoded_slices, tgt_sizes.at(1)};
421+
image_slice_result.target_size = tgt_sizes.at(1);
422+
return {{std::move(resized_source), resized_source_size}, std::move(image_slice_result)};
406423
}
407424

408425
} // namespace
@@ -416,26 +433,30 @@ EncodedImage VisionEncoderMiniCPM::encode(const ov::Tensor& image, const ov::Any
416433
ctx_clip.image_size = config.image_size;
417434
std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
418435
std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
419-
EncodedImage encoded_image = llava_image_embed_make_with_bytes_slice(ctx_clip, image, encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
420-
encoded_image.resampled_image = resample_encoded_image(encoded_image);
436+
437+
auto [encoded_image, image_slice_result] = llava_image_embed_make_with_bytes_slice(ctx_clip, image, encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
438+
encoded_image.resampled_image = resample_encoded_image(encoded_image, image_slice_result.slices, image_slice_result.target_size);
439+
if (image_slice_result.slices) {
440+
encoded_image.slices_shape = image_slice_result.slices.get_shape();
441+
}
421442
return encoded_image;
422443
}
423444

424-
ResampledImage VisionEncoderMiniCPM::resample_encoded_image(const EncodedImage& encoded_image) {
445+
ResampledImage VisionEncoderMiniCPM::resample_encoded_image(const EncodedImage& encoded_image, const ov::Tensor& slices, const ImageSize& target_size) {
425446
const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size});
426447
std::vector<std::vector<ov::Tensor>> vision_embed_tensors;
427-
if (encoded_image.slices) {
448+
if (slices) {
428449
size_t token_idx = 0;
429-
const ov::Shape& slices_shape = encoded_image.slices.get_shape();
450+
const ov::Shape& slices_shape = slices.get_shape();
430451
vision_embed_tensors.resize(slices_shape.at(0));
431452
for (size_t i = 0; i < slices_shape.at(0); ++i) {
432453
std::vector<ov::Tensor> vision_embeds;
433454
vision_embeds.resize(slices_shape.at(1));
434455
for (size_t ja = 0; ja < slices_shape.at(1); ++ja) {
435456
size_t d2 = slices_shape.at(2);
436457
size_t d3 = slices_shape.at(3);
437-
ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3};
438-
vision_embeds[ja] = resample(encoded_view, {encoded_image.slices_size});
458+
ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3};
459+
vision_embeds[ja] = resample(encoded_view, {target_size});
439460
}
440461
vision_embed_tensors[i] = vision_embeds;
441462
}
@@ -591,8 +612,8 @@ std::pair<std::string, std::vector<size_t>> InputsEmbedderMiniCPM::normalize_pro
591612
expanded_tag += m_vlm_config.im_id_start + std::to_string(new_image_id) + m_vlm_config.im_id_end;
592613
}
593614
expanded_tag += m_vlm_config.im_start + unk64 + m_vlm_config.im_end;
594-
if (encoded_image.slices) {
595-
ov::Shape slices_shape = encoded_image.slices.get_shape();
615+
ov::Shape slices_shape = encoded_image.slices_shape;
616+
if (slices_shape.size()) {
596617
for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) {
597618
for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) {
598619
expanded_tag += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end;
@@ -652,9 +673,9 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& unified_p
652673
++ids;
653674
std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
654675
ids += m_vlm_config.query_num;
655-
if (encoded_image.slices) {
676+
ov::Shape slices_shape = encoded_image.slices_shape;
677+
if (slices_shape.size()) {
656678
size_t token_idx = 0;
657-
const ov::Shape& slices_shape = encoded_image.slices.get_shape();
658679
for (size_t i = 0; i < slices_shape.at(0); ++i) {
659680
for (size_t ja = 0; ja < slices_shape.at(1); ++ja) {
660681
const ov::Tensor& vision_embed_tensor_i_j = encoded_image.resampled_image.vision_embed_tensors[i][ja];

src/cpp/src/visual_language/minicpm/classes.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ class VisionEncoderMiniCPM : public VisionEncoder {
2525
VLMConfig m_vlm_config;
2626

2727
ov::Tensor resample(const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes);
28+
29+
ResampledImage resample_encoded_image(const EncodedImage& image, const ov::Tensor& slices, const ImageSize& target_sizes);
2830
public:
2931
VisionEncoderMiniCPM(
3032
const std::filesystem::path& model_dir,
@@ -38,7 +40,6 @@ class VisionEncoderMiniCPM : public VisionEncoder {
3840
const std::string& device,
3941
const ov::AnyMap device_config);
4042
EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
41-
ResampledImage resample_encoded_image(const EncodedImage& image);
4243
};
4344

4445
class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {

src/cpp/src/visual_language/vision_encoder.hpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,12 @@ struct EncodedImage {
3636
/// @brief A size of an image used to compute embeddings for
3737
/// divided by ProcessorConfig's patch_size.
3838
ImageSize resized_source_size;
39-
/// @brief Embeddings of images obtained from a source image by
40-
/// slicing at no more than max_slice_nums pieces and resizing.
41-
/// The tensor's shape is
42-
/// [slice_y, slice_x, number_of_embeddings, embedding_size].
43-
/// slices_sizes.size() == slice_y * slice_x.
44-
ov::Tensor slices;
45-
/// @brief A size of images used to compute embeddings
46-
/// stored in slices member divided by ProcessorConfig's patch_size.
47-
ImageSize slices_size;
39+
40+
/// @brief Shape of embeddings of images obtained from a source image by slicing
41+
/// at no more than max_slice_nums pieces and resizing,
42+
/// This shape is [slice_y, slice_x, number_of_embeddings, embedding_size].
43+
/// Used only by MiniCPM
44+
ov::Shape slices_shape;
4845

4946
/// @brief Patches grid after llava_next preprocessing.
5047
/// Format: [num_patches_height, num_patches_width]

0 commit comments

Comments
 (0)