Commit 5561b73

Wovchena, yatarkan, as-suvorov, and Andrei Kochin authored
master: add Phi-4-multimodal-instruct (#2264)
Ticket: CVS-169086

---------

Co-authored-by: yatarkan <[email protected]>
Co-authored-by: Alexander Suvorov <[email protected]>
Co-authored-by: Andrei Kochin <[email protected]>
1 parent 12b853b commit 5561b73

File tree

18 files changed: +1080 −60 lines


site/docs/supported-models/_components/vlm-models-table/models.ts

Lines changed: 12 additions & 0 deletions
@@ -82,6 +82,18 @@ export const VLM_MODELS: VLMModelType[] = [
       },
     ],
   },
+  {
+    architecture: 'Phi4MMForCausalLM',
+    models: [
+      {
+        name: 'phi4mm',
+        loraSupport: false,
+        links: [
+          'https://huggingface.co/microsoft/Phi-4-multimodal-instruct',
+        ],
+      },
+    ],
+  },
   {
     architecture: 'Qwen2-VL',
     models: [

site/docs/use-cases/image-processing/_sections/_usage_options/index.mdx

Lines changed: 3 additions & 2 deletions
@@ -17,8 +17,9 @@ The prompt can contain `<ov_genai_image_i>` with `i` replaced with an actual zer
 3. LLaVA-NeXT: `<image>`
 4. MiniCPM-V-2_6: `(<image>./</image>)\n`
 5. Phi-3-vision: `<|image_i|>\n` - the index starts with one
-6. Qwen2-VL: `<|vision_start|><|image_pad|><|vision_end|>`
-7. Qwen2.5-VL: `<|vision_start|><|image_pad|><|vision_end|>`
+6. Phi-4-multimodal-instruct: `<|image_i|>\n` - the index starts with one
+7. Qwen2-VL: `<|vision_start|><|image_pad|><|vision_end|>`
+8. Qwen2.5-VL: `<|vision_start|><|image_pad|><|vision_end|>`
 
 If the prompt doesn't contain image tags, but images are provided, the tags are prepended to the prompt.
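For context, not part of the commit: a minimal C++ sketch of how the new tag is consumed through VLMPipeline. The model path and the zero-filled placeholder image are illustrative; a real application would point at a converted Phi-4-multimodal-instruct model and decode an actual picture.

// Minimal usage sketch (assumptions: a model converted to ./phi4mm-ov and a
// real image in place of the zero-filled placeholder tensor).
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    ov::genai::VLMPipeline pipe("./phi4mm-ov", "CPU");

    // Placeholder u8 image tensor of shape {1, H, W, 3}; decode a real image here.
    ov::Tensor image(ov::element::u8, {1, 448, 448, 3});
    std::memset(image.data<uint8_t>(), 0, image.get_byte_size());

    // Phi-4-multimodal-instruct reuses the Phi-3-vision tag; indices start at 1.
    std::string prompt = "<|image_1|>\nDescribe the image.";

    auto result = pipe.generate(
        prompt,
        ov::genai::images(std::vector<ov::Tensor>{image}),
        ov::genai::max_new_tokens(100));
    std::cout << result.texts.at(0) << '\n';
}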

src/cpp/include/openvino/genai/visual_language/pipeline.hpp

Lines changed: 3 additions & 0 deletions
@@ -114,6 +114,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// LLaVA-NeXT: <image>
     /// MiniCPM-V-2_6: (<image>./</image>)\n
     /// Phi-3-vision: <|image_i|>\n - the index starts with one
+    /// Phi-4-multimodal-instruct: <|image_i|>\n - the index starts with one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// Qwen2.5-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are
@@ -143,6 +144,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// LLaVA-NeXT: <image>
     /// MiniCPM-V-2_6: (<image>./</image>)\n
     /// Phi-3-vision: <|image_i|>\n - the index starts with one
+    /// Phi-4-multimodal-instruct: <|image_i|>\n - the index starts with one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// Qwen2.5-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are
@@ -173,6 +175,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// LLaVA-NeXT: <image>
     /// MiniCPM-V-2_6: (<image>./</image>)\n
     /// Phi-3-vision: <|image_i|>\n - the index starts with one
+    /// Phi-4-multimodal-instruct: <|image_i|>\n - the index starts with one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// Qwen2.5-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are

src/cpp/src/debug_utils.hpp

Lines changed: 2 additions & 0 deletions
@@ -144,6 +144,8 @@ inline ov::Tensor from_npy(const std::filesystem::path& npy) {
         tensor_type = ov::element::f32;
     } else if ("|u1" == type) {
         tensor_type = ov::element::u8;
+    } else if ("<i8" == type) {
+        tensor_type = ov::element::i64;
     } else {
         OPENVINO_THROW("Not implemented dtype");
     }
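The compared strings are NumPy .npy dtype descriptors: a byte-order mark (`<` little-endian, `|` not applicable), a kind character (`f` float, `u` unsigned, `i` signed), and a size in bytes, so "<i8" is a little-endian 64-bit signed integer. A sketch, not from the commit, of the same mapping as a lookup table:

// Sketch (assumption, not part of the diff): descriptor-to-element mapping as
// a table. "<f4" = little-endian 4-byte float, "|u1" = unsigned 1-byte
// integer, "<i8" = little-endian 8-byte signed integer.
#include <map>
#include <string>

#include "openvino/openvino.hpp"

inline ov::element::Type npy_descr_to_element(const std::string& descr) {
    static const std::map<std::string, ov::element::Type> types{
        {"<f4", ov::element::f32},
        {"|u1", ov::element::u8},
        {"<i8", ov::element::i64},
    };
    auto it = types.find(descr);
    OPENVINO_ASSERT(it != types.end(), "Not implemented dtype: ", descr);
    return it->second;
}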

src/cpp/src/visual_language/inputs_embedder.cpp

Lines changed: 5 additions & 0 deletions
@@ -11,6 +11,7 @@
 #include "visual_language/qwen2vl/classes.hpp"
 #include "visual_language/qwen2_5_vl/classes.hpp"
 #include "visual_language/phi3_vision/classes.hpp"
+#include "visual_language/phi4mm/classes.hpp"
 #include "visual_language/minicpm/classes.hpp"
 #include "visual_language/llava/classes.hpp"
 #include "visual_language/llava_next/classes.hpp"
@@ -192,6 +193,8 @@ InputsEmbedder::InputsEmbedder(const std::filesystem::path& model_dir,
         m_impl = std::make_shared<InputsEmbedderInternVLChat>(vlm_config, model_dir, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::PHI3_V) {
         m_impl = std::make_shared<InputsEmbedderPhi3V>(vlm_config, model_dir, device, device_config);
+    } else if (vlm_config.model_type == VLMModelType::PHI4MM) {
+        m_impl = std::make_shared<InputsEmbedderPhi4MM>(vlm_config, model_dir, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::QWEN2_VL) {
         m_impl = std::make_shared<InputsEmbedderQwen2VL>(vlm_config, model_dir, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::QWEN2_5_VL) {
@@ -218,6 +221,8 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map,
         m_impl = std::make_shared<InputsEmbedderInternVLChat>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::PHI3_V) {
         m_impl = std::make_shared<InputsEmbedderPhi3V>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
+    } else if (vlm_config.model_type == VLMModelType::PHI4MM) {
+        m_impl = std::make_shared<InputsEmbedderPhi4MM>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::QWEN2_VL) {
         m_impl = std::make_shared<InputsEmbedderQwen2VL>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::QWEN2_5_VL) {

src/cpp/src/visual_language/inputs_embedder.hpp

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ class InputsEmbedder {
     friend class InputsEmbedderLLaVANext;
     friend class InputsEmbedderInternVLChat;
    friend class InputsEmbedderPhi3V;
+    friend class InputsEmbedderPhi4MM;
     friend class InputsEmbedderQwen2VL;
     friend class InputsEmbedderQwen2_5_VL;
 };

src/cpp/src/visual_language/phi3_vision/classes.cpp

Lines changed: 55 additions & 52 deletions
@@ -20,48 +20,6 @@ void write_native(std::ostream& os, size_t idx) {
     os << "<|image_" << idx + 1 << "|>\n";
 }
 
-std::string normalize_prompt_phi3(
-    const std::string& prompt, size_t base_id, size_t n_images
-) {
-    std::smatch match;
-    std::regex_search(prompt, match, NATIVE_PATTERN);
-    auto [image_prompt, image_sequence] = universal_to_native(prompt, write_native);
-    if (!image_sequence.empty()) {
-        OPENVINO_ASSERT(match.empty(), "Prompt can contain only one type of image tags.");
-        verify_ids(image_sequence, base_id, n_images);
-        return image_prompt;
-    }
-    // Restore ids from native tags
-    if (!match.empty()) {
-        size_t image_id = std::stoul(match.str(1));
-        OPENVINO_ASSERT(image_id != 0, "Image tags must be greater than 0");
-        image_sequence.push_back(image_id - 1);
-        constexpr int submatch_id_to_return = 1;
-        for (std::sregex_token_iterator iter{
-            match.suffix().first,
-            prompt.end(),
-            NATIVE_PATTERN,
-            submatch_id_to_return
-        }; iter != std::sregex_token_iterator{}; ++iter) {
-            size_t image_id = std::stoul(*iter);
-            OPENVINO_ASSERT(image_id != 0, "Image tags must be greater than 0");
-            image_sequence.push_back(image_id - 1);
-        }
-        if (!image_sequence.empty()) {
-            verify_ids(image_sequence, base_id, n_images);
-            return image_prompt;
-        }
-    }
-    // Prepend native tags
-    std::stringstream stream;
-    for (size_t relative_id = 0; relative_id < n_images; relative_id++) {
-        image_sequence.push_back(base_id + relative_id);
-        write_native(stream, image_sequence.back());
-    }
-    stream << prompt;
-    return stream.str();
-}
-
 ov::Tensor padding_336(const ov::Tensor& unpadded) {
     ov::Shape _1ss3 = unpadded.get_shape();
     size_t s1 = _1ss3.at(1), s2 = _1ss3.at(2);
@@ -468,31 +426,76 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
     return res;
 }
 
+} // namespace
+
+namespace phi_utils {
+std::string normalize_prompt(
+    const std::string& prompt, size_t base_id, size_t n_images, const std::regex& native_pattern, void(*write_native)(std::ostream& os, size_t idx)
+) {
+    std::smatch match;
+    std::regex_search(prompt, match, native_pattern);
+    auto [image_prompt, image_sequence] = universal_to_native(prompt, write_native);
+    if (!image_sequence.empty()) {
+        OPENVINO_ASSERT(match.empty(), "Prompt can contain only one type of image tags.");
+        verify_ids(image_sequence, base_id, n_images);
+        return image_prompt;
+    }
+    // Restore ids from native tags
+    if (!match.empty()) {
+        size_t image_id = std::stoul(match.str(1));
+        OPENVINO_ASSERT(image_id != 0, "Image tags must be greater than 0");
+        image_sequence.push_back(image_id - 1);
+        constexpr int submatch_id_to_return = 1;
+        for (std::sregex_token_iterator iter{
+            match.suffix().first,
+            prompt.end(),
+            native_pattern,
+            submatch_id_to_return
+        }; iter != std::sregex_token_iterator{}; ++iter) {
+            size_t image_id = std::stoul(*iter);
+            OPENVINO_ASSERT(image_id != 0, "Image tags must be greater than 0");
+            image_sequence.push_back(image_id - 1);
+        }
+        if (!image_sequence.empty()) {
+            verify_ids(image_sequence, base_id, n_images);
+            return image_prompt;
+        }
+    }
+    // Prepend native tags
+    std::stringstream stream;
+    for (size_t relative_id = 0; relative_id < n_images; relative_id++) {
+        image_sequence.push_back(base_id + relative_id);
+        write_native(stream, image_sequence.back());
+    }
+    stream << prompt;
+    return stream.str();
+}
+
 /// @brief ov::Tensor is tokenized text, size_t is image tag
-std::vector<std::variant<ov::Tensor, size_t>> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) {
+std::vector<std::variant<ov::Tensor, size_t>> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer, const std::regex& native_pattern) {
     std::vector<std::variant<ov::Tensor, size_t>> tokenized;
     auto prefix_begin = text.begin();
     bool is_submatch = false;
     for (std::sregex_token_iterator iter{
         prefix_begin,
         text.end(),
-        NATIVE_PATTERN,
+        native_pattern,
         {0, 1} // Every match emits two values: whole match and submatch
     }; iter != std::sregex_token_iterator{}; ++iter) {
         if (is_submatch) {
             tokenized.push_back(std::stoul(iter->str()) - 1);
         } else {
             std::string regular_text{prefix_begin, iter->first};
             if (!regular_text.empty()) {
-                tokenized.push_back(tokenizer.encode(regular_text, ov::genai::add_special_tokens(true)).input_ids);
+                tokenized.push_back(tokenizer.encode(regular_text, {ov::genai::add_special_tokens(true)}).input_ids);
             }
             prefix_begin = iter->second;
         }
         is_submatch = !is_submatch;
     }
     std::string regular_text{prefix_begin, text.end()};
     if (!regular_text.empty()) {
-        tokenized.push_back(tokenizer.encode(regular_text, ov::genai::add_special_tokens(true)).input_ids);
+        tokenized.push_back(tokenizer.encode(regular_text, {ov::genai::add_special_tokens(true)}).input_ids);
     }
     return tokenized;
 }
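The `{0, 1}` submatch list above is what drives the alternation that the `is_submatch` flag tracks: for every regex match, the iterator first yields the whole match (index 0), then capture group 1. A standalone sketch, not part of the commit, of the same mechanism:

// Standalone sketch: std::sregex_token_iterator with {0, 1} interleaves the
// whole match and the first capture group, exactly as split_tokenize expects.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::regex pattern{R"(<\|image_(\d+)\|>)"};
    const std::string text = "describe <|image_1|> and <|image_2|>";
    for (std::sregex_token_iterator iter{text.begin(), text.end(), pattern, {0, 1}};
         iter != std::sregex_token_iterator{}; ++iter) {
        std::cout << *iter << '\n';  // prints: <|image_1|>, 1, <|image_2|>, 2
    }
}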
@@ -580,7 +583,7 @@ std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::
     return chunks;
 }
 
-} // namespace
+} // namespace phi_utils
 
 EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
     CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
@@ -664,7 +667,7 @@ InputsEmbedderPhi3V::InputsEmbedderPhi3V(
     IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {}
 
 std::pair<std::string, std::vector<size_t>> InputsEmbedderPhi3V::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector<EncodedImage>& images) const {
-    return {normalize_prompt_phi3(prompt, base_id, images.size()), {}};
+    return {phi_utils::normalize_prompt(prompt, base_id, images.size(), NATIVE_PATTERN, write_native), {}};
 }
 
 ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector<size_t>& image_sequence) {
@@ -677,7 +680,7 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
     std::vector<std::variant<ov::Tensor, size_t>> new_chat_tokens;
     if (m_is_chat_conversation) {
         auto start_tokenizer_time = std::chrono::steady_clock::now();
-        new_chat_tokens = split_tokenize(image_prompt, m_tokenizer);
+        new_chat_tokens = phi_utils::split_tokenize(image_prompt, m_tokenizer, NATIVE_PATTERN);
         auto end_tokenizer_time = std::chrono::steady_clock::now();
         metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
     } else {
@@ -690,16 +693,16 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
             templated_prompt = std::move(image_prompt);
         }
         auto start_tokenizer_time = std::chrono::steady_clock::now();
-        new_chat_tokens = split_tokenize(templated_prompt, m_tokenizer);
+        new_chat_tokens = phi_utils::split_tokenize(templated_prompt, m_tokenizer, NATIVE_PATTERN);
         auto end_tokenizer_time = std::chrono::steady_clock::now();
         metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
     }
-    ov::Tensor new_merged_tokens = insert_image_placeholders(new_chat_tokens, m_tokens_per_images);
+    ov::Tensor new_merged_tokens = phi_utils::insert_image_placeholders(new_chat_tokens, m_tokens_per_images);
     ov::Tensor new_tokens = update_history(new_merged_tokens);
     m_prev_hist_length = m_kv_cache_state.get_state().size();
     m_kv_cache_state.add_inputs(new_tokens);
 
-    std::vector<std::variant<ov::Tensor, size_t>> tokens = drop_image_placeholders(new_tokens);
+    std::vector<std::variant<ov::Tensor, size_t>> tokens = phi_utils::drop_image_placeholders(new_tokens);
     ov::Tensor inputs_embeds{ov::element::f32, {1, new_tokens.get_shape().at(1), m_vlm_config.hidden_size}};
     size_t offset = 0;
     CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard(m_embedding->get_request_queue().get());

src/cpp/src/visual_language/phi3_vision/classes.hpp

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@
 
 namespace ov::genai {
 
+namespace phi_utils {
+
+std::string normalize_prompt(
+    const std::string& prompt, size_t base_id, size_t n_images, const std::regex& native_pattern, void(*write_native)(std::ostream& os, size_t idx)
+);
+std::vector<std::variant<ov::Tensor, size_t>> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer, const std::regex& native_pattern);
+ov::Tensor insert_image_placeholders(const std::vector<std::variant<ov::Tensor, size_t>>& chunks, const std::vector<size_t>& tokens_per_images);
+std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::Tensor& tokens);
+
+}
+
 class VisionEncoderPhi3V : public VisionEncoder {
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_hd_feature_transformer;
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_vision_projection;
