Commit 5561b73

Wovchena, yatarkan, as-suvorov, and Andrei Kochin authored
master: add Phi-4-multimodal-instruct (#2264)
Ticket: CVS-169086

---------

Co-authored-by: yatarkan <[email protected]>
Co-authored-by: Alexander Suvorov <[email protected]>
Co-authored-by: Andrei Kochin <[email protected]>
1 parent 12b853b commit 5561b73

File tree

18 files changed: +1080 −60 lines


site/docs/supported-models/_components/vlm-models-table/models.ts

Lines changed: 12 additions & 0 deletions
@@ -82,6 +82,18 @@ export const VLM_MODELS: VLMModelType[] = [
       },
     ],
   },
+  {
+    architecture: 'Phi4MMForCausalLM',
+    models: [
+      {
+        name: 'phi4mm',
+        loraSupport: false,
+        links: [
+          'https://huggingface.co/microsoft/Phi-4-multimodal-instruct',
+        ],
+      },
+    ],
+  },
   {
     architecture: 'Qwen2-VL',
     models: [

site/docs/use-cases/image-processing/_sections/_usage_options/index.mdx

Lines changed: 3 additions & 2 deletions
@@ -17,8 +17,9 @@ The prompt can contain `<ov_genai_image_i>` with `i` replaced with an actual zer
 3. LLaVA-NeXT: `<image>`
 4. MiniCPM-V-2_6: `(<image>./</image>)\n`
 5. Phi-3-vision: `<|image_i|>\n` - the index starts with one
-6. Qwen2-VL: `<|vision_start|><|image_pad|><|vision_end|>`
-7. Qwen2.5-VL: `<|vision_start|><|image_pad|><|vision_end|>`
+6. Phi-4-multimodal-instruct: `<|image_i|>\n` - the index starts with one
+7. Qwen2-VL: `<|vision_start|><|image_pad|><|vision_end|>`
+8. Qwen2.5-VL: `<|vision_start|><|image_pad|><|vision_end|>`
 
 If the prompt doesn't contain image tags, but images are provided, the tags are prepended to the prompt.
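For context, not part of the commit: a minimal C++ sketch of how the new tag is consumed through VLMPipeline. The model path and the zero-filled placeholder image are illustrative; a real application would point at a converted Phi-4-multimodal-instruct model and decode an actual picture.

// Minimal usage sketch (assumptions: a model converted to ./phi4mm-ov and a
// real image in place of the zero-filled placeholder tensor).
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    ov::genai::VLMPipeline pipe("./phi4mm-ov", "CPU");

    // Placeholder u8 image tensor of shape {1, H, W, 3}; decode a real image here.
    ov::Tensor image(ov::element::u8, {1, 448, 448, 3});
    std::memset(image.data<uint8_t>(), 0, image.get_byte_size());

    // Phi-4-multimodal-instruct reuses the Phi-3-vision tag; indices start at 1.
    std::string prompt = "<|image_1|>\nDescribe the image.";

    auto result = pipe.generate(
        prompt,
        ov::genai::images(std::vector<ov::Tensor>{image}),
        ov::genai::max_new_tokens(100));
    std::cout << result.texts.at(0) << '\n';
}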

src/cpp/include/openvino/genai/visual_language/pipeline.hpp

Lines changed: 3 additions & 0 deletions
@@ -114,6 +114,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// LLaVA-NeXT: <image>
     /// MiniCPM-V-2_6: (<image>./</image>)\n
     /// Phi-3-vision: <|image_i|>\n - the index starts with one
+    /// Phi-4-multimodal-instruct: <|image_i|>\n - the index starts with one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// Qwen2.5-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are
@@ -143,6 +144,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// LLaVA-NeXT: <image>
     /// MiniCPM-V-2_6: (<image>./</image>)\n
     /// Phi-3-vision: <|image_i|>\n - the index starts with one
+    /// Phi-4-multimodal-instruct: <|image_i|>\n - the index starts with one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// Qwen2.5-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are
@@ -173,6 +175,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// LLaVA-NeXT: <image>
     /// MiniCPM-V-2_6: (<image>./</image>)\n
     /// Phi-3-vision: <|image_i|>\n - the index starts with one
+    /// Phi-4-multimodal-instruct: <|image_i|>\n - the index starts with one
     /// Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// Qwen2.5-VL: <|vision_start|><|image_pad|><|vision_end|>
     /// If the prompt doesn't contain image tags, but images are

src/cpp/src/debug_utils.hpp

Lines changed: 2 additions & 0 deletions
@@ -144,6 +144,8 @@ inline ov::Tensor from_npy(const std::filesystem::path& npy) {
         tensor_type = ov::element::f32;
     } else if ("|u1" == type) {
         tensor_type = ov::element::u8;
+    } else if ("<i8" == type) {
+        tensor_type = ov::element::i64;
     } else {
         OPENVINO_THROW("Not implemented dtype");
     }
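The compared strings are NumPy .npy dtype descriptors: a byte-order mark (`<` little-endian, `|` not applicable), a kind character (`f` float, `u` unsigned, `i` signed), and a size in bytes, so "<i8" is a little-endian 64-bit signed integer. A sketch, not from the commit, of the same mapping as a lookup table:

// Sketch (assumption, not part of the diff): descriptor-to-element mapping as
// a table. "<f4" = little-endian 4-byte float, "|u1" = unsigned 1-byte
// integer, "<i8" = little-endian 8-byte signed integer.
#include <map>
#include <string>

#include "openvino/openvino.hpp"

inline ov::element::Type npy_descr_to_element(const std::string& descr) {
    static const std::map<std::string, ov::element::Type> types{
        {"<f4", ov::element::f32},
        {"|u1", ov::element::u8},
        {"<i8", ov::element::i64},
    };
    auto it = types.find(descr);
    OPENVINO_ASSERT(it != types.end(), "Not implemented dtype: ", descr);
    return it->second;
}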

src/cpp/src/visual_language/inputs_embedder.cpp

Lines changed: 5 additions & 0 deletions
@@ -11,6 +11,7 @@
 #include "visual_language/qwen2vl/classes.hpp"
 #include "visual_language/qwen2_5_vl/classes.hpp"
 #include "visual_language/phi3_vision/classes.hpp"
+#include "visual_language/phi4mm/classes.hpp"
 #include "visual_language/minicpm/classes.hpp"
 #include "visual_language/llava/classes.hpp"
 #include "visual_language/llava_next/classes.hpp"
@@ -192,6 +193,8 @@ InputsEmbedder::InputsEmbedder(const std::filesystem::path& model_dir,
         m_impl = std::make_shared<InputsEmbedderInternVLChat>(vlm_config, model_dir, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::PHI3_V) {
         m_impl = std::make_shared<InputsEmbedderPhi3V>(vlm_config, model_dir, device, device_config);
+    } else if (vlm_config.model_type == VLMModelType::PHI4MM) {
+        m_impl = std::make_shared<InputsEmbedderPhi4MM>(vlm_config, model_dir, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::QWEN2_VL) {
         m_impl = std::make_shared<InputsEmbedderQwen2VL>(vlm_config, model_dir, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::QWEN2_5_VL) {
@@ -218,6 +221,8 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map,
         m_impl = std::make_shared<InputsEmbedderInternVLChat>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::PHI3_V) {
         m_impl = std::make_shared<InputsEmbedderPhi3V>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
+    } else if (vlm_config.model_type == VLMModelType::PHI4MM) {
+        m_impl = std::make_shared<InputsEmbedderPhi4MM>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::QWEN2_VL) {
         m_impl = std::make_shared<InputsEmbedderQwen2VL>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::QWEN2_5_VL) {

src/cpp/src/visual_language/inputs_embedder.hpp

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ class InputsEmbedder {
     friend class InputsEmbedderLLaVANext;
     friend class InputsEmbedderInternVLChat;
    friend class InputsEmbedderPhi3V;
+    friend class InputsEmbedderPhi4MM;
     friend class InputsEmbedderQwen2VL;
     friend class InputsEmbedderQwen2_5_VL;
 };

src/cpp/src/visual_language/phi3_vision/classes.cpp

Lines changed: 55 additions & 52 deletions
@@ -20,48 +20,6 @@ void write_native(std::ostream& os, size_t idx) {
     os << "<|image_" << idx + 1 << "|>\n";
 }
 
-std::string normalize_prompt_phi3(
-    const std::string& prompt, size_t base_id, size_t n_images
-) {
-    std::smatch match;
-    std::regex_search(prompt, match, NATIVE_PATTERN);
-    auto [image_prompt, image_sequence] = universal_to_native(prompt, write_native);
-    if (!image_sequence.empty()) {
-        OPENVINO_ASSERT(match.empty(), "Prompt can contain only one type of image tags.");
-        verify_ids(image_sequence, base_id, n_images);
-        return image_prompt;
-    }
-    // Restore ids from native tags
-    if (!match.empty()) {
-        size_t image_id = std::stoul(match.str(1));
-        OPENVINO_ASSERT(image_id != 0, "Image tags must be greater than 0");
-        image_sequence.push_back(image_id - 1);
-        constexpr int submatch_id_to_return = 1;
-        for (std::sregex_token_iterator iter{
-            match.suffix().first,
-            prompt.end(),
-            NATIVE_PATTERN,
-            submatch_id_to_return
-        }; iter != std::sregex_token_iterator{}; ++iter) {
-            size_t image_id = std::stoul(*iter);
-            OPENVINO_ASSERT(image_id != 0, "Image tags must be greater than 0");
-            image_sequence.push_back(image_id - 1);
-        }
-        if (!image_sequence.empty()) {
-            verify_ids(image_sequence, base_id, n_images);
-            return image_prompt;
-        }
-    }
-    // Prepend native tags
-    std::stringstream stream;
-    for (size_t relative_id = 0; relative_id < n_images; relative_id++) {
-        image_sequence.push_back(base_id + relative_id);
-        write_native(stream, image_sequence.back());
-    }
-    stream << prompt;
-    return stream.str();
-}
-
 ov::Tensor padding_336(const ov::Tensor& unpadded) {
     ov::Shape _1ss3 = unpadded.get_shape();
     size_t s1 = _1ss3.at(1), s2 = _1ss3.at(2);
@@ -468,31 +426,76 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
     return res;
 }
 
+} // namespace
+
+namespace phi_utils {
+std::string normalize_prompt(
+    const std::string& prompt, size_t base_id, size_t n_images, const std::regex& native_pattern, void(*write_native)(std::ostream& os, size_t idx)
+) {
+    std::smatch match;
+    std::regex_search(prompt, match, native_pattern);
+    auto [image_prompt, image_sequence] = universal_to_native(prompt, write_native);
+    if (!image_sequence.empty()) {
+        OPENVINO_ASSERT(match.empty(), "Prompt can contain only one type of image tags.");
+        verify_ids(image_sequence, base_id, n_images);
+        return image_prompt;
+    }
+    // Restore ids from native tags
+    if (!match.empty()) {
+        size_t image_id = std::stoul(match.str(1));
+        OPENVINO_ASSERT(image_id != 0, "Image tags must be greater than 0");
+        image_sequence.push_back(image_id - 1);
+        constexpr int submatch_id_to_return = 1;
+        for (std::sregex_token_iterator iter{
+            match.suffix().first,
+            prompt.end(),
+            native_pattern,
+            submatch_id_to_return
+        }; iter != std::sregex_token_iterator{}; ++iter) {
+            size_t image_id = std::stoul(*iter);
+            OPENVINO_ASSERT(image_id != 0, "Image tags must be greater than 0");
+            image_sequence.push_back(image_id - 1);
+        }
+        if (!image_sequence.empty()) {
+            verify_ids(image_sequence, base_id, n_images);
+            return image_prompt;
+        }
+    }
+    // Prepend native tags
+    std::stringstream stream;
+    for (size_t relative_id = 0; relative_id < n_images; relative_id++) {
+        image_sequence.push_back(base_id + relative_id);
+        write_native(stream, image_sequence.back());
+    }
+    stream << prompt;
+    return stream.str();
+}
+
 /// @brief ov::Tensor is tokenized text, size_t is image tag
-std::vector<std::variant<ov::Tensor, size_t>> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) {
+std::vector<std::variant<ov::Tensor, size_t>> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer, const std::regex& native_pattern) {
     std::vector<std::variant<ov::Tensor, size_t>> tokenized;
     auto prefix_begin = text.begin();
     bool is_submatch = false;
     for (std::sregex_token_iterator iter{
         prefix_begin,
         text.end(),
-        NATIVE_PATTERN,
+        native_pattern,
         {0, 1} // Every match emits two values: whole match and submatch
     }; iter != std::sregex_token_iterator{}; ++iter) {
         if (is_submatch) {
             tokenized.push_back(std::stoul(iter->str()) - 1);
         } else {
             std::string regular_text{prefix_begin, iter->first};
             if (!regular_text.empty()) {
-                tokenized.push_back(tokenizer.encode(regular_text, ov::genai::add_special_tokens(true)).input_ids);
+                tokenized.push_back(tokenizer.encode(regular_text, {ov::genai::add_special_tokens(true)}).input_ids);
             }
             prefix_begin = iter->second;
         }
         is_submatch = !is_submatch;
     }
     std::string regular_text{prefix_begin, text.end()};
     if (!regular_text.empty()) {
-        tokenized.push_back(tokenizer.encode(regular_text, ov::genai::add_special_tokens(true)).input_ids);
+        tokenized.push_back(tokenizer.encode(regular_text, {ov::genai::add_special_tokens(true)}).input_ids);
     }
     return tokenized;
 }
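The `{0, 1}` submatch list above is what drives the alternation that the `is_submatch` flag tracks: for every regex match, the iterator first yields the whole match (index 0), then capture group 1. A standalone sketch, not part of the commit, of the same mechanism:

// Standalone sketch: std::sregex_token_iterator with {0, 1} interleaves the
// whole match and the first capture group, exactly as split_tokenize expects.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::regex pattern{R"(<\|image_(\d+)\|>)"};
    const std::string text = "describe <|image_1|> and <|image_2|>";
    for (std::sregex_token_iterator iter{text.begin(), text.end(), pattern, {0, 1}};
         iter != std::sregex_token_iterator{}; ++iter) {
        std::cout << *iter << '\n';  // prints: <|image_1|>, 1, <|image_2|>, 2
    }
}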
@@ -580,7 +583,7 @@ std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::
     return chunks;
 }
 
-} // namespace
+} // namespace phi_utils
 
 EncodedImage VisionEncoderPhi3V::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
     CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_vision_encoder.get());
@@ -664,7 +667,7 @@ InputsEmbedderPhi3V::InputsEmbedderPhi3V(
     IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {}
 
 std::pair<std::string, std::vector<size_t>> InputsEmbedderPhi3V::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector<EncodedImage>& images) const {
-    return {normalize_prompt_phi3(prompt, base_id, images.size()), {}};
+    return {phi_utils::normalize_prompt(prompt, base_id, images.size(), NATIVE_PATTERN, write_native), {}};
 }
 
 ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector<size_t>& image_sequence) {
@@ -677,7 +680,7 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
     std::vector<std::variant<ov::Tensor, size_t>> new_chat_tokens;
     if (m_is_chat_conversation) {
         auto start_tokenizer_time = std::chrono::steady_clock::now();
-        new_chat_tokens = split_tokenize(image_prompt, m_tokenizer);
+        new_chat_tokens = phi_utils::split_tokenize(image_prompt, m_tokenizer, NATIVE_PATTERN);
         auto end_tokenizer_time = std::chrono::steady_clock::now();
         metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
     } else {
@@ -690,16 +693,16 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
             templated_prompt = std::move(image_prompt);
         }
         auto start_tokenizer_time = std::chrono::steady_clock::now();
-        new_chat_tokens = split_tokenize(templated_prompt, m_tokenizer);
+        new_chat_tokens = phi_utils::split_tokenize(templated_prompt, m_tokenizer, NATIVE_PATTERN);
         auto end_tokenizer_time = std::chrono::steady_clock::now();
         metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
     }
-    ov::Tensor new_merged_tokens = insert_image_placeholders(new_chat_tokens, m_tokens_per_images);
+    ov::Tensor new_merged_tokens = phi_utils::insert_image_placeholders(new_chat_tokens, m_tokens_per_images);
     ov::Tensor new_tokens = update_history(new_merged_tokens);
     m_prev_hist_length = m_kv_cache_state.get_state().size();
     m_kv_cache_state.add_inputs(new_tokens);
 
-    std::vector<std::variant<ov::Tensor, size_t>> tokens = drop_image_placeholders(new_tokens);
+    std::vector<std::variant<ov::Tensor, size_t>> tokens = phi_utils::drop_image_placeholders(new_tokens);
     ov::Tensor inputs_embeds{ov::element::f32, {1, new_tokens.get_shape().at(1), m_vlm_config.hidden_size}};
     size_t offset = 0;
     CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard(m_embedding->get_request_queue().get());

src/cpp/src/visual_language/phi3_vision/classes.hpp

Lines changed: 11 additions & 0 deletions
@@ -12,6 +12,17 @@
 
 namespace ov::genai {
 
+namespace phi_utils {
+
+std::string normalize_prompt(
+    const std::string& prompt, size_t base_id, size_t n_images, const std::regex& native_pattern, void(*write_native)(std::ostream& os, size_t idx)
+);
+std::vector<std::variant<ov::Tensor, size_t>> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer, const std::regex& native_pattern);
+ov::Tensor insert_image_placeholders(const std::vector<std::variant<ov::Tensor, size_t>>& chunks, const std::vector<size_t>& tokens_per_images);
+std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::Tensor& tokens);
+
+}
+
 class VisionEncoderPhi3V : public VisionEncoder {
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_hd_feature_transformer;
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_vision_projection;
