@@ -20,48 +20,6 @@ void write_native(std::ostream& os, size_t idx) {
20
20
os << " <|image_" << idx + 1 << " |>\n " ;
21
21
}
22
22
23
- std::string normalize_prompt_phi3 (
24
- const std::string& prompt, size_t base_id, size_t n_images
25
- ) {
26
- std::smatch match;
27
- std::regex_search (prompt, match, NATIVE_PATTERN);
28
- auto [image_prompt, image_sequence] = universal_to_native (prompt, write_native);
29
- if (!image_sequence.empty ()) {
30
- OPENVINO_ASSERT (match.empty (), " Prompt can contain only one type of image tags." );
31
- verify_ids (image_sequence, base_id, n_images);
32
- return image_prompt;
33
- }
34
- // Restore ids from native tags
35
- if (!match.empty ()) {
36
- size_t image_id = std::stoul (match.str (1 ));
37
- OPENVINO_ASSERT (image_id != 0 , " Image tags must be greater than 0" );
38
- image_sequence.push_back (image_id - 1 );
39
- constexpr int submatch_id_to_return = 1 ;
40
- for (std::sregex_token_iterator iter{
41
- match.suffix ().first ,
42
- prompt.end (),
43
- NATIVE_PATTERN,
44
- submatch_id_to_return
45
- }; iter != std::sregex_token_iterator{}; ++iter) {
46
- size_t image_id = std::stoul (*iter);
47
- OPENVINO_ASSERT (image_id != 0 , " Image tags must be greater than 0" );
48
- image_sequence.push_back (image_id - 1 );
49
- }
50
- if (!image_sequence.empty ()) {
51
- verify_ids (image_sequence, base_id, n_images);
52
- return image_prompt;
53
- }
54
- }
55
- // Prepend native tags
56
- std::stringstream stream;
57
- for (size_t relative_id = 0 ; relative_id < n_images; relative_id++) {
58
- image_sequence.push_back (base_id + relative_id);
59
- write_native (stream, image_sequence.back ());
60
- }
61
- stream << prompt;
62
- return stream.str ();
63
- }
64
-
65
23
ov::Tensor padding_336 (const ov::Tensor& unpadded) {
66
24
ov::Shape _1ss3 = unpadded.get_shape ();
67
25
size_t s1 = _1ss3.at (1 ), s2 = _1ss3.at (2 );
@@ -468,31 +426,76 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
468
426
return res;
469
427
}
470
428
429
+ } // namespace
430
+
431
+ namespace phi_utils {
432
+ std::string normalize_prompt (
433
+ const std::string& prompt, size_t base_id, size_t n_images, const std::regex& native_pattern, void (*write_native)(std::ostream& os, size_t idx)
434
+ ) {
435
+ std::smatch match;
436
+ std::regex_search (prompt, match, native_pattern);
437
+ auto [image_prompt, image_sequence] = universal_to_native (prompt, write_native);
438
+ if (!image_sequence.empty ()) {
439
+ OPENVINO_ASSERT (match.empty (), " Prompt can contain only one type of image tags." );
440
+ verify_ids (image_sequence, base_id, n_images);
441
+ return image_prompt;
442
+ }
443
+ // Restore ids from native tags
444
+ if (!match.empty ()) {
445
+ size_t image_id = std::stoul (match.str (1 ));
446
+ OPENVINO_ASSERT (image_id != 0 , " Image tags must be greater than 0" );
447
+ image_sequence.push_back (image_id - 1 );
448
+ constexpr int submatch_id_to_return = 1 ;
449
+ for (std::sregex_token_iterator iter{
450
+ match.suffix ().first ,
451
+ prompt.end (),
452
+ native_pattern,
453
+ submatch_id_to_return
454
+ }; iter != std::sregex_token_iterator{}; ++iter) {
455
+ size_t image_id = std::stoul (*iter);
456
+ OPENVINO_ASSERT (image_id != 0 , " Image tags must be greater than 0" );
457
+ image_sequence.push_back (image_id - 1 );
458
+ }
459
+ if (!image_sequence.empty ()) {
460
+ verify_ids (image_sequence, base_id, n_images);
461
+ return image_prompt;
462
+ }
463
+ }
464
+ // Prepend native tags
465
+ std::stringstream stream;
466
+ for (size_t relative_id = 0 ; relative_id < n_images; relative_id++) {
467
+ image_sequence.push_back (base_id + relative_id);
468
+ write_native (stream, image_sequence.back ());
469
+ }
470
+ stream << prompt;
471
+ return stream.str ();
472
+ }
473
+
471
474
// / @brief ov::Tensor is tokenized text, size_t is image tag
472
- std::vector<std::variant<ov::Tensor, size_t >> split_tokenize (const std::string& text, ov::genai::Tokenizer& tokenizer) {
475
+ std::vector<std::variant<ov::Tensor, size_t >> split_tokenize (const std::string& text, ov::genai::Tokenizer& tokenizer, const std::regex& native_pattern ) {
473
476
std::vector<std::variant<ov::Tensor, size_t >> tokenized;
474
477
auto prefix_begin = text.begin ();
475
478
bool is_submatch = false ;
476
479
for (std::sregex_token_iterator iter{
477
480
prefix_begin,
478
481
text.end (),
479
- NATIVE_PATTERN ,
482
+ native_pattern ,
480
483
{0 , 1 } // Every match emits two values: whole match and submatch
481
484
}; iter != std::sregex_token_iterator{}; ++iter) {
482
485
if (is_submatch) {
483
486
tokenized.push_back (std::stoul (iter->str ()) - 1 );
484
487
} else {
485
488
std::string regular_text{prefix_begin, iter->first };
486
489
if (!regular_text.empty ()) {
487
- tokenized.push_back (tokenizer.encode (regular_text, ov::genai::add_special_tokens (true )).input_ids );
490
+ tokenized.push_back (tokenizer.encode (regular_text, { ov::genai::add_special_tokens (true )} ).input_ids );
488
491
}
489
492
prefix_begin = iter->second ;
490
493
}
491
494
is_submatch = !is_submatch;
492
495
}
493
496
std::string regular_text{prefix_begin, text.end ()};
494
497
if (!regular_text.empty ()) {
495
- tokenized.push_back (tokenizer.encode (regular_text, ov::genai::add_special_tokens (true )).input_ids );
498
+ tokenized.push_back (tokenizer.encode (regular_text, { ov::genai::add_special_tokens (true )} ).input_ids );
496
499
}
497
500
return tokenized;
498
501
}
@@ -580,7 +583,7 @@ std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::
580
583
return chunks;
581
584
}
582
585
583
- } // namespace
586
+ } // namespace phi_utils
584
587
585
588
EncodedImage VisionEncoderPhi3V::encode (const ov::Tensor& image, const ov::AnyMap& config_map) {
586
589
CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard (this ->m_ireq_queue_vision_encoder .get ());
@@ -664,7 +667,7 @@ InputsEmbedderPhi3V::InputsEmbedderPhi3V(
664
667
IInputsEmbedder (vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {}
665
668
666
669
std::pair<std::string, std::vector<size_t >> InputsEmbedderPhi3V::normalize_prompt (const std::string& prompt, size_t base_id, const std::vector<EncodedImage>& images) const {
667
- return {normalize_prompt_phi3 (prompt, base_id, images.size ()), {}};
670
+ return {phi_utils::normalize_prompt (prompt, base_id, images.size (), NATIVE_PATTERN, write_native ), {}};
668
671
}
669
672
670
673
ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds (const std::string& image_prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector<size_t >& image_sequence) {
@@ -677,7 +680,7 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
677
680
std::vector<std::variant<ov::Tensor, size_t >> new_chat_tokens;
678
681
if (m_is_chat_conversation) {
679
682
auto start_tokenizer_time = std::chrono::steady_clock::now ();
680
- new_chat_tokens = split_tokenize (image_prompt, m_tokenizer);
683
+ new_chat_tokens = phi_utils:: split_tokenize (image_prompt, m_tokenizer, NATIVE_PATTERN );
681
684
auto end_tokenizer_time = std::chrono::steady_clock::now ();
682
685
metrics.raw_metrics .tokenization_durations .emplace_back (PerfMetrics::get_microsec (end_tokenizer_time - start_tokenizer_time));
683
686
} else {
@@ -690,16 +693,16 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
690
693
templated_prompt = std::move (image_prompt);
691
694
}
692
695
auto start_tokenizer_time = std::chrono::steady_clock::now ();
693
- new_chat_tokens = split_tokenize (templated_prompt, m_tokenizer);
696
+ new_chat_tokens = phi_utils:: split_tokenize (templated_prompt, m_tokenizer, NATIVE_PATTERN );
694
697
auto end_tokenizer_time = std::chrono::steady_clock::now ();
695
698
metrics.raw_metrics .tokenization_durations .emplace_back (PerfMetrics::get_microsec (end_tokenizer_time - start_tokenizer_time));
696
699
}
697
- ov::Tensor new_merged_tokens = insert_image_placeholders (new_chat_tokens, m_tokens_per_images);
700
+ ov::Tensor new_merged_tokens = phi_utils:: insert_image_placeholders (new_chat_tokens, m_tokens_per_images);
698
701
ov::Tensor new_tokens = update_history (new_merged_tokens);
699
702
m_prev_hist_length = m_kv_cache_state.get_state ().size ();
700
703
m_kv_cache_state.add_inputs (new_tokens);
701
704
702
- std::vector<std::variant<ov::Tensor, size_t >> tokens = drop_image_placeholders (new_tokens);
705
+ std::vector<std::variant<ov::Tensor, size_t >> tokens = phi_utils:: drop_image_placeholders (new_tokens);
703
706
ov::Tensor inputs_embeds{ov::element::f32 , {1 , new_tokens.get_shape ().at (1 ), m_vlm_config.hidden_size }};
704
707
size_t offset = 0 ;
705
708
CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard (m_embedding->get_request_queue ().get ());
0 commit comments