From 9360b5775e08987ed07042fb42a720c7cf19e7d3 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 22 Jul 2025 09:28:08 +0200 Subject: [PATCH 1/6] Run finish_chat in start_chat. --- src/cpp/src/continuous_batching/pipeline.cpp | 1 + src/cpp/src/visual_language/pipeline.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/cpp/src/continuous_batching/pipeline.cpp b/src/cpp/src/continuous_batching/pipeline.cpp index d720e07575..41cbc0d07b 100644 --- a/src/cpp/src/continuous_batching/pipeline.cpp +++ b/src/cpp/src/continuous_batching/pipeline.cpp @@ -279,6 +279,7 @@ std::vector ContinuousBatchingPipeline::generate( void ContinuousBatchingPipeline::start_chat(const std::string& system_message) { + m_impl->finish_chat(); m_impl->start_chat(system_message); } diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 54997b5c6b..ea0f72ef39 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -461,6 +461,7 @@ VLMDecodedResults VLMPipeline::generate( } void VLMPipeline::start_chat(const std::string& system_message) { + m_pimpl->finish_chat(); m_pimpl->start_chat(system_message); } From 5d0e30f9e7e9aaf08d1ae7c283a4bfd102ad2098 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Tue, 22 Jul 2025 09:39:50 +0200 Subject: [PATCH 2/6] Remove redundant code from start_chat(). --- src/cpp/src/continuous_batching/pipeline_base.cpp | 1 - src/cpp/src/visual_language/pipeline.cpp | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 0291940bb1..923dddc29d 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -28,7 +28,6 @@ void ContinuousBatchingPipeline::IContinuousBatchingPipeline::start_chat(const s if (!system_message.empty()) { m_history.push_back({{"role", "system"}, {"content", system_message}}); } - m_image_id = 0; m_is_chat_conversation = true; }; diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index ea0f72ef39..613687a150 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -299,18 +299,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ void start_chat(const std::string& system_message) override { OPENVINO_ASSERT(!m_is_npu, "start_chat() isn't supported in VLMPipeline for NPU device"); m_is_chat_conversation = true; - m_image_id = 0; - bool have_state = 0 != m_language.get_tensor("attention_mask").get_size(); - if (have_state) { - // Resetting state may be slow. - m_language.reset_state(); - // Since if is already introduced, move all resetting here. - m_language.get_tensor("attention_mask").set_shape({0, 0}); - } - auto kv_cache_state = m_inputs_embedder->get_kv_cache_state(); - if (!m_inputs_embedder->get_kv_cache_state().get_state().empty()) { - m_history.clear(); - } m_inputs_embedder->start_chat(system_message); if (system_message.empty()) { return; From e4d5f727d3e8871985a7ff19cb4745eb7f82b041 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 23 Jul 2025 11:33:32 +0200 Subject: [PATCH 3/6] Added tests, start_chat and finish_chat bindings for CB. 
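This patch exposes start_chat()/finish_chat() on the Python ContinuousBatchingPipeline and adds tests checking that a second start_chat() call discards the previous conversation. A minimal usage sketch of the new bindings (the model directory and prompt are illustrative; the text-only, list-based generate() overload is assumed here, while the new test itself exercises the image-based overload):

    from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, SchedulerConfig

    pipe = ContinuousBatchingPipeline("./model_dir", SchedulerConfig(), "CPU")
    config = GenerationConfig()
    config.max_new_tokens = 30

    pipe.start_chat("You are a helpful assistant.")
    first = pipe.generate(["Why is the Sun yellow?"], [config])[0].texts[0]

    # start_chat() now finishes any previous chat first, so the second
    # session starts from an empty history and yields the same answer.
    pipe.start_chat("You are a helpful assistant.")
    second = pipe.generate(["Why is the Sun yellow?"], [config])[0].texts[0]

    assert first == second
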
--- .../py_continuous_batching_pipeline.cpp | 2 + tests/python_tests/test_vlm_pipeline.py | 58 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 83bcb9537a..afefe167db 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -335,6 +335,8 @@ void init_continuous_batching_pipeline(py::module_& m) { .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) + .def("start_chat", &ContinuousBatchingPipeline::start_chat, py::arg("system_message") = "") + .def("finish_chat", &ContinuousBatchingPipeline::finish_chat) .def( "generate", diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 774bcb6e1d..cf98acad66 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -543,6 +543,64 @@ def streamer(subword): assert results_with_cancel == results +@pytest.mark.precommit +@pytest.mark.parametrize("backend", attention_backend) +def test_start_chat_clears_history(backend): + callback_questions = [ + "Why is the Sun yellow?" + ] + models_path = get_ov_model(model_ids[0]) + ov_pipe = VLMPipeline(models_path, "CPU", ATTENTION_BACKEND=backend) + generation_config = ov_pipe.get_generation_config() + generation_config.max_new_tokens = 30 + + images = [] + for link in image_links_for_testing[1]: + images.append(get_image_by_link(link)) + + results_first_generate = "" + ov_pipe.start_chat() + results_first_generate += ov_pipe.generate( + callback_questions[0], images=images, generation_config=generation_config + ).texts[0] + + results_second_generate = "" + ov_pipe.start_chat() + results_second_generate += ov_pipe.generate( + callback_questions[0], images=images, generation_config=generation_config + ).texts[0] + + assert results_first_generate == results_second_generate + +@pytest.mark.precommit +def test_start_chat_clears_history_cb_api(): + callback_questions = [ + "Why is the Sun yellow?" + ] + models_path = get_ov_model(model_ids[0]) + ov_pipe = ContinuousBatchingPipeline(models_path, SchedulerConfig(), "CPU") + generation_config = GenerationConfig() + generation_config.max_new_tokens = 30 + + images = [] + for link in image_links_for_testing[1]: + images.append(get_image_by_link(link)) + + results_first_generate = "" + ov_pipe.start_chat("You are helpful assistant.") + results_first_generate = ov_pipe.generate( + [callback_questions[0]], images=[images], generation_config=[generation_config] + )[0].texts[0] + + results_second_generate = "" + ov_pipe.start_chat("You are helpful assistant.") + results_second_generate += ov_pipe.generate( + [callback_questions[0]], images=[images], generation_config=[generation_config] + )[0].texts[0] + + assert results_first_generate == results_second_generate + + @pytest.mark.precommit @pytest.mark.parametrize("model_id", model_ids) @pytest.mark.parametrize("iteration_images", [image_links_for_testing[1], []]) From 4487e59e6cf20deff5762a1070048e4a46f9d4da Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 23 Jul 2025 12:22:24 +0200 Subject: [PATCH 4/6] pyi file correction. 
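The stub file is updated so the new bindings are visible to static type checkers and IDE completion. A small sketch of code that now type-checks against the regenerated stub (the helper function is illustrative, not part of the library):

    from openvino_genai import ContinuousBatchingPipeline

    def with_fresh_chat(pipe: ContinuousBatchingPipeline, system_message: str = "") -> None:
        # Both methods are now declared in the stub, matching the pybind11 bindings.
        pipe.start_chat(system_message)
        ...
        pipe.finish_chat()
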
--- src/python/openvino_genai/py_openvino_genai.pyi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 2e012b6c2d..da8a84feb9 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -373,6 +373,8 @@ class ContinuousBatchingPipeline: @typing.overload def add_request(self, request_id: int, prompt: str, images: list[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: ... + def finish_chat(self) -> None: + ... @typing.overload def generate(self, input_ids: list[openvino._pyopenvino.Tensor], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], int | None] | StreamerBase | None = None) -> list[EncodedGenerationResult]: ... @@ -393,6 +395,8 @@ class ContinuousBatchingPipeline: ... def has_non_finished_requests(self) -> bool: ... + def start_chat(self, system_message: str = '') -> None: + ... def step(self) -> None: ... class CppStdGenerator(Generator): From 94c861e405a35a981908e999fd5a2f176c31e138 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 23 Jul 2025 13:05:45 +0200 Subject: [PATCH 5/6] Fix pyi generation. --- src/python/py_continuous_batching_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index afefe167db..ef5d1132ff 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -273,7 +273,7 @@ void init_continuous_batching_pipeline(py::module_& m) { py::class_(m, "SparseAttentionConfig", sparse_attention_config_docstring) .def(py::init<>([](SparseAttentionMode mode, size_t num_last_dense_tokens_in_prefill, size_t num_retained_start_tokens_in_cache, size_t num_retained_recent_tokens_in_cache) { return SparseAttentionConfig{mode, num_last_dense_tokens_in_prefill, num_retained_start_tokens_in_cache, num_retained_recent_tokens_in_cache}; }), - py::arg("mode") = SparseAttentionMode::TRISHAPE, + py::arg_v("mode", SparseAttentionMode::TRISHAPE, "SparseAttentionMode.TRISHAPE"), py::arg("num_last_dense_tokens_in_prefill") = 100, py::arg("num_retained_start_tokens_in_cache") = 128, py::arg("num_retained_recent_tokens_in_cache") = 1920) From 7f5396866d5f866ce73a56c8771f677cc78f3269 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 23 Jul 2025 15:11:06 +0200 Subject: [PATCH 6/6] Remove not needed changes. 
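This reverts the SparseAttentionConfig default-argument change from the previous commit (py::arg_v back to py::arg), since it is not needed for the chat-state fix; the stub additions from the earlier commits are kept.
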
--- src/python/py_continuous_batching_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index ef5d1132ff..afefe167db 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -273,7 +273,7 @@ void init_continuous_batching_pipeline(py::module_& m) { py::class_(m, "SparseAttentionConfig", sparse_attention_config_docstring) .def(py::init<>([](SparseAttentionMode mode, size_t num_last_dense_tokens_in_prefill, size_t num_retained_start_tokens_in_cache, size_t num_retained_recent_tokens_in_cache) { return SparseAttentionConfig{mode, num_last_dense_tokens_in_prefill, num_retained_start_tokens_in_cache, num_retained_recent_tokens_in_cache}; }), - py::arg_v("mode", SparseAttentionMode::TRISHAPE, "SparseAttentionMode.TRISHAPE"), + py::arg("mode") = SparseAttentionMode::TRISHAPE, py::arg("num_last_dense_tokens_in_prefill") = 100, py::arg("num_retained_start_tokens_in_cache") = 128, py::arg("num_retained_recent_tokens_in_cache") = 1920)
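Taken together, the series makes start_chat() clear any previous conversation in both the VLM and continuous batching pipelines, so back-to-back chat sessions behave deterministically without an explicit finish_chat(). A minimal sketch of the resulting VLMPipeline behavior (the model directory is illustrative, and the zero-filled tensor stands in for a real preprocessed image, whose expected layout depends on the model):

    import numpy as np
    import openvino as ov
    from openvino_genai import VLMPipeline

    pipe = VLMPipeline("./vlm_model_dir", "CPU")
    config = pipe.get_generation_config()
    config.max_new_tokens = 30

    # Placeholder image tensor; real code would load and preprocess an actual image.
    image = ov.Tensor(np.zeros((448, 448, 3), dtype=np.uint8))

    pipe.start_chat()
    first = pipe.generate("Why is the Sun yellow?", images=[image], generation_config=config).texts[0]

    # No explicit finish_chat() is required: start_chat() now resets the old history first.
    pipe.start_chat()
    second = pipe.generate("Why is the Sun yellow?", images=[image], generation_config=config).texts[0]

    assert first == second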