Merged
1 change: 1 addition & 0 deletions src/cpp/src/continuous_batching/pipeline.cpp
@@ -279,6 +279,7 @@ std::vector<VLMDecodedResults> ContinuousBatchingPipeline::generate(
 
 
 void ContinuousBatchingPipeline::start_chat(const std::string& system_message) {
+    m_impl->finish_chat();
     m_impl->start_chat(system_message);
 }
 
1 change: 0 additions & 1 deletion src/cpp/src/continuous_batching/pipeline_base.cpp
@@ -28,7 +28,6 @@ void ContinuousBatchingPipeline::IContinuousBatchingPipeline::start_chat(const s
     if (!system_message.empty()) {
         m_history.push_back({{"role", "system"}, {"content", system_message}});
     }
-    m_image_id = 0;
     m_is_chat_conversation = true;
 };
 
13 changes: 1 addition & 12 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -300,18 +300,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     void start_chat(const std::string& system_message) override {
         OPENVINO_ASSERT(!m_is_npu, "start_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = true;
-        m_image_id = 0;
-        bool have_state = 0 != m_language.get_tensor("attention_mask").get_size();
-        if (have_state) {
-            // Resetting state may be slow.
-            m_language.reset_state();
-            // Since if is already introduced, move all resetting here.
-            m_language.get_tensor("attention_mask").set_shape({0, 0});
-        }
-        auto kv_cache_state = m_inputs_embedder->get_kv_cache_state();
-        if (!m_inputs_embedder->get_kv_cache_state().get_state().empty()) {
-            m_history.clear();
-        }
         m_inputs_embedder->start_chat(system_message);
Copilot AI commented on Jul 22, 2025:

With the removal of state reset logic, ensure that finish_chat() properly handles all the cleanup that was previously done here, including m_image_id reset, state reset, attention_mask reshaping, and history clearing.
         if (system_message.empty()) {
             return;
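
finish_chat() itself is not shown in this diff. As a rough sketch only of the cleanup the review comment refers to, a consolidated finish_chat() in this pipeline implementation might reuse the members touched by the code removed above (m_image_id, m_language, m_history, m_inputs_embedder); the call to m_inputs_embedder->finish_chat() is an assumption, not taken from this diff:

// Hypothetical sketch of a VLMPipelineImpl::finish_chat(), not the actual implementation in this PR.
void finish_chat() override {
    m_is_chat_conversation = false;
    m_image_id = 0;
    // Resetting state may be slow, so only reset when state actually exists.
    if (m_language.get_tensor("attention_mask").get_size() != 0) {
        m_language.reset_state();
        m_language.get_tensor("attention_mask").set_shape({0, 0});
    }
    // Drop accumulated chat history and the embedder's chat state.
    m_history.clear();
    m_inputs_embedder->finish_chat();  // assumed counterpart to m_inputs_embedder->start_chat()
}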
@@ -472,6 +460,7 @@ VLMDecodedResults VLMPipeline::generate(
 }
 
 void VLMPipeline::start_chat(const std::string& system_message) {
+    m_pimpl->finish_chat();
     m_pimpl->start_chat(system_message);
 }
 
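With this change, a repeated start_chat() on VLMPipeline behaves like an explicit finish_chat() followed by start_chat(). A minimal caller-side sketch under those assumptions, with a placeholder model path and generate() calls elided:

#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    ov::genai::VLMPipeline pipe("path/to/model", "CPU");  // placeholder path, not from this PR

    pipe.start_chat();  // first chat session
    // ... pipe.generate(...) calls accumulate KV-cache state and chat history ...

    pipe.start_chat();  // now implicitly runs finish_chat() first, so the
                        // second session starts from a clean state
    // ... identical prompts should yield the same results as in the first session ...

    pipe.finish_chat();
    return 0;
}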
4 changes: 4 additions & 0 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -413,6 +413,8 @@ class ContinuousBatchingPipeline:
     @typing.overload
     def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle:
         ...
+    def finish_chat(self) -> None:
+        ...
     @typing.overload
     def generate(self, input_ids: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[EncodedGenerationResult]:
         ...
@@ -433,6 +435,8 @@ class ContinuousBatchingPipeline:
         ...
     def has_non_finished_requests(self) -> bool:
         ...
+    def start_chat(self, system_message: str = '') -> None:
+        ...
     def step(self) -> None:
         ...
 class CppStdGenerator(Generator):
2 changes: 2 additions & 0 deletions src/python/py_continuous_batching_pipeline.cpp
@@ -337,6 +337,8 @@ void init_continuous_batching_pipeline(py::module_& m) {
         .def("step", &ContinuousBatchingPipeline::step)
         .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests)
 
+        .def("start_chat", &ContinuousBatchingPipeline::start_chat, py::arg("system_message") = "")
+        .def("finish_chat", &ContinuousBatchingPipeline::finish_chat)
 
         .def(
             "generate",
58 changes: 58 additions & 0 deletions tests/python_tests/test_vlm_pipeline.py
@@ -543,6 +543,64 @@ def streamer(subword):
     assert results_with_cancel == results
 
 
+@pytest.mark.precommit
+@pytest.mark.parametrize("backend", attention_backend)
+def test_start_chat_clears_history(backend):
+    callback_questions = [
+        "Why is the Sun yellow?"
+    ]
+    models_path = get_ov_model(model_ids[0])
+    ov_pipe = VLMPipeline(models_path, "CPU", ATTENTION_BACKEND=backend)
+    generation_config = ov_pipe.get_generation_config()
+    generation_config.max_new_tokens = 30
+
+    images = []
+    for link in image_links_for_testing[1]:
+        images.append(get_image_by_link(link))
+
+    results_first_generate = ""
+    ov_pipe.start_chat()
+    results_first_generate += ov_pipe.generate(
+        callback_questions[0], images=images, generation_config=generation_config
+    ).texts[0]
+
+    results_second_generate = ""
+    ov_pipe.start_chat()
+    results_second_generate += ov_pipe.generate(
+        callback_questions[0], images=images, generation_config=generation_config
+    ).texts[0]
+
+    assert results_first_generate == results_second_generate
+
+@pytest.mark.precommit
+def test_start_chat_clears_history_cb_api():
+    callback_questions = [
+        "Why is the Sun yellow?"
+    ]
+    models_path = get_ov_model(model_ids[0])
+    ov_pipe = ContinuousBatchingPipeline(models_path, SchedulerConfig(), "CPU")
+    generation_config = GenerationConfig()
+    generation_config.max_new_tokens = 30
+
+    images = []
+    for link in image_links_for_testing[1]:
+        images.append(get_image_by_link(link))
+
+    results_first_generate = ""
+    ov_pipe.start_chat("You are helpful assistant.")
+    results_first_generate = ov_pipe.generate(
+        [callback_questions[0]], images=[images], generation_config=[generation_config]
+    )[0].texts[0]
+
+    results_second_generate = ""
+    ov_pipe.start_chat("You are helpful assistant.")
+    results_second_generate += ov_pipe.generate(
+        [callback_questions[0]], images=[images], generation_config=[generation_config]
+    )[0].texts[0]
+
+    assert results_first_generate == results_second_generate
+
+
 @pytest.mark.precommit
 @pytest.mark.parametrize("model_id", model_ids)
 @pytest.mark.parametrize("iteration_images", [image_links_for_testing[1], []])