Merged
1 change: 1 addition & 0 deletions src/cpp/src/continuous_batching/pipeline.cpp
@@ -279,6 +279,7 @@ std::vector<VLMDecodedResults> ContinuousBatchingPipeline::generate(
 
 
 void ContinuousBatchingPipeline::start_chat(const std::string& system_message) {
+    m_impl->finish_chat();
     m_impl->start_chat(system_message);
 }
 
1 change: 0 additions & 1 deletion src/cpp/src/continuous_batching/pipeline_base.cpp
@@ -28,7 +28,6 @@ void ContinuousBatchingPipeline::IContinuousBatchingPipeline::start_chat(const s
     if (!system_message.empty()) {
         m_history.push_back({{"role", "system"}, {"content", system_message}});
     }
-    m_image_id = 0;
     m_is_chat_conversation = true;
 };
 
13 changes: 1 addition & 12 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -300,18 +300,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     void start_chat(const std::string& system_message) override {
         OPENVINO_ASSERT(!m_is_npu, "start_chat() isn't supported in VLMPipeline for NPU device");
         m_is_chat_conversation = true;
-        m_image_id = 0;
-        bool have_state = 0 != m_language.get_tensor("attention_mask").get_size();
-        if (have_state) {
-            // Resetting state may be slow.
-            m_language.reset_state();
-            // Since if is already introduced, move all resetting here.
-            m_language.get_tensor("attention_mask").set_shape({0, 0});
-        }
-        auto kv_cache_state = m_inputs_embedder->get_kv_cache_state();
-        if (!m_inputs_embedder->get_kv_cache_state().get_state().empty()) {
-            m_history.clear();
-        }
         m_inputs_embedder->start_chat(system_message);
Copilot AI commented on Jul 22, 2025:

With the removal of state reset logic, ensure that finish_chat() properly handles all the cleanup that was previously done here, including m_image_id reset, state reset, attention_mask reshaping, and history clearing.
         if (system_message.empty()) {
             return;
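
finish_chat() itself is not shown in this diff. As a rough sketch only of the cleanup the review comment refers to, a consolidated finish_chat() in this pipeline implementation might reuse the members touched by the code removed above (m_image_id, m_language, m_history, m_inputs_embedder); the call to m_inputs_embedder->finish_chat() is an assumption, not taken from this diff:

// Hypothetical sketch of a VLMPipelineImpl::finish_chat(), not the actual implementation in this PR.
void finish_chat() override {
    m_is_chat_conversation = false;
    m_image_id = 0;
    // Resetting state may be slow, so only reset when state actually exists.
    if (m_language.get_tensor("attention_mask").get_size() != 0) {
        m_language.reset_state();
        m_language.get_tensor("attention_mask").set_shape({0, 0});
    }
    // Drop accumulated chat history and the embedder's chat state.
    m_history.clear();
    m_inputs_embedder->finish_chat();  // assumed counterpart to m_inputs_embedder->start_chat()
}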
@@ -472,6 +460,7 @@ VLMDecodedResults VLMPipeline::generate(
 }
 
 void VLMPipeline::start_chat(const std::string& system_message) {
+    m_pimpl->finish_chat();
     m_pimpl->start_chat(system_message);
 }
 
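With this change, a repeated start_chat() on VLMPipeline behaves like an explicit finish_chat() followed by start_chat(). A minimal caller-side sketch under those assumptions, with a placeholder model path and generate() calls elided:

#include "openvino/genai/visual_language/pipeline.hpp"

int main() {
    ov::genai::VLMPipeline pipe("path/to/model", "CPU");  // placeholder path, not from this PR

    pipe.start_chat();  // first chat session
    // ... pipe.generate(...) calls accumulate KV-cache state and chat history ...

    pipe.start_chat();  // now implicitly runs finish_chat() first, so the
                        // second session starts from a clean state
    // ... identical prompts should yield the same results as in the first session ...

    pipe.finish_chat();
    return 0;
}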
4 changes: 4 additions & 0 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -413,6 +413,8 @@ class ContinuousBatchingPipeline:
     @typing.overload
     def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle:
         ...
+    def finish_chat(self) -> None:
+        ...
     @typing.overload
     def generate(self, input_ids: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[EncodedGenerationResult]:
         ...
@@ -433,6 +435,8 @@ class ContinuousBatchingPipeline:
         ...
     def has_non_finished_requests(self) -> bool:
         ...
+    def start_chat(self, system_message: str = '') -> None:
+        ...
     def step(self) -> None:
         ...
 class CppStdGenerator(Generator):
2 changes: 2 additions & 0 deletions src/python/py_continuous_batching_pipeline.cpp
@@ -337,6 +337,8 @@ void init_continuous_batching_pipeline(py::module_& m) {
         .def("step", &ContinuousBatchingPipeline::step)
         .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests)
 
+        .def("start_chat", &ContinuousBatchingPipeline::start_chat, py::arg("system_message") = "")
+        .def("finish_chat", &ContinuousBatchingPipeline::finish_chat)
 
         .def(
             "generate",
58 changes: 58 additions & 0 deletions tests/python_tests/test_vlm_pipeline.py
@@ -543,6 +543,64 @@ def streamer(subword):
     assert results_with_cancel == results
 
 
+@pytest.mark.precommit
+@pytest.mark.parametrize("backend", attention_backend)
+def test_start_chat_clears_history(backend):
+    callback_questions = [
+        "Why is the Sun yellow?"
+    ]
+    models_path = get_ov_model(model_ids[0])
+    ov_pipe = VLMPipeline(models_path, "CPU", ATTENTION_BACKEND=backend)
+    generation_config = ov_pipe.get_generation_config()
+    generation_config.max_new_tokens = 30
+
+    images = []
+    for link in image_links_for_testing[1]:
+        images.append(get_image_by_link(link))
+
+    results_first_generate = ""
+    ov_pipe.start_chat()
+    results_first_generate += ov_pipe.generate(
+        callback_questions[0], images=images, generation_config=generation_config
+    ).texts[0]
+
+    results_second_generate = ""
+    ov_pipe.start_chat()
+    results_second_generate += ov_pipe.generate(
+        callback_questions[0], images=images, generation_config=generation_config
+    ).texts[0]
+
+    assert results_first_generate == results_second_generate
+
+@pytest.mark.precommit
+def test_start_chat_clears_history_cb_api():
+    callback_questions = [
+        "Why is the Sun yellow?"
+    ]
+    models_path = get_ov_model(model_ids[0])
+    ov_pipe = ContinuousBatchingPipeline(models_path, SchedulerConfig(), "CPU")
+    generation_config = GenerationConfig()
+    generation_config.max_new_tokens = 30
+
+    images = []
+    for link in image_links_for_testing[1]:
+        images.append(get_image_by_link(link))
+
+    results_first_generate = ""
+    ov_pipe.start_chat("You are helpful assistant.")
+    results_first_generate = ov_pipe.generate(
+        [callback_questions[0]], images=[images], generation_config=[generation_config]
+    )[0].texts[0]
+
+    results_second_generate = ""
+    ov_pipe.start_chat("You are helpful assistant.")
+    results_second_generate += ov_pipe.generate(
+        [callback_questions[0]], images=[images], generation_config=[generation_config]
+    )[0].texts[0]
+
+    assert results_first_generate == results_second_generate
+
+
 @pytest.mark.precommit
 @pytest.mark.parametrize("model_id", model_ids)
 @pytest.mark.parametrize("iteration_images", [image_links_for_testing[1], []])