diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 3a0b77a98c..9ed76df47f 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -891,8 +891,7 @@ jobs: PYTHONPATH: "${{ env.BUILD_DIR }}:" run: | source ${{ env.OV_INSTALL_DIR }}/setupvars.sh - python3 -m pytest -v ${{ env.SRC_DIR }}/tests/python_tests/test_llm_pipeline.py - # python3 -m pytest -v ${{ env.SRC_DIR }}/tests/python_tests/test_structured_output.py + python3 -m pytest -v ${{ env.SRC_DIR }}/tests/python_tests/test_llm_pipeline.py -k "not test_perf_metrics_with_structured_output" Overall_Status: name: ci/gha_overall_status_linux diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index eac745ff9d..e5bcfec45c 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -7,6 +7,8 @@ #include "openvino/genai/visibility.hpp" #include #include +#include +#include #include namespace ov { @@ -27,6 +29,7 @@ using MicroSeconds = std::chrono::duration>; * @param m_batch_sizes Batch sizes for each generate call. * @param m_durations Total durations for each generate call in microseconds. * @param m_inference_durations Total inference duration for each generate call in microseconds. + * @param m_grammar_compile_times Time to compile the grammar in microseconds. */ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { std::vector generate_durations; @@ -39,6 +42,8 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { std::vector m_batch_sizes; std::vector m_durations; std::vector m_inference_durations; + + std::vector m_grammar_compile_times; }; /** @@ -49,6 +54,16 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair { float std; }; +/** +* @brief Structure to store summary statistics (mean, std, min, max) of a list of durations, in milliseconds. +*/ +struct OPENVINO_GENAI_EXPORTS SummaryStats { + float mean; + float std; + float min; + float max; +}; + /** * @brief Holds performance metrics for each generate call. 
* @@ -76,6 +91,8 @@ struct OPENVINO_GENAI_EXPORTS MeanStdPair { * @param get_generate_duration Returns the mean and standard deviation of generate duration. * @param get_tokenization_duration Returns the mean and standard deviation of tokenization duration. * @param get_detokenization_duration Returns the mean and standard deviation of detokenization duration. + * @param get_grammar_compiler_init_times Returns a map with the time to initialize the grammar compiler for each backend in milliseconds. + * @param get_grammar_compile_time Returns the time to compile the grammar in milliseconds. * @param get_microsec Converts a duration to microseconds. * @param m_evaluated Flag indicating if raw metrics were evaluated. * If false, current mean/std TTFT, TPOT, etc. are not actual and evaluate_statistics() should recalculate them. @@ -103,6 +120,10 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { MeanStdPair ipot; // Inference time (in ms) per output token. MeanStdPair throughput; // Tokens per second. + // Time to initialize grammar compiler for each backend in ms. + std::map grammar_compiler_init_times; + SummaryStats grammar_compile_time; // Time to compile grammar in ms. + MeanStdPair generate_duration; MeanStdPair inference_duration; MeanStdPair tokenization_duration = {-1.0f, -1.0f}; @@ -118,6 +139,9 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). MeanStdPair get_ipot(); // Inference time (in ms) per output token. MeanStdPair get_throughput(); // Tokens per second. 
+ + std::map get_grammar_compiler_init_times(); + SummaryStats get_grammar_compile_time(); // in ms MeanStdPair get_inference_duration(); // in ms MeanStdPair get_generate_duration(); // in ms diff --git a/src/cpp/src/continuous_batching/pipeline_impl.cpp b/src/cpp/src/continuous_batching/pipeline_impl.cpp index 9f585d1a5d..575ee14339 100644 --- a/src/cpp/src/continuous_batching/pipeline_impl.cpp +++ b/src/cpp/src/continuous_batching/pipeline_impl.cpp @@ -421,11 +421,12 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorstart(); - + m_sampler->clear_structured_output_compile_times(); while (has_non_finished_requests()) { try { const auto infer_start = std::chrono::steady_clock::now(); step(); + // During prefill step (or steps if max_batch_size < prompt_len) we don't generate new tokens, // but still inference took place, so we need to add this time to the total inference duration. raw_perf_counters.m_inference_durations[0] += MicroSeconds(m_pipeline_metrics.inference_duration); @@ -444,6 +445,12 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorget_structured_output_times(); + perf_metrics.grammar_compiler_init_times = times.first; + for (const auto& t: times.second) { + raw_perf_counters.m_grammar_compile_times.emplace_back(t); + } + // waiting for competion of streaming streamer_ptr->end(); diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 558804030a..d3cb461830 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -149,6 +149,9 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) { read_anymap_param(properties, "assistant_confidence_threshold", assistant_confidence_threshold); read_anymap_param(properties, "num_assistant_tokens", num_assistant_tokens); read_anymap_param(properties, "max_ngram_size", max_ngram_size); + + // Structured output + read_anymap_param(properties, "structured_output_config", 
structured_output_config); } StructuredOutputConfig::StructuredOutputConfig(const ov::AnyMap& properties) { diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index cf9034823b..6c69ba1d39 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -30,6 +30,15 @@ ov::genai::MeanStdPair calc_mean_and_std(const std::vector& durations) { + if (durations.size() == 0) { + return {-1, -1, -1, -1}; + } + auto minmax = std::minmax_element(durations.begin(), durations.end()); + auto meanstd = calc_mean_and_std(durations); + return {meanstd.mean, meanstd.std, minmax.first->count() / 1000.0f, minmax.second->count() / 1000.0f}; +} + float PerfMetrics::get_load_time() { return load_time; } @@ -84,6 +93,16 @@ MeanStdPair PerfMetrics::get_inference_duration() { return inference_duration; } +std::map PerfMetrics::get_grammar_compiler_init_times() { + return grammar_compiler_init_times; +} + +SummaryStats PerfMetrics::get_grammar_compile_time() { + evaluate_statistics(); + return grammar_compile_time; +} + + float PerfMetrics::get_microsec(std::chrono::steady_clock::duration duration) { return std::chrono::duration_cast(duration).count(); } @@ -124,6 +143,8 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { ipot = calc_mean_and_std(raw_metrics.m_token_infer_durations); ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token); + grammar_compile_time = calc_full_stat(raw_metrics.m_grammar_compile_times); + generate_duration = calc_mean_and_std(raw_metrics.generate_durations); tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations); detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations); @@ -136,9 +157,22 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); - + // Copy left 
value to res. PerfMetrics res = *this; + + // maps with grammar compiler init times should not have conflicting keys + // {{"xgrammar", 10}} + {{"llmguidance", 20}} = {{"xgrammar", 10}, {"llmguidance", 20}} - is OK! + // {{"xgrammar", 10}} + {{"xgrammar", 10}} = {{"xgrammar", 10}} - is OK! + // {{"xgrammar", 10}} + {{"xgrammar", 20}} = is NOT OK! Fails on assert! + for (const auto& [key, value] : right.grammar_compiler_init_times) { + auto it = res.grammar_compiler_init_times.find(key); + if (it != res.grammar_compiler_init_times.end()) { + OPENVINO_ASSERT(it->second == value, "Grammar compiler init time for the same backend should be the same. ", + "You are trying to accumulate metrics for different pipelines which is not allowed."); + } + res.grammar_compiler_init_times[key] = value; + } // Concatenate durations, batch_sizes first token times. auto& new_durations = res.raw_metrics.m_durations; @@ -170,6 +204,10 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end()); + // Concatenate structured output compilation times. 
+ auto& new_grammar_compile_times = res.raw_metrics.m_grammar_compile_times; + new_grammar_compile_times.insert(new_grammar_compile_times.end(), right.raw_metrics.m_grammar_compile_times.begin(), right.raw_metrics.m_grammar_compile_times.end()); + res.num_generated_tokens += right.num_generated_tokens; res.num_input_tokens += right.num_input_tokens; res.m_evaluated = false; diff --git a/src/cpp/src/sampling/sampler.cpp b/src/cpp/src/sampling/sampler.cpp index 4801c6e67d..7a593dc6e1 100644 --- a/src/cpp/src/sampling/sampler.cpp +++ b/src/cpp/src/sampling/sampler.cpp @@ -241,6 +241,21 @@ std::map Sampler::GroupBeamSearcher::get_beam_idxs() { return next_beams; } +std::pair, std::vector> Sampler::get_structured_output_times() { + if (m_structured_output_controller) { + return m_structured_output_controller->get_times(); + } else { + // If compiled without structured output support, return empty times + return {{}, {}}; + } +} + +void Sampler::clear_structured_output_compile_times() { + if (m_structured_output_controller) { + m_structured_output_controller->clear_compile_times(); + } +} + void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output, const std::pair>& stop_strings) { diff --git a/src/cpp/src/sampling/sampler.hpp b/src/cpp/src/sampling/sampler.hpp index 2fb8648aac..52faa79702 100644 --- a/src/cpp/src/sampling/sampler.hpp +++ b/src/cpp/src/sampling/sampler.hpp @@ -123,6 +123,9 @@ class Sampler { void create_logit_processor(uint64_t request_id, const GenerationConfig& sampling_parameters, const TokenIds& prompt); std::map get_beam_idxs(SequenceGroup::CPtr sequence_group); + // pair with map with backend name and corresponding compiler init time, and vector of compile times for each concrete grammar + std::pair, std::vector> get_structured_output_times(); + void clear_structured_output_compile_times(); }; class Sampler::GroupBeamSearcher { diff --git 
a/src/cpp/src/sampling/structured_output/structured_output_controller.cpp b/src/cpp/src/sampling/structured_output/structured_output_controller.cpp index cdce0e054a..cf569d28e4 100644 --- a/src/cpp/src/sampling/structured_output/structured_output_controller.cpp +++ b/src/cpp/src/sampling/structured_output/structured_output_controller.cpp @@ -41,6 +41,8 @@ StructuredOutputController::get_logits_transformer(const ov::genai::GenerationCo } std::string backend_name = (*guided_gen_config).backend.value_or(get_default_backend_name()); + std::unique_lock lock(m_mutex); + // Check if backend already instantiated auto impl_it = m_impls.find(backend_name); if (impl_it == m_impls.end()) { @@ -52,12 +54,28 @@ StructuredOutputController::get_logits_transformer(const ov::genai::GenerationCo } // Create the backend instance and store it + const auto start = std::chrono::steady_clock::now(); m_impls[backend_name] = factory_it->second(m_tokenizer, m_vocab_size); impl_it = m_impls.find(backend_name); + const auto end = std::chrono::steady_clock::now(); + m_init_grammar_compiler_times[backend_name] = std::chrono::duration_cast(end - start).count(); } // Use the instantiated backend - return impl_it->second->get_logits_transformer(sampling_parameters); + const auto start = std::chrono::steady_clock::now(); + auto res = impl_it->second->get_logits_transformer(sampling_parameters); + const auto end = std::chrono::steady_clock::now(); + m_grammar_compile_times.emplace_back(std::chrono::duration_cast(end - start).count()); + return res; +} + +std::pair, std::vector> StructuredOutputController::get_times() const { + return {m_init_grammar_compiler_times, m_grammar_compile_times}; +} + +void StructuredOutputController::clear_compile_times() { + std::lock_guard lock(m_mutex); + m_grammar_compile_times.clear(); } } // namespace genai diff --git a/src/cpp/src/sampling/structured_output/structured_output_controller.hpp b/src/cpp/src/sampling/structured_output/structured_output_controller.hpp 
index da53361a10..c40e1b99e3 100644 --- a/src/cpp/src/sampling/structured_output/structured_output_controller.hpp +++ b/src/cpp/src/sampling/structured_output/structured_output_controller.hpp @@ -75,11 +75,16 @@ class StructuredOutputController { static void set_default_backend(const std::string& name); static std::string& get_default_backend_name(); static std::unordered_map& get_backend_registry(); - + + std::pair, std::vector> get_times() const; + void clear_compile_times(); private: + std::map m_init_grammar_compiler_times; + std::vector m_grammar_compile_times; std::unordered_map> m_impls; const ov::genai::Tokenizer& m_tokenizer; std::optional m_vocab_size; + std::mutex m_mutex; }; } // namespace genai diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 9b5bb942e6..31c67e7786 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -5,7 +5,7 @@ from __future__ import annotations import openvino._pyopenvino import os import typing -__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'ExtendedPerfMetrics', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'ImageGenerationPerfMetrics', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawImageGenerationPerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'SDPerModelsPerfMetrics', 'SDPerfMetrics', 'Scheduler', 'SchedulerConfig', 'SpeechGenerationConfig', 'SpeechGenerationPerfMetrics', 'StopCriteria', 'StreamerBase', 'StreamingStatus', 'StructuredOutputConfig', 
'T5EncoderModel', 'Text2ImagePipeline', 'Text2SpeechDecodedResults', 'Text2SpeechPipeline', 'TextEmbeddingPipeline', 'TextStreamer', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version'] +__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'ExtendedPerfMetrics', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'ImageGenerationPerfMetrics', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawImageGenerationPerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'SDPerModelsPerfMetrics', 'SDPerfMetrics', 'Scheduler', 'SchedulerConfig', 'SpeechGenerationConfig', 'SpeechGenerationPerfMetrics', 'StopCriteria', 'StreamerBase', 'StreamingStatus', 'StructuredOutputConfig', 'SummaryStats', 'T5EncoderModel', 'Text2ImagePipeline', 'Text2SpeechDecodedResults', 'Text2SpeechPipeline', 'TextEmbeddingPipeline', 'TextStreamer', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version'] class Adapter: """ Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. 
@@ -546,6 +546,12 @@ class ExtendedPerfMetrics: :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. :type get_detokenization_duration: MeanStdPair + :param get_grammar_compiler_init_times: Returns a map with the time to initialize the grammar compiler for each backend in milliseconds. + :type get_grammar_compiler_init_times: dict[str, float] + + :param get_grammar_compile_time: Returns the mean, standard deviation, min, and max of grammar compile times in milliseconds. + :type get_grammar_compile_time: SummaryStats + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. :type raw_metrics: RawPerfMetrics """ @@ -1428,6 +1434,12 @@ class PerfMetrics: :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. :type get_detokenization_duration: MeanStdPair + :param get_grammar_compiler_init_times: Returns a map with the time to initialize the grammar compiler for each backend in milliseconds. + :type get_grammar_compiler_init_times: dict[str, float] + + :param get_grammar_compile_time: Returns the mean, standard deviation, min, and max of grammar compile times in milliseconds. + :type get_grammar_compile_time: SummaryStats + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. :type raw_metrics: RawPerfMetrics """ @@ -1441,6 +1453,10 @@ class PerfMetrics: ... def get_generate_duration(self) -> MeanStdPair: ... + def get_grammar_compile_time(self) -> SummaryStats: + ... + def get_grammar_compiler_init_times(self) -> dict[str, float]: + ... def get_inference_duration(self) -> MeanStdPair: ... def get_ipot(self) -> MeanStdPair: @@ -1557,6 +1573,9 @@ class RawPerfMetrics: :param inference_durations : Total inference duration for each generate call in milliseconds. :type batch_sizes: list[float] + + :param grammar_compile_times: Time to compile the grammar in milliseconds. 
+ :type grammar_compile_times: list[float] """ def __init__(self) -> None: ... @@ -1567,6 +1586,9 @@ class RawPerfMetrics: def generate_durations(self) -> list[float]: ... @property + def grammar_compile_times(self) -> list[float]: + ... + @property def inference_durations(self) -> list[float]: ... @property @@ -1987,6 +2009,23 @@ class StructuredOutputConfig: @regex.setter def regex(self, arg0: str | None) -> None: ... +class SummaryStats: + def __init__(self) -> None: + ... + def as_tuple(self) -> tuple: + ... + @property + def max(self) -> float: + ... + @property + def mean(self) -> float: + ... + @property + def min(self) -> float: + ... + @property + def std(self) -> float: + ... class T5EncoderModel: """ T5EncoderModel class. diff --git a/src/python/py_perf_metrics.cpp b/src/python/py_perf_metrics.cpp index 48a18fbec3..6466c7269d 100644 --- a/src/python/py_perf_metrics.cpp +++ b/src/python/py_perf_metrics.cpp @@ -13,6 +13,7 @@ namespace py = pybind11; +using ov::genai::SummaryStats; using ov::genai::MeanStdPair; using ov::genai::PerfMetrics; using ov::genai::RawPerfMetrics; @@ -53,6 +54,9 @@ auto raw_perf_metrics_docstring = R"( :param inference_durations : Total inference duration for each generate call in milliseconds. :type batch_sizes: list[float] + + :param grammar_compile_times: Time to compile the grammar in milliseconds. + :type grammar_compile_times: list[float] )"; auto perf_metrics_docstring = R"( @@ -101,6 +105,12 @@ auto perf_metrics_docstring = R"( :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. :type get_detokenization_duration: MeanStdPair + :param get_grammar_compiler_init_times: Returns a map with the time to initialize the grammar compiler for each backend in milliseconds. + :type get_grammar_compiler_init_times: dict[str, float] + + :param get_grammar_compile_time: Returns the mean, standard deviation, min, and max of grammar compile times in milliseconds. 
+ :type get_grammar_compile_time: SummaryStats + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. :type raw_metrics: RawPerfMetrics )"; @@ -188,6 +198,19 @@ void init_perf_metrics(py::module_& m) { }) .def_property_readonly("inference_durations", [](const RawPerfMetrics &rw) { return pyutils::get_ms(rw, &RawPerfMetrics::m_inference_durations); + }) + .def_property_readonly("grammar_compile_times", [](const RawPerfMetrics &rw) { + return pyutils::get_ms(rw, &RawPerfMetrics::m_grammar_compile_times); + }); + + py::class_(m, "SummaryStats") + .def(py::init<>()) + .def_readonly("mean", &SummaryStats::mean) + .def_readonly("std", &SummaryStats::std) + .def_readonly("min", &SummaryStats::min) + .def_readonly("max", &SummaryStats::max) + .def("as_tuple", [](const SummaryStats& self) { + return py::make_tuple(self.mean, self.std, self.min, self.max); }); py::class_(m, "MeanStdPair") @@ -201,6 +224,8 @@ void init_perf_metrics(py::module_& m) { py::class_(m, "PerfMetrics", perf_metrics_docstring) .def(py::init<>()) .def("get_load_time", &PerfMetrics::get_load_time) + .def("get_grammar_compiler_init_times", &PerfMetrics::get_grammar_compiler_init_times) + .def("get_grammar_compile_time", &PerfMetrics::get_grammar_compile_time) .def("get_num_generated_tokens", &PerfMetrics::get_num_generated_tokens) .def("get_num_input_tokens", &PerfMetrics::get_num_input_tokens) .def("get_ttft", &PerfMetrics::get_ttft) diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 09ea05f202..341648511a 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -334,6 +334,8 @@ ov::AnyMap kwargs_to_any_map(const py::kwargs& kwargs) { if (utils::py_object_is_any_map(value) && key == "config") { auto map = utils::py_object_to_any_map(value); params.insert(map.begin(), map.end()); + } else if (py::isinstance(value)) { + params[key] = py::cast(value); } else { if (py::isinstance(value)) { OPENVINO_ASSERT(!py::isinstance(value), "Property \"", key, 
"\" can't be None."); diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 23ef545d49..1ac4382fc2 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -10,6 +10,8 @@ import json import numpy as np from pathlib import Path +from typing import Literal +from pydantic import BaseModel, Field import openvino as ov import openvino_genai as ov_genai @@ -779,6 +781,44 @@ def test_perf_metrics(generation_config, prompt): assert len(raw_metrics.m_durations) > 0 +test_cases = [ + (dict(max_new_tokens=20), 'Generate json of a person'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.precommit +@pytest.mark.nightly +def test_perf_metrics_with_structured_output(generation_config, prompt): + class Person(BaseModel): + name: str = Field(pattern=r"^[A-Z][a-z]{1,20}$") + surname: str = Field(pattern=r"^[A-Z][a-z]{1,20}$") + age: int + city: Literal["Dublin", "Dubai", "Munich"] + generation_config.update(dict(structured_output_config=ov_genai.StructuredOutputConfig(json_schema=json.dumps(Person.model_json_schema())))) + + model_id = 'katuni4ka/tiny-random-gemma2' + _, _, models_path = download_and_convert_model(model_id) + ov_pipe = create_ov_pipeline(models_path) + perf_metrics = ov_pipe.generate([prompt], **generation_config).perf_metrics + raw_metrics = perf_metrics.raw_metrics + + assert len(perf_metrics.get_grammar_compiler_init_times()) > 0 + assert 'xgrammar' in perf_metrics.get_grammar_compiler_init_times() and perf_metrics.get_grammar_compiler_init_times()['xgrammar'] > 0.0 + + assert len(raw_metrics.grammar_compile_times) > 0 + + raw_compile_times = np.array(raw_metrics.grammar_compile_times) / 1000 + assert np.allclose(np.mean(raw_compile_times), perf_metrics.get_grammar_compile_time().mean) + assert np.allclose(np.std(raw_compile_times), perf_metrics.get_grammar_compile_time().std) + assert np.allclose(np.min(raw_compile_times), 
perf_metrics.get_grammar_compile_time().min) + assert np.allclose(np.max(raw_compile_times), perf_metrics.get_grammar_compile_time().max) + + # Check that metrics are correctly accumulated/concatenated + perf_metrics_2 = ov_pipe.generate([prompt], **generation_config).perf_metrics + raw_metrics_2 = perf_metrics_2.raw_metrics + accumulated_metrics = perf_metrics + perf_metrics_2 + assert accumulated_metrics.raw_metrics.grammar_compile_times == raw_metrics.grammar_compile_times + raw_metrics_2.grammar_compile_times + + @pytest.mark.parametrize("pipeline_type", get_main_pipeline_types()) @pytest.mark.parametrize("stop_str", {True, False}) @pytest.mark.precommit